@@ -2238,5 +2238,96 @@ private static uint FindFirstMatchedLane(Vector128<byte> compareResult)
22382238 // Find the first lane that is set inside compareResult.
22392239 return ( uint ) BitOperations . TrailingZeroCount ( selectedLanes ) >> 2 ;
22402240 }
2241+
/// <summary>
/// Reverses, in place, the <paramref name="length"/> bytes that start at <paramref name="buf"/>.
/// </summary>
/// <param name="buf">Reference to the first byte of the region to reverse.</param>
/// <param name="length">Number of bytes in the region.</param>
/// <remarks>
/// Whole vectors are swapped between the front and the back of the region while at least two
/// vectors' worth of bytes is available; whatever is left in the middle is swapped byte-by-byte.
/// </remarks>
public static void Reverse(ref byte buf, nuint length)
{
    if (Avx2.IsSupported && (nuint)Vector256<byte>.Count * 2 <= length)
    {
        // Avx2.Shuffle works on each 128-bit lane independently, so the mask reverses
        // the 16 bytes inside each lane.
        Vector256<byte> reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);      // second 128-bit lane
        nuint numElements = (nuint)Vector256<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
            Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);

            // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
            // Perform a shuffle to reverse each 128-bit lane, then permute to swap the lanes:
            //     | A1 ... P1 | A2 ... P2 |
            // Shuffle --->
            //     | P1 ... A1 | P2 ... A2 |
            // Permute --->
            //     | P2 ... A2 | P1 ... A1 |
            tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
            tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
            tempLast = Avx2.Shuffle(tempLast, reverseMask);
            tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

            // Store the reversed vectors at the opposite ends.
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the region to the not-yet-reversed middle part.
        buf = ref Unsafe.Add(ref buf, numIters * numElements);
        length -= numIters * numElements * 2;
    }
    // The byte shuffle below is an SSSE3 instruction, so the guard has to test
    // Ssse3.IsSupported; Sse2.IsSupported alone does not imply it is available.
    else if (Ssse3.IsSupported && (nuint)Vector128<byte>.Count * 2 <= length)
    {
        Vector128<byte> reverseMask = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        nuint numElements = (nuint)Vector128<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
            Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);

            // Shuffle to reverse each vector:
            //     | A | B | C | ... | N | O | P |
            //         --->
            //     | P | O | N | ... | C | B | A |
            tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
            tempLast = Ssse3.Shuffle(tempLast, reverseMask);

            // Store the reversed vectors at the opposite ends.
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the region to the not-yet-reversed middle part.
        buf = ref Unsafe.Add(ref buf, numIters * numElements);
        length -= numIters * numElements * 2;
    }

    // Swap any remaining values one-by-one, walking inwards from both ends.
    for (nuint i = 0; i < (length / 2); i++)
    {
        ref byte first = ref Unsafe.Add(ref buf, i);
        ref byte last = ref Unsafe.Add(ref buf, length - 1 - i);
        (last, first) = (first, last);
    }
}
22412332 }
22422333}
0 commit comments