11// Licensed to the .NET Foundation under one or more agreements.
22// The .NET Foundation licenses this file to you under the MIT license.
33
4+ using System . Buffers . Binary ;
45using System . Diagnostics ;
56using System . Diagnostics . CodeAnalysis ;
67using System . Numerics ;
@@ -1129,21 +1130,24 @@ private static unsafe nuint UnalignedCountVector128(ref byte searchSpace)
11291130
11301131 public static void Reverse ( ref byte buf , nuint length )
11311132 {
1132- if ( Avx2 . IsSupported && ( nuint ) Vector256 < byte > . Count * 2 <= length )
1133+ Debug . Assert ( length > 1 ) ;
1134+
1135+ nint remainder = ( nint ) length ;
1136+ nint offset = 0 ;
1137+
1138+ // Letting the first and last vector accesses overlap is a net performance win from about 48 elements (1.5 vectors) upward
1139+ if ( Avx2 . IsSupported && remainder >= ( nint ) ( Vector256 < byte > . Count * 1.5 ) )
11331140 {
11341141 Vector256 < byte > reverseMask = Vector256 . Create (
11351142 ( byte ) 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 , // first 128-bit lane
11361143 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 ) ; // second 128-bit lane
1137- nuint numElements = ( nuint ) Vector256 < byte > . Count ;
1138- nuint numIters = ( length / numElements ) / 2 ;
1139- for ( nuint i = 0 ; i < numIters ; i ++ )
1140- {
1141- nuint firstOffset = i * numElements ;
1142- nuint lastOffset = length - ( ( 1 + i ) * numElements ) ;
11431144
1144- // Load in values from beginning and end of the array.
1145- Vector256 < byte > tempFirst = Vector256 . LoadUnsafe ( ref buf , firstOffset ) ;
1146- Vector256 < byte > tempLast = Vector256 . LoadUnsafe ( ref buf , lastOffset ) ;
1145+ nint lastOffset = remainder - Vector256 < byte > . Count ;
1146+ do
1147+ {
1148+ // Load the values into vectors
1149+ Vector256 < byte > tempFirst = Vector256 . LoadUnsafe ( ref buf , ( nuint ) offset ) ;
1150+ Vector256 < byte > tempLast = Vector256 . LoadUnsafe ( ref buf , ( nuint ) lastOffset ) ;
11471151
11481152 // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
11491153 // Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
@@ -1170,24 +1174,23 @@ public static void Reverse(ref byte buf, nuint length)
11701174 tempLast = Avx2 . Permute2x128 ( tempLast , tempLast , 0b00_01 ) ;
11711175
11721176 // Store the reversed vectors
1173- tempLast . StoreUnsafe ( ref buf , firstOffset ) ;
1174- tempFirst . StoreUnsafe ( ref buf , lastOffset ) ;
1175- }
1176- buf = ref Unsafe . Add ( ref buf , numIters * numElements ) ;
1177- length -= numIters * numElements * 2 ;
1177+ tempLast . StoreUnsafe ( ref buf , ( nuint ) offset ) ;
1178+ tempFirst . StoreUnsafe ( ref buf , ( nuint ) lastOffset ) ;
1179+
1180+ offset += Vector256 < byte > . Count ;
1181+ lastOffset -= Vector256 < byte > . Count ;
1182+ } while ( lastOffset >= offset ) ;
1183+
1184+ remainder = lastOffset + Vector256 < byte > . Count - offset ;
11781185 }
1179- else if ( Vector128 . IsHardwareAccelerated && ( nuint ) Vector128 < byte > . Count * 2 <= length )
1186+ else if ( Vector128 . IsHardwareAccelerated && remainder >= Vector128 < byte > . Count * 2 )
11801187 {
1181- nuint numElements = ( nuint ) Vector128 < byte > . Count ;
1182- nuint numIters = ( length / numElements ) / 2 ;
1183- for ( nuint i = 0 ; i < numIters ; i ++ )
1188+ nint lastOffset = remainder - Vector128 < byte > . Count ;
1189+ do
11841190 {
1185- nuint firstOffset = i * numElements ;
1186- nuint lastOffset = length - ( ( 1 + i ) * numElements ) ;
1187-
1188- // Load in values from beginning and end of the array.
1189- Vector128 < byte > tempFirst = Vector128 . LoadUnsafe ( ref buf , firstOffset ) ;
1190- Vector128 < byte > tempLast = Vector128 . LoadUnsafe ( ref buf , lastOffset ) ;
1191+ // Load the values into vectors
1192+ Vector128 < byte > tempFirst = Vector128 . LoadUnsafe ( ref buf , ( nuint ) offset ) ;
1193+ Vector128 < byte > tempLast = Vector128 . LoadUnsafe ( ref buf , ( nuint ) lastOffset ) ;
11911194
11921195 // Shuffle to reverse each vector:
11931196 // +---------------------------------------------------------------+
@@ -1203,15 +1206,58 @@ public static void Reverse(ref byte buf, nuint length)
12031206 ( byte ) 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 ) ) ;
12041207
12051208 // Store the reversed vectors
1206- tempLast . StoreUnsafe ( ref buf , firstOffset ) ;
1207- tempFirst . StoreUnsafe ( ref buf , lastOffset ) ;
1208- }
1209- buf = ref Unsafe . Add ( ref buf , numIters * numElements ) ;
1210- length -= numIters * numElements * 2 ;
1209+ tempLast . StoreUnsafe ( ref buf , ( nuint ) offset ) ;
1210+ tempFirst . StoreUnsafe ( ref buf , ( nuint ) lastOffset ) ;
1211+
1212+ offset += Vector128 < byte > . Count ;
1213+ lastOffset -= Vector128 < byte > . Count ;
1214+ } while ( lastOffset >= offset ) ;
1215+
1216+ remainder = lastOffset + Vector128 < byte > . Count - offset ;
1217+ }
1218+
1219+ if ( remainder >= sizeof ( long ) )
1220+ {
1221+ nint lastOffset = ( nint ) length - offset - sizeof ( long ) ;
1222+ do
1223+ {
1224+ long tempFirst = Unsafe . ReadUnaligned < long > ( ref Unsafe . Add ( ref buf , offset ) ) ;
1225+ long tempLast = Unsafe . ReadUnaligned < long > ( ref Unsafe . Add ( ref buf , lastOffset ) ) ;
1226+
1227+ // Swap the two 8-byte words; ReverseEndianness reverses the bytes inside each word so they land in fully reversed order
1228+ Unsafe . WriteUnaligned ( ref Unsafe . Add ( ref buf , offset ) , BinaryPrimitives . ReverseEndianness ( tempLast ) ) ;
1229+ Unsafe . WriteUnaligned ( ref Unsafe . Add ( ref buf , lastOffset ) , BinaryPrimitives . ReverseEndianness ( tempFirst ) ) ;
1230+
1231+ offset += sizeof ( long ) ;
1232+ lastOffset -= sizeof ( long ) ;
1233+ } while ( lastOffset >= offset ) ;
1234+
1235+ remainder = lastOffset + sizeof ( long ) - offset ;
12111236 }
12121237
1213- // Store any remaining values one-by-one
1214- ReverseInner ( ref buf , length ) ;
1238+ if ( remainder >= sizeof ( int ) )
1239+ {
1240+ nint lastOffset = ( nint ) length - offset - sizeof ( int ) ;
1241+ do
1242+ {
1243+ int tempFirst = Unsafe . ReadUnaligned < int > ( ref Unsafe . Add ( ref buf , offset ) ) ;
1244+ int tempLast = Unsafe . ReadUnaligned < int > ( ref Unsafe . Add ( ref buf , lastOffset ) ) ;
1245+
1246+ // Swap the two 4-byte words; ReverseEndianness reverses the bytes inside each word so they land in fully reversed order
1247+ Unsafe . WriteUnaligned ( ref Unsafe . Add ( ref buf , offset ) , BinaryPrimitives . ReverseEndianness ( tempLast ) ) ;
1248+ Unsafe . WriteUnaligned ( ref Unsafe . Add ( ref buf , lastOffset ) , BinaryPrimitives . ReverseEndianness ( tempFirst ) ) ;
1249+
1250+ offset += sizeof ( int ) ;
1251+ lastOffset -= sizeof ( int ) ;
1252+ } while ( lastOffset >= offset ) ;
1253+
1254+ remainder = lastOffset + sizeof ( int ) - offset ;
1255+ }
1256+
1257+ if ( remainder > 1 )
1258+ {
1259+ ReverseInner ( ref Unsafe . Add ( ref buf , offset ) , ( nuint ) remainder ) ;
1260+ }
12151261 }
12161262 }
12171263}
0 commit comments