Skip to content

Commit fe6eb87

Browse files
authored
Performance improvements to vectorized Span.Reverse (#78650)
1 parent b24d4c2 commit fe6eb87

File tree

3 files changed

+208
-139
lines changed

3 files changed

+208
-139
lines changed

src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs

Lines changed: 78 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers.Binary;
45
using System.Diagnostics;
56
using System.Diagnostics.CodeAnalysis;
67
using System.Numerics;
@@ -1129,21 +1130,24 @@ private static unsafe nuint UnalignedCountVector128(ref byte searchSpace)
11291130

11301131
public static void Reverse(ref byte buf, nuint length)
11311132
{
1132-
if (Avx2.IsSupported && (nuint)Vector256<byte>.Count * 2 <= length)
1133+
Debug.Assert(length > 1);
1134+
1135+
nint remainder = (nint)length;
1136+
nint offset = 0;
1137+
1138+
// overlapping has a positive performance benefit around 48 elements
1139+
if (Avx2.IsSupported && remainder >= (nint)(Vector256<byte>.Count * 1.5))
11331140
{
11341141
Vector256<byte> reverseMask = Vector256.Create(
11351142
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
11361143
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane
1137-
nuint numElements = (nuint)Vector256<byte>.Count;
1138-
nuint numIters = (length / numElements) / 2;
1139-
for (nuint i = 0; i < numIters; i++)
1140-
{
1141-
nuint firstOffset = i * numElements;
1142-
nuint lastOffset = length - ((1 + i) * numElements);
11431144

1144-
// Load in values from beginning and end of the array.
1145-
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
1146-
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);
1145+
nint lastOffset = remainder - Vector256<byte>.Count;
1146+
do
1147+
{
1148+
// Load the values into vectors
1149+
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, (nuint)offset);
1150+
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, (nuint)lastOffset);
11471151

11481152
// Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
11491153
// Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
@@ -1170,24 +1174,23 @@ public static void Reverse(ref byte buf, nuint length)
11701174
tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);
11711175

11721176
// Store the reversed vectors
1173-
tempLast.StoreUnsafe(ref buf, firstOffset);
1174-
tempFirst.StoreUnsafe(ref buf, lastOffset);
1175-
}
1176-
buf = ref Unsafe.Add(ref buf, numIters * numElements);
1177-
length -= numIters * numElements * 2;
1177+
tempLast.StoreUnsafe(ref buf, (nuint)offset);
1178+
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
1179+
1180+
offset += Vector256<byte>.Count;
1181+
lastOffset -= Vector256<byte>.Count;
1182+
} while (lastOffset >= offset);
1183+
1184+
remainder = lastOffset + Vector256<byte>.Count - offset;
11781185
}
1179-
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length)
1186+
else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<byte>.Count * 2)
11801187
{
1181-
nuint numElements = (nuint)Vector128<byte>.Count;
1182-
nuint numIters = (length / numElements) / 2;
1183-
for (nuint i = 0; i < numIters; i++)
1188+
nint lastOffset = remainder - Vector128<byte>.Count;
1189+
do
11841190
{
1185-
nuint firstOffset = i * numElements;
1186-
nuint lastOffset = length - ((1 + i) * numElements);
1187-
1188-
// Load in values from beginning and end of the array.
1189-
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
1190-
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);
1191+
// Load the values into vectors
1192+
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, (nuint)offset);
1193+
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, (nuint)lastOffset);
11911194

11921195
// Shuffle to reverse each vector:
11931196
// +---------------------------------------------------------------+
@@ -1203,15 +1206,58 @@ public static void Reverse(ref byte buf, nuint length)
12031206
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
12041207

12051208
// Store the reversed vectors
1206-
tempLast.StoreUnsafe(ref buf, firstOffset);
1207-
tempFirst.StoreUnsafe(ref buf, lastOffset);
1208-
}
1209-
buf = ref Unsafe.Add(ref buf, numIters * numElements);
1210-
length -= numIters * numElements * 2;
1209+
tempLast.StoreUnsafe(ref buf, (nuint)offset);
1210+
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
1211+
1212+
offset += Vector128<byte>.Count;
1213+
lastOffset -= Vector128<byte>.Count;
1214+
} while (lastOffset >= offset);
1215+
1216+
remainder = lastOffset + Vector128<byte>.Count - offset;
1217+
}
1218+
1219+
if (remainder >= sizeof(long))
1220+
{
1221+
nint lastOffset = (nint)length - offset - sizeof(long);
1222+
do
1223+
{
1224+
long tempFirst = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, offset));
1225+
long tempLast = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, lastOffset));
1226+
1227+
// swap and store in reversed position
1228+
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
1229+
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));
1230+
1231+
offset += sizeof(long);
1232+
lastOffset -= sizeof(long);
1233+
} while (lastOffset >= offset);
1234+
1235+
remainder = lastOffset + sizeof(long) - offset;
12111236
}
12121237

1213-
// Store any remaining values one-by-one
1214-
ReverseInner(ref buf, length);
1238+
if (remainder >= sizeof(int))
1239+
{
1240+
nint lastOffset = (nint)length - offset - sizeof(int);
1241+
do
1242+
{
1243+
int tempFirst = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, offset));
1244+
int tempLast = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, lastOffset));
1245+
1246+
// swap and store in reversed position
1247+
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
1248+
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));
1249+
1250+
offset += sizeof(int);
1251+
lastOffset -= sizeof(int);
1252+
} while (lastOffset >= offset);
1253+
1254+
remainder = lastOffset + sizeof(int) - offset;
1255+
}
1256+
1257+
if (remainder > 1)
1258+
{
1259+
ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
1260+
}
12151261
}
12161262
}
12171263
}

src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs

Lines changed: 45 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -733,23 +733,26 @@ private static unsafe nint UnalignedCountVector128(ref char searchSpace)
733733

734734
public static void Reverse(ref char buf, nuint length)
735735
{
736-
if (Avx2.IsSupported && (nuint)Vector256<short>.Count * 2 <= length)
736+
Debug.Assert(length > 1);
737+
738+
nint remainder = (nint)length;
739+
nint offset = 0;
740+
741+
// overlapping has a positive performance benefit around 24 elements
742+
if (Avx2.IsSupported && remainder >= (nint)(Vector256<ushort>.Count * 1.5))
737743
{
738-
ref byte bufByte = ref Unsafe.As<char, byte>(ref buf);
739-
nuint byteLength = length * sizeof(char);
740744
Vector256<byte> reverseMask = Vector256.Create(
741745
(byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
742746
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // second 128-bit lane
743-
nuint numElements = (nuint)Vector256<byte>.Count;
744-
nuint numIters = (byteLength / numElements) / 2;
745-
for (nuint i = 0; i < numIters; i++)
747+
748+
nint lastOffset = remainder - Vector256<ushort>.Count;
749+
do
746750
{
747-
nuint firstOffset = i * numElements;
748-
nuint lastOffset = byteLength - ((1 + i) * numElements);
751+
ref byte first = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, offset));
752+
ref byte last = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, lastOffset));
749753

750-
// Load in values from beginning and end of the array.
751-
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref bufByte, firstOffset);
752-
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref bufByte, lastOffset);
754+
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref first);
755+
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref last);
753756

754757
// Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
755758
// Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
@@ -770,27 +773,25 @@ public static void Reverse(ref char buf, nuint length)
770773
tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);
771774

772775
// Store the reversed vectors
773-
tempLast.StoreUnsafe(ref bufByte, firstOffset);
774-
tempFirst.StoreUnsafe(ref bufByte, lastOffset);
775-
}
776-
bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
777-
length -= numIters * (nuint)Vector256<short>.Count * 2;
778-
// Store any remaining values one-by-one
779-
buf = ref Unsafe.As<byte, char>(ref bufByte);
776+
tempLast.StoreUnsafe(ref first);
777+
tempFirst.StoreUnsafe(ref last);
778+
779+
offset += Vector256<ushort>.Count;
780+
lastOffset -= Vector256<ushort>.Count;
781+
} while (lastOffset >= offset);
782+
783+
remainder = (lastOffset + Vector256<ushort>.Count - offset);
780784
}
781-
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<short>.Count * 2 <= length)
785+
else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<ushort>.Count * 2)
782786
{
783-
ref short bufShort = ref Unsafe.As<char, short>(ref buf);
784-
nuint numElements = (nuint)Vector128<short>.Count;
785-
nuint numIters = (length / numElements) / 2;
786-
for (nuint i = 0; i < numIters; i++)
787+
nint lastOffset = remainder - Vector128<ushort>.Count;
788+
do
787789
{
788-
nuint firstOffset = i * numElements;
789-
nuint lastOffset = length - ((1 + i) * numElements);
790+
ref ushort first = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, offset));
791+
ref ushort last = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, lastOffset));
790792

791-
// Load in values from beginning and end of the array.
792-
Vector128<short> tempFirst = Vector128.LoadUnsafe(ref bufShort, firstOffset);
793-
Vector128<short> tempLast = Vector128.LoadUnsafe(ref bufShort, lastOffset);
793+
Vector128<ushort> tempFirst = Vector128.LoadUnsafe(ref first);
794+
Vector128<ushort> tempLast = Vector128.LoadUnsafe(ref last);
794795

795796
// Shuffle to reverse each vector:
796797
// +-------------------------------+
@@ -800,19 +801,25 @@ public static void Reverse(ref char buf, nuint length)
800801
// +-------------------------------+
801802
// | H | G | F | E | D | C | B | A |
802803
// +-------------------------------+
803-
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));
804-
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));
804+
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));
805+
tempLast = Vector128.Shuffle(tempLast, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));
805806

806807
// Store the reversed vectors
807-
tempLast.StoreUnsafe(ref bufShort, firstOffset);
808-
tempFirst.StoreUnsafe(ref bufShort, lastOffset);
809-
}
810-
bufShort = ref Unsafe.Add(ref bufShort, numIters * numElements);
811-
length -= numIters * (nuint)Vector128<short>.Count * 2;
812-
// Store any remaining values one-by-one
813-
buf = ref Unsafe.As<short, char>(ref bufShort);
808+
tempLast.StoreUnsafe(ref first);
809+
tempFirst.StoreUnsafe(ref last);
810+
811+
offset += Vector128<ushort>.Count;
812+
lastOffset -= Vector128<ushort>.Count;
813+
} while (lastOffset >= offset);
814+
815+
remainder = (lastOffset + Vector128<ushort>.Count - offset);
816+
}
817+
818+
// Store any remaining values one-by-one
819+
if (remainder > 1)
820+
{
821+
ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
814822
}
815-
ReverseInner(ref buf, length);
816823
}
817824
}
818825
}

0 commit comments

Comments
 (0)