Switch to iSimdVector and Align WidenAsciiToUtf16 (dotnet#99982)

DeepakRajendrakumaran · matouskozak · commit b90cec43a654 · 2024-04-30T10:59:47.000+02:00
* Add AnyMatches() to iSimdVector interface * Switch to iSimdVector and Align WidenAsciiToUtf16. * Fixing perf * Addressing Review Comments. * Mirroring API change : dotnet#98055 (comment)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs
@@ -553,6 +553,19 @@ static virtual bool TryCopyTo(TSelf vector, Span<T> destination)
         // New Surface Area
         //
 
+        /// <summary>Checks if any of the vector lanes are equivalent to value.</summary>
+        /// <param name="vector">The Vector.</param>
+        /// <param name="value">The Value to check.</param>
+        /// <returns><c>true</c> if <paramref name="vector" /> has any lanes equivalent to <paramref name="value" /> otherwise, <c>false</c> if none of the lanes are equivalent to <paramref name="value" /> />.</returns>
+        /// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
+        static abstract bool Any(TSelf vector, T value);
+
+        /// <summary>Checks if any of the vector lanes have All Bits set.</summary>
+        /// <param name="vector">The Vector to check.</param>
+        /// <returns><c>true</c> if <paramref name="vector" /> has any lanes with All Bits set otherwise, <c>false</c> if none of the lanes have All Bits set />.</returns>
+        /// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
+        static abstract bool AnyWhereAllBitsSet(TSelf vector);
+
         static abstract int IndexOfLastMatch(TSelf vector);
     }
 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs
@@ -692,6 +692,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
         // New Surface Area
         //
 
+        static bool ISimdVector<Vector128<T>, T>.AnyWhereAllBitsSet(Vector128<T> vector)
+        {
+            return (Vector128.EqualsAny(vector, Vector128<T>.AllBitsSet));
+        }
+
+        static bool ISimdVector<Vector128<T>, T>.Any(Vector128<T> vector, T value)
+        {
+            return (Vector128.EqualsAny(vector, Vector128.Create((T)value)));
+        }
+
         static int ISimdVector<Vector128<T>, T>.IndexOfLastMatch(Vector128<T> vector)
         {
             uint mask = vector.ExtractMostSignificantBits();
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs
@@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
         // New Surface Area
         //
 
+        static bool ISimdVector<Vector256<T>, T>.AnyWhereAllBitsSet(Vector256<T> vector)
+        {
+            return (Vector256.EqualsAny(vector, Vector256<T>.AllBitsSet));
+        }
+
+        static bool ISimdVector<Vector256<T>, T>.Any(Vector256<T> vector, T value)
+        {
+            return (Vector256.EqualsAny(vector, Vector256.Create((T)value)));
+        }
+
         static int ISimdVector<Vector256<T>, T>.IndexOfLastMatch(Vector256<T> vector)
         {
             uint mask = vector.ExtractMostSignificantBits();
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs
@@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
         // New Surface Area
         //
 
+        static bool ISimdVector<Vector512<T>, T>.AnyWhereAllBitsSet(Vector512<T> vector)
+        {
+            return (Vector512.EqualsAny(vector, Vector512<T>.AllBitsSet));
+        }
+
+        static bool ISimdVector<Vector512<T>, T>.Any(Vector512<T> vector, T value)
+        {
+            return (Vector512.EqualsAny(vector, Vector512.Create((T)value)));
+        }
+
         static int ISimdVector<Vector512<T>, T>.IndexOfLastMatch(Vector512<T> vector)
         {
             ulong mask = vector.ExtractMostSignificantBits();
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs
@@ -757,6 +757,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
         // New Surface Area
         //
 
+        static bool ISimdVector<Vector64<T>, T>.AnyWhereAllBitsSet(Vector64<T> vector)
+        {
+            return (Vector64.EqualsAny(vector, Vector64<T>.AllBitsSet));
+        }
+
+        static bool ISimdVector<Vector64<T>, T>.Any(Vector64<T> vector, T value)
+        {
+            return (Vector64.EqualsAny(vector, Vector64.Create((T)value)));
+        }
+
         static int ISimdVector<Vector64<T>, T>.IndexOfLastMatch(Vector64<T> vector)
         {
             uint mask = vector.ExtractMostSignificantBits();
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
 
             if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
             {
-                ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
-
-                if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512<byte>.Count)
+                if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512<byte>.Count)
                 {
-                    // Calculating the destination address outside the loop results in significant
-                    // perf wins vs. relying on the JIT to fold memory addressing logic into the
-                    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
-                    nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512<byte>.Count;
-
-                    do
-                    {
-                        Vector512<byte> asciiVector = Vector512.Load(pAsciiBuffer + currentOffset);
-
-                        if (asciiVector.ExtractMostSignificantBits() != 0)
-                        {
-                            break;
-                        }
-
-                        (Vector512<ushort> utf16LowVector, Vector512<ushort> utf16HighVector) = Vector512.Widen(asciiVector);
-                        utf16LowVector.Store(pCurrentWriteAddress);
-                        utf16HighVector.Store(pCurrentWriteAddress + Vector512<ushort>.Count);
-
-                        currentOffset += (nuint)Vector512<byte>.Count;
-                        pCurrentWriteAddress += (nuint)Vector512<byte>.Count;
-                    } while (currentOffset <= finalOffsetWhereCanRunLoop);
+                    WidenAsciiToUtf1_Vector<Vector512<byte>, Vector512<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
                 }
-                else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
+                else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256<byte>.Count)
                 {
-                    // Calculating the destination address outside the loop results in significant
-                    // perf wins vs. relying on the JIT to fold memory addressing logic into the
-                    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
-                    nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count;
-
-                    do
-                    {
-                        Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset);
-
-                        if (asciiVector.ExtractMostSignificantBits() != 0)
-                        {
-                            break;
-                        }
-
-                        (Vector256<ushort> utf16LowVector, Vector256<ushort> utf16HighVector) = Vector256.Widen(asciiVector);
-                        utf16LowVector.Store(pCurrentWriteAddress);
-                        utf16HighVector.Store(pCurrentWriteAddress + Vector256<ushort>.Count);
-
-                        currentOffset += (nuint)Vector256<byte>.Count;
-                        pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
-                    } while (currentOffset <= finalOffsetWhereCanRunLoop);
+                    WidenAsciiToUtf1_Vector<Vector256<byte>, Vector256<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
                 }
-                else
+                else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128<byte>.Count)
                 {
-                    // Calculating the destination address outside the loop results in significant
-                    // perf wins vs. relying on the JIT to fold memory addressing logic into the
-                    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
-                    nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count;
-
-                    do
-                    {
-                        Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset);
-
-                        if (VectorContainsNonAsciiChar(asciiVector))
-                        {
-                            break;
-                        }
-
-                        (Vector128<ushort> utf16LowVector, Vector128<ushort> utf16HighVector) = Vector128.Widen(asciiVector);
-                        utf16LowVector.Store(pCurrentWriteAddress);
-                        utf16HighVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count);
-
-                        currentOffset += (nuint)Vector128<byte>.Count;
-                        pCurrentWriteAddress += (nuint)Vector128<byte>.Count;
-                    } while (currentOffset <= finalOffsetWhereCanRunLoop);
+                    WidenAsciiToUtf1_Vector<Vector128<byte>, Vector128<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
                 }
             }
 
@@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
             goto Finish;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
+            where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
+            where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
+        {
+            ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
+            // Calculating the destination address outside the loop results in significant
+            // perf wins vs. relying on the JIT to fold memory addressing logic into the
+            // write instructions. See: https://github.com/dotnet/runtime/issues/33002
+            nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;
+            TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
+            if (!HasMatch<TVectorByte>(asciiVector))
+            {
+                (TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
+                utf16LowVector.Store(pCurrentWriteAddress);
+                utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
+                pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
+                if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0)
+                {
+                    // Bump write buffer up to the next aligned boundary
+                    pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1));
+                    nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer;
+                    currentOffset += (nuint)numBytesWritten / 2;
+                }
+                else
+                {
+                    // If input isn't char aligned, we won't be able to align it to a Vector
+                    currentOffset += (nuint)TVectorByte.Count;
+                }
+                while (currentOffset <= finalOffsetWhereCanRunLoop)
+                {
+                    asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
+                    if (HasMatch<TVectorByte>(asciiVector))
+                    {
+                        break;
+                    }
+                    (utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
+                    utf16LowVector.Store(pCurrentWriteAddress);
+                    utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
+
+                    currentOffset += (nuint)TVectorByte.Count;
+                    pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
+                }
+            }
+            return;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
+            where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
+        {
+            return !(vector & TVectorByte.Create((byte)0x80)).Equals(TVectorByte.Zero);
+        }
+
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector)
+            where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
+            where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
+        {
+            if (typeof(TVectorByte) == typeof(Vector256<byte>))
+            {
+                (Vector256<ushort> Lower256, Vector256<ushort> Upper256) = Vector256.Widen((Vector256<byte>)(object)vector);
+                return ((TVectorUInt16)(object)Lower256, (TVectorUInt16)(object)Upper256);
+            }
+            else if (typeof(TVectorByte) == typeof(Vector512<byte>))
+            {
+                (Vector512<ushort> Lower512, Vector512<ushort> Upper512) = Vector512.Widen((Vector512<byte>)(object)vector);
+                return ((TVectorUInt16)(object)Lower512, (TVectorUInt16)(object)Upper512);
+            }
+            else
+            {
+                Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>));
+                (Vector128<ushort> Lower128, Vector128<ushort> Upper128) = Vector128.Widen((Vector128<byte>)(object)vector);
+                return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128);
+            }
+        }
+
+
         /// <summary>
         /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
         /// writes them to the output buffer with machine endianness.