@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
20382038
20392039 if ( BitConverter . IsLittleEndian && Vector128 . IsHardwareAccelerated && elementCount >= ( uint ) Vector128 < byte > . Count )
20402040 {
2041- ushort * pCurrentWriteAddress = ( ushort * ) pUtf16Buffer;
2042-
2043- if ( Vector512 . IsHardwareAccelerated && elementCount >= ( uint ) Vector512 < byte > . Count )
2041+ if ( Vector512 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector512 < byte > . Count )
20442042 {
2045- // Calculating the destination address outside the loop results in significant
2046- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2047- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2048- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector512 < byte > . Count ;
2049-
2050- do
2051- {
2052- Vector512 < byte > asciiVector = Vector512 . Load ( pAsciiBuffer + currentOffset ) ;
2053-
2054- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2055- {
2056- break ;
2057- }
2058-
2059- ( Vector512 < ushort > utf16LowVector , Vector512 < ushort > utf16HighVector ) = Vector512 . Widen ( asciiVector ) ;
2060- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2061- utf16HighVector . Store ( pCurrentWriteAddress + Vector512 < ushort > . Count ) ;
2062-
2063- currentOffset += ( nuint ) Vector512 < byte > . Count ;
2064- pCurrentWriteAddress += ( nuint ) Vector512 < byte > . Count ;
2065- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2043+ WidenAsciiToUtf1_Vector < Vector512 < byte > , Vector512 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
20662044 }
2067- else if ( Vector256 . IsHardwareAccelerated && elementCount >= ( uint ) Vector256 < byte > . Count )
2045+ else if ( Vector256 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector256 < byte > . Count )
20682046 {
2069- // Calculating the destination address outside the loop results in significant
2070- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2071- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2072- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector256 < byte > . Count ;
2073-
2074- do
2075- {
2076- Vector256 < byte > asciiVector = Vector256 . Load ( pAsciiBuffer + currentOffset ) ;
2077-
2078- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2079- {
2080- break ;
2081- }
2082-
2083- ( Vector256 < ushort > utf16LowVector , Vector256 < ushort > utf16HighVector ) = Vector256 . Widen ( asciiVector ) ;
2084- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2085- utf16HighVector . Store ( pCurrentWriteAddress + Vector256 < ushort > . Count ) ;
2086-
2087- currentOffset += ( nuint ) Vector256 < byte > . Count ;
2088- pCurrentWriteAddress += ( nuint ) Vector256 < byte > . Count ;
2089- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2047+ WidenAsciiToUtf1_Vector < Vector256 < byte > , Vector256 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
20902048 }
2091- else
2049+ else if ( Vector128 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector128 < byte > . Count )
20922050 {
2093- // Calculating the destination address outside the loop results in significant
2094- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2095- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2096- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector128 < byte > . Count ;
2097-
2098- do
2099- {
2100- Vector128 < byte > asciiVector = Vector128. Load( pAsciiBuffer + currentOffset) ;
2101-
2102- if ( VectorContainsNonAsciiChar( asciiVector) )
2103- {
2104- break ;
2105- }
2106-
2107- ( Vector128< ushort > utf16LowVector , Vector128 < ushort > utf16HighVector ) = Vector128 . Widen ( asciiVector ) ;
2108- utf16LowVector. Store( pCurrentWriteAddress) ;
2109- utf16HighVector. Store( pCurrentWriteAddress + Vector128< ushort > . Count) ;
2110-
2111- currentOffset += ( nuint ) Vector128< byte > . Count;
2112- pCurrentWriteAddress += ( nuint ) Vector128< byte > . Count;
2113- } while ( currentOffset <= finalOffsetWhereCanRunLoop) ;
2051+ WidenAsciiToUtf1_Vector < Vector128 < byte > , Vector128 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
21142052 }
21152053 }
21162054
@@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
22122150 goto Finish ;
22132151 }
22142152
/// <summary>
/// Widens ASCII bytes from <paramref name="pAsciiBuffer"/> into UTF-16 code units in
/// <paramref name="pUtf16Buffer"/>, processing one <typeparamref name="TVectorByte"/> per step.
/// Processing starts at <paramref name="currentOffset"/> and stops at the first vector that
/// contains a byte with its high bit set, or once fewer than <c>TVectorByte.Count</c>
/// elements remain.
/// </summary>
/// <typeparam name="TVectorByte">The byte vector type used for loads (Vector128/256/512 of byte).</typeparam>
/// <typeparam name="TVectorUInt16">The matching ushort vector type used for stores.</typeparam>
/// <param name="pAsciiBuffer">Start of the source byte buffer.</param>
/// <param name="pUtf16Buffer">Start of the destination UTF-16 buffer.</param>
/// <param name="currentOffset">
/// Element offset (shared by both buffers) at which to start. On return it holds the offset of the
/// first element that this method did not definitively process. It is left unchanged when the
/// first vector already contains non-ASCII data.
/// </param>
/// <param name="elementCount">Total number of elements in the source buffer.</param>
/// <remarks>
/// The caller must guarantee that at least <c>TVectorByte.Count</c> elements remain, i.e.
/// <c>elementCount - currentOffset &gt;= TVectorByte.Count</c>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
    // Without this precondition the subtraction for finalOffsetWhereCanRunLoop would wrap around.
    Debug.Assert(currentOffset <= elementCount);
    Debug.Assert((elementCount - currentOffset) >= (nuint)TVectorByte.Count);

    // Start writing at the element that corresponds to currentOffset so that reads and writes
    // stay in lockstep even when the caller has already consumed part of the input.
    ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer + currentOffset;

    // Calculating the destination address outside the loop results in significant
    // perf wins vs. relying on the JIT to fold memory addressing logic into the
    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
    nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;

    TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
    if (HasMatch<TVectorByte>(asciiVector))
    {
        // The very first vector already contains non-ASCII data; nothing is written and
        // currentOffset is left untouched.
        return;
    }

    (TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
    utf16LowVector.Store(pCurrentWriteAddress);
    utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
    pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);

    if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0)
    {
        // Round the write address DOWN to the previous vector-aligned boundary. This may step
        // back over elements that were just written; they are simply re-read and re-written
        // by the loop below. Because a full vector pair was just stored, the rounded address
        // never falls before the position where this method started writing.
        pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1));

        // Re-derive the element offset from the absolute distance to the start of the
        // destination buffer so the read and write positions stay consistent.
        nuint numBytesFromStart = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer;
        currentOffset = numBytesFromStart / sizeof(char);
    }
    else
    {
        // If the destination isn't char aligned, we won't be able to align it to a vector.
        currentOffset += (nuint)TVectorByte.Count;
    }

    while (currentOffset <= finalOffsetWhereCanRunLoop)
    {
        asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
        if (HasMatch<TVectorByte>(asciiVector))
        {
            break;
        }

        (utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
        utf16LowVector.Store(pCurrentWriteAddress);
        utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);

        currentOffset += (nuint)TVectorByte.Count;
        pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
    }
}
2199+
/// <summary>
/// Reports whether any byte lane of <paramref name="vector"/> has its most significant bit
/// (0x80) set, i.e. whether the vector holds at least one byte outside the 7-bit ASCII range.
/// </summary>
/// <typeparam name="TVectorByte">The byte vector type being inspected.</typeparam>
/// <param name="vector">The vector of bytes to test.</param>
/// <returns>
/// <see langword="true"/> if at least one lane has bit 0x80 set; otherwise <see langword="false"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
{
    // Keep only the high bit of every lane; the result is all-zero exactly when every byte is < 0x80.
    TVectorByte highBitMask = TVectorByte.Create((byte)0x80);
    TVectorByte highBitsOnly = vector & highBitMask;

    bool everyLaneIsAscii = highBitsOnly.Equals(TVectorByte.Zero);
    return !everyLaneIsAscii;
}
2206+
2207+
/// <summary>
/// Zero-extends every byte lane of <paramref name="vector"/> to a 16-bit lane by forwarding to the
/// <c>Widen</c> helper of the concrete vector width (Vector128, Vector256 or Vector512).
/// </summary>
/// <typeparam name="TVectorByte">The source byte vector type.</typeparam>
/// <typeparam name="TVectorUInt16">
/// The ushort vector type of the same width as <typeparamref name="TVectorByte"/>.
/// </typeparam>
/// <param name="vector">The byte vector to widen.</param>
/// <returns>The lower and upper halves of the widened input, in that order.</returns>
/// <remarks>
/// Only the three concrete widths above are supported; any other type argument trips the debug
/// assertion and then fails the cast to <c>Vector128&lt;byte&gt;</c>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
    // The type tests are mutually exclusive, so the order in which they are checked is irrelevant.
    if (typeof(TVectorByte) == typeof(Vector512<byte>))
    {
        Vector512<byte> source512 = (Vector512<byte>)(object)vector;
        (Vector512<ushort> low512, Vector512<ushort> high512) = Vector512.Widen(source512);
        return ((TVectorUInt16)(object)low512, (TVectorUInt16)(object)high512);
    }

    if (typeof(TVectorByte) == typeof(Vector256<byte>))
    {
        Vector256<byte> source256 = (Vector256<byte>)(object)vector;
        (Vector256<ushort> low256, Vector256<ushort> high256) = Vector256.Widen(source256);
        return ((TVectorUInt16)(object)low256, (TVectorUInt16)(object)high256);
    }

    // Remaining supported case: 128-bit vectors.
    Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>));
    Vector128<byte> source128 = (Vector128<byte>)(object)vector;
    (Vector128<ushort> low128, Vector128<ushort> high128) = Vector128.Widen(source128);
    return ((TVectorUInt16)(object)low128, (TVectorUInt16)(object)high128);
}
2230+
2231+
22152232 /// <summary>
22162233 /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
22172234 /// writes them to the output buffer with machine endianness.
0 commit comments