@@ -945,6 +945,13 @@ private static unsafe nuint NarrowUtf16ToLatin1_Sse2(char* pUtf16Buffer, byte* p

```csharp
/// </summary>
public static unsafe void WidenLatin1ToUtf16(byte* pLatin1Buffer, char* pUtf16Buffer, nuint elementCount)
{
    if (((nuint)pUtf16Buffer & 1) != 0)
    {
        // Input isn't char aligned, we won't be able to vectorize.
        WidenLatin1ToUtf16_MisalignedAddress(pLatin1Buffer, pUtf16Buffer, elementCount);
```
**Member:**
I am not sure I understand the need for this; I see that we already align the data in:

```csharp
if (elementCount >= SizeOfVector128)
{
    // First, perform an unaligned 1x 64-bit read from the input buffer and an unaligned
    // 1x 128-bit write to the destination buffer.
    latin1Vector = Sse2.LoadScalarVector128((ulong*)pLatin1Buffer).AsByte(); // unaligned load
    Sse2.Store((byte*)pUtf16Buffer, Sse2.UnpackLow(latin1Vector, zeroVector)); // unaligned write

    // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
    // point, then use that as the base offset going forward. Remember the >> 1 to account for
    // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
    // the loop, but this is ok.
    currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
    Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));

    // Calculating the destination address outside the loop results in significant
    // perf wins vs. relying on the JIT to fold memory addressing logic into the
    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
    char* pCurrentWriteAddress = pUtf16Buffer + currentOffset;

    // Now run the main 1x 128-bit read + 2x 128-bit write loop.
    nuint finalOffsetWhereCanIterateLoop = elementCount - SizeOfVector128;
    while (currentOffset <= finalOffsetWhereCanIterateLoop)
```

…don't we?

**Contributor Author:**
If the address that pUtf16Buffer points to is naturally aligned, then this logic works as expected.

The problem is

```csharp
char* pCurrentWriteAddress = pUtf16Buffer + currentOffset;
```

which does not ensure correct alignment for a misaligned char*.

**Contributor Author:**
@EgorBo As an alternative approach, change the following to Sse2.Store and accept the performance hit pre-Nehalem:

```csharp
Sse2.StoreAligned(pLatin1Buffer + currentOffsetInElements, latin1Vector); // aligned
```

**Member:**
We should just use unaligned load/store here. It may have some small hit on hardware that's more than 10-12 years old, but it simplifies things overall and will be increasingly rare to encounter such hardware over time. Plus, we don't use aligned loads/stores in most of our workloads already, so this won't be any different.

Opportunistically aligning the data is still goodness and will avoid cache line splits.

**Member:**
It looks like the Sse2.Store approach is better, since an unaligned char* should be rare? (e.g. char[] is always aligned)

**Member:**
The consideration would be that Sse2.Store doesn't allow containment on pre-AVX hardware, where movups was frequently slower than movaps even if the data was definitely aligned.

But, given the age of the hardware involved and the logic we use elsewhere, it shouldn't be a huge consideration (and the cost also wasn't huge, typically an extra cycle or so; the split cache line accesses were much worse, closer to 20 cycles for loads).

**Contributor Author:**
> Plus, we don't use aligned loads/stores in most of our workloads already, so this won't be any different.

WidenLatin1ToUtf16_Sse2 and NarrowUtf16ToLatin1_Sse2 are the only methods in the repository that use StoreAligned.

**Contributor Author:**
> It looks like the Sse2.Store approach is better, since an unaligned char* should be rare? (e.g. char[] is always aligned)

A potential case is the public API `System.Text.Encoding.GetChars(System.Byte*, System.Int32, System.Char*, System.Int32)`.

**Member:**
> WidenLatin1ToUtf16_Sse2 and NarrowUtf16ToLatin1_Sse2 are the only methods in the repository that use StoreAligned.

Right, and so preserving the alignment here won't make a big difference to downlevel hardware. We should just normalize to not using StoreAligned so the code can be simpler and robust in the face of unalignable data.

```csharp
        return;
    }

    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // punpcklbw which we know are optimized, and (b) we can avoid downclocking the processor while
```

@@ -1106,5 +1113,19 @@ private static unsafe void WidenLatin1ToUtf16_Fallback(byte* pLatin1Buffer, char

```csharp
        currentOffset++;
    }
}

private static unsafe void WidenLatin1ToUtf16_MisalignedAddress(byte* pLatin1Buffer, char* pUtf16Buffer, nuint elementCount)
{
    if (elementCount != 0)
    {
        do
        {
            Unsafe.WriteUnaligned(pUtf16Buffer, (char)*pLatin1Buffer);
            pUtf16Buffer++;
            pLatin1Buffer++;
        }
        while (--elementCount != 0);
    }
}
}
}
```