@@ -14,33 +14,38 @@ internal static partial class SimdUtils
1414{
1515 public static class HwIntrinsics
1616 {
17- public static ReadOnlySpan < byte > PermuteMaskDeinterleave8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
// Permutation control mask for eight 32-bit lanes. The removed line exposed the 32 mask
// bytes as a ReadOnlySpan<byte> property; the added lines return the same 32 values
// directly as a Vector256<int> via Vector256.Create(...).AsInt32().
// Each group of four literals (k, 0, 0, 0) forms one 32-bit element; read little-endian
// (assumed, as on x86 - confirm) the resulting lane order is 0, 4, 1, 5, 2, 6, 3, 7.
// The NormalizedFloatToByteSaturate hunk loads this value into its local `mask`.
17+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // too much IL for JIT to inline, so give a hint
18+ public static Vector256 < int > PermuteMaskDeinterleave8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsInt32 ( ) ;
1819
19- public static ReadOnlySpan < byte > PermuteMaskEvenOdd8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
// Permutation control mask for eight 32-bit lanes, returned as a Vector256<uint>
// (previously a 32-byte ReadOnlySpan<byte> that callers reinterpreted themselves).
// Each group of four literals (k, 0, 0, 0) forms one 32-bit element; read little-endian
// (assumed - confirm) the lane order is 0, 2, 4, 6, 1, 3, 5, 7: even indices first,
// then odd indices.
// The PackFromRgbPlanesAvx2Reduce hunks load this value into their local `control1`.
20+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
21+ public static Vector256 < uint > PermuteMaskEvenOdd8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
2022
21- public static ReadOnlySpan < byte > PermuteMaskSwitchInnerDWords8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
// Permutation control mask for eight 32-bit lanes, returned as a Vector256<uint>
// (previously a 32-byte ReadOnlySpan<byte>).
// Each group of four literals (k, 0, 0, 0) forms one 32-bit element; read little-endian
// (assumed - confirm) the lane order is 0, 1, 4, 5, 2, 3, 6, 7, i.e. the two middle
// pairs of lanes trade places while the outer pairs stay put.
// NOTE(review): no call site for this mask is visible in the shown hunks.
23+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
24+ public static Vector256 < uint > PermuteMaskSwitchInnerDWords8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
2225
23- private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
// Permutation control mask for eight 32-bit lanes, returned as a Vector256<uint>
// (previously a 32-byte ReadOnlySpan<byte>).
// Each group of four literals (k, 0, 0, 0) forms one 32-bit element; read little-endian
// (assumed - confirm) the lane order is 0, 1, 2, 6, 3, 4, 5, 7: source lanes 0-2 land in
// the first 128-bit half and source lanes 3-5 in the second half, each followed by one
// of the remaining lanes (6 and 7).
// The UnpackToRgbPlanesAvx2Reduce hunk loads this value into `extractToLanesMask`.
26+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
27+ private static Vector256 < uint > MoveFirst24BytesToSeparateLanes ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
2428
25- internal static ReadOnlySpan < byte > ExtractRgb => new byte [ ] { 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF } ;
// 32-byte shuffle mask, returned as a Vector256<byte> (previously a ReadOnlySpan<byte>).
// The same 16-byte pattern is repeated for both halves: indices 0, 3, 6, 9, then
// 1, 4, 7, 10, then 2, 5, 8, 11 (every third byte, starting at offsets 0, 1 and 2),
// followed by four 0xFF entries.
// NOTE(review): presumably the 0xFF entries zero the matching output bytes of a byte
// shuffle and the three index groups gather the R, G and B bytes of packed RGB - confirm
// against the call site.
// The UnpackToRgbPlanesAvx2Reduce hunk loads this value into `extractRgbMask`.
29+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
30+ internal static Vector256 < byte > ExtractRgb ( ) => Vector256 . Create ( 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ) ;
2631
27- private static ReadOnlySpan < byte > ShuffleMaskPad4Nx16 => new byte [ ] { 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 } ;
// 16-byte shuffle mask, returned as a Vector128<byte> (previously a ReadOnlySpan<byte>).
// The pattern is three consecutive source indices followed by 0x80, four times over:
// source bytes 0-11 are spread across 16 output bytes with every fourth slot set to 0x80.
// NOTE(review): presumably 0x80 makes the byte shuffle write zero into that slot, which
// the caller then fills - confirm against the call sites.
// The Shuffle3 and Pad3Shuffle4 hunks load this value into `vmask` under Ssse3.IsSupported.
32+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
33+ private static Vector128 < byte > ShuffleMaskPad4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) ;
2834
29- private static ReadOnlySpan < byte > ShuffleMaskSlice4Nx16 => new byte [ ] { 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 } ;
// 16-byte shuffle mask, returned as a Vector128<byte> (previously a ReadOnlySpan<byte>).
// The first twelve entries are indices 0-2, 4-6, 8-10 and 12-14 (every fourth source
// byte is skipped); the last four entries are 0x80.
// NOTE(review): presumably 0x80 makes the byte shuffle write zero into that slot - confirm.
// The Shuffle3 and Shuffle4Slice3 hunks load this value into `vmasko` and derive
// `vmaske` from it with Ssse3.AlignRight(vmasko, vmasko, 12).
35+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
36+ private static Vector128 < byte > ShuffleMaskSlice4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 ) ;
3037
31- private static ReadOnlySpan < byte > ShuffleMaskShiftAlpha =>
32- new byte [ ]
33- {
34- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
35- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15
36- } ;
38+ #pragma warning disable SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
// 32-byte shuffle mask, returned as a Vector256<byte> (previously a ReadOnlySpan<byte>).
// The same 16-byte pattern is used for both halves: indices 0-2, 4-6, 8-10 and 12-14
// first, then indices 3, 7, 11, 15, so every fourth byte is moved to the end of its half.
// NOTE(review): the explicit (byte) cast on the first argument presumably steers overload
// resolution to the byte overload of Vector256.Create, since every literal here would
// also fit a signed byte - confirm.
// The analyzer suppression above stays active until the matching restore further down,
// so the two-row argument layout can be kept.
// The PackFromRgbPlanesAvx2Reduce hunk loads this value into `shuffleAlpha`.
39+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
40+ private static Vector256 < byte > ShuffleMaskShiftAlpha ( ) => Vector256 . Create ( ( byte )
41+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
42+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) ;
3743
38- public static ReadOnlySpan < byte > PermuteMaskShiftAlpha8x32 =>
39- new byte [ ]
40- {
41- 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
42- 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0
43- } ;
// Permutation control mask for eight 32-bit lanes, returned as a Vector256<uint>
// (previously a 32-byte ReadOnlySpan<byte> that callers reinterpreted themselves).
// Each group of four literals (k, 0, 0, 0) forms one 32-bit element; read little-endian
// (assumed - confirm) the lane order is 0, 1, 2, 4, 5, 6, 3, 7, i.e. source lane 3 is
// moved behind lanes 4-6.
// The PackFromRgbPlanesAvx2Reduce hunk loads this value into `control2`.
44+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
45+ public static Vector256 < uint > PermuteMaskShiftAlpha8x32 ( ) => Vector256 . Create (
46+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
47+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
48+ #pragma warning restore SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
4449
4550 /// <summary>
4651 /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@@ -190,7 +195,7 @@ public static void Shuffle4Slice3Reduce(
190195 {
191196 if ( Ssse3 . IsSupported )
192197 {
193- int remainder = source . Length % ( Vector128 < byte > . Count * 4 ) ;
198+ int remainder = source . Length & ( ( Vector128 < byte > . Count * 4 ) - 1 ) ; // bit-hack for modulo
194199
195200 int sourceCount = source . Length - remainder ;
196201 int destCount = ( int ) ( ( uint ) sourceCount * 3 / 4 ) ;
@@ -254,7 +259,7 @@ private static void Shuffle4(
254259 ref Vector128 < float > destBase =
255260 ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
256261
257- nint n = ( nint ) ( uint ) dest . Length / Vector128 < float > . Count ;
262+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < float > . Count ) ;
258263 nint m = Numerics . Modulo4 ( n ) ;
259264 nint u = n - m ;
260265
@@ -307,7 +312,7 @@ private static void Shuffle4(
307312 ref Vector256 < byte > destBase =
308313 ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
309314
310- nint n = ( nint ) ( uint ) dest . Length / Vector256 < byte > . Count ;
315+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < byte > . Count ) ;
311316 nint m = Numerics . Modulo4 ( n ) ;
312317 nint u = n - m ;
313318
@@ -343,7 +348,7 @@ private static void Shuffle4(
343348 ref Vector128 < byte > destBase =
344349 ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
345350
346- nint n = ( nint ) ( uint ) dest . Length / Vector128 < byte > . Count ;
351+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < byte > . Count ) ;
347352 nint m = Numerics . Modulo4 ( n ) ;
348353 nint u = n - m ;
349354
@@ -376,10 +381,8 @@ private static void Shuffle3(
376381 {
377382 if ( Ssse3 . IsSupported )
378383 {
379- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
380- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
381- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
382- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
384+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
385+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
383386 Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
384387
385388 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -441,8 +444,7 @@ private static void Pad3Shuffle4(
441444 {
442445 if ( Ssse3 . IsSupported )
443446 {
444- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
445- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
447+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
446448 Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
447449
448450 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -485,8 +487,7 @@ private static void Shuffle4Slice3(
485487 {
486488 if ( Ssse3 . IsSupported )
487489 {
488- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
489- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
490+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
490491 Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
491492
492493 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -543,9 +544,9 @@ private static void Shuffle4Slice3(
543544 /// <returns>The <see cref="Vector256{T}"/>.</returns>
544545 [ MethodImpl ( InliningOptions . AlwaysInline ) ]
545546 public static Vector256 < float > MultiplyAdd (
546- in Vector256 < float > va ,
547- in Vector256 < float > vm0 ,
548- in Vector256 < float > vm1 )
547+ Vector256 < float > va ,
548+ Vector256 < float > vm0 ,
549+ Vector256 < float > vm1 )
549550 {
550551 if ( Fma . IsSupported )
551552 {
@@ -594,9 +595,9 @@ public static Vector128<float> MultiplyAdd(
594595 /// <returns>The <see cref="Vector256{T}"/>.</returns>
595596 [ MethodImpl ( InliningOptions . ShortMethod ) ]
596597 public static Vector256 < float > MultiplySubtract (
597- in Vector256 < float > vs ,
598- in Vector256 < float > vm0 ,
599- in Vector256 < float > vm1 )
598+ Vector256 < float > vs ,
599+ Vector256 < float > vm0 ,
600+ Vector256 < float > vm1 )
600601 {
601602 if ( Fma . IsSupported )
602603 {
@@ -616,9 +617,9 @@ public static Vector256<float> MultiplySubtract(
616617 /// <returns>The <see cref="Vector256{T}"/>.</returns>
617618 [ MethodImpl ( InliningOptions . ShortMethod ) ]
618619 public static Vector256 < float > MultiplyAddNegated (
619- in Vector256 < float > a ,
620- in Vector256 < float > b ,
621- in Vector256 < float > c )
620+ Vector256 < float > a ,
621+ Vector256 < float > b ,
622+ Vector256 < float > c )
622623 {
623624 if ( Fma . IsSupported )
624625 {
@@ -684,7 +685,7 @@ internal static unsafe void ByteToNormalizedFloat(
684685 ref Vector256 < float > destBase =
685686 ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
686687
687- var scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
688+ Vector256 < float > scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
688689
689690 for ( nuint i = 0 ; i < n ; i ++ )
690691 {
@@ -717,7 +718,7 @@ internal static unsafe void ByteToNormalizedFloat(
717718 ref Vector128 < float > destBase =
718719 ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
719720
720- var scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
721+ Vector128 < float > scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
721722 Vector128 < byte > zero = Vector128 < byte > . Zero ;
722723
723724 for ( nuint i = 0 ; i < n ; i ++ )
@@ -819,9 +820,8 @@ internal static void NormalizedFloatToByteSaturate(
819820 ref Vector256 < byte > destBase =
820821 ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
821822
822- var scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
823- ref byte maskBase = ref MemoryMarshal . GetReference ( PermuteMaskDeinterleave8x32 ) ;
824- Vector256 < int > mask = Unsafe . As < byte , Vector256 < int > > ( ref maskBase ) ;
823+ Vector256 < float > scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
824+ Vector256 < int > mask = PermuteMaskDeinterleave8x32 ( ) ;
825825
826826 for ( nuint i = 0 ; i < n ; i ++ )
827827 {
@@ -858,7 +858,7 @@ internal static void NormalizedFloatToByteSaturate(
858858 ref Vector128 < byte > destBase =
859859 ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
860860
861- var scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
861+ Vector128 < float > scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
862862
863863 for ( nuint i = 0 ; i < n ; i ++ )
864864 {
@@ -895,14 +895,12 @@ internal static void PackFromRgbPlanesAvx2Reduce(
895895
896896 nuint count = redChannel . Vector256Count < byte > ( ) ;
897897
898- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
899- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
898+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
900899
901- ref byte control2Bytes = ref MemoryMarshal . GetReference ( PermuteMaskShiftAlpha8x32 ) ;
902- Vector256 < uint > control2 = Unsafe . As < byte , Vector256 < uint > > ( ref control2Bytes ) ;
903- var a = Vector256 . Create ( ( byte ) 255 ) ;
900+ Vector256 < uint > control2 = PermuteMaskShiftAlpha8x32 ( ) ;
901+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
904902
905- Vector256 < byte > shuffleAlpha = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ShuffleMaskShiftAlpha ) ) ;
903+ Vector256 < byte > shuffleAlpha = ShuffleMaskShiftAlpha ( ) ;
906904
907905 for ( nuint i = 0 ; i < count ; i ++ )
908906 {
@@ -966,9 +964,8 @@ internal static void PackFromRgbPlanesAvx2Reduce(
966964 ref Vector256 < byte > dBase = ref Unsafe . As < Rgba32 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
967965
968966 nuint count = redChannel . Vector256Count < byte > ( ) ;
969- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
970- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
971- var a = Vector256 . Create ( ( byte ) 255 ) ;
967+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
968+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
972969
973970 for ( nuint i = 0 ; i < count ; i ++ )
974971 {
@@ -1017,8 +1014,8 @@ internal static void UnpackToRgbPlanesAvx2Reduce(
10171014 ref Vector256 < float > destGRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( greenChannel ) ) ;
10181015 ref Vector256 < float > destBRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( blueChannel ) ) ;
10191016
1020- Vector256 < uint > extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
1021- Vector256 < byte > extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
1017+ Vector256 < uint > extractToLanesMask = MoveFirst24BytesToSeparateLanes ( ) ;
1018+ Vector256 < byte > extractRgbMask = ExtractRgb ( ) ;
10221019 Vector256 < byte > rgb , rg , bx ;
10231020 Vector256 < float > r , g , b ;
10241021
0 commit comments