@@ -41,13 +41,11 @@ public static int GreatestCommonDivisor(int a, int b)
4141
4242        /// <summary> 
4343        /// Determine the Least Common Multiple (LCM) of two numbers. 
44+         /// See https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor. 
4445        /// </summary> 
4546        [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] 
4647        public  static int  LeastCommonMultiple ( int  a ,  int  b ) 
47-         { 
48-             // https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor 
49-             return  ( a  /  GreatestCommonDivisor ( a ,  b ) )  *  b ; 
50-         } 
48+             =>  a  /  GreatestCommonDivisor ( a ,  b )  *  b ; 
5149
5250        /// <summary> 
5351        /// Calculates <paramref name="x"/> % 2 
@@ -290,10 +288,14 @@ public static void Clamp(Span<byte> span, byte min, byte max)
290288
291289            if  ( remainder . Length  >  0 ) 
292290            { 
293-                 for  ( int  i  =  0 ;  i  <  remainder . Length ;  i ++ ) 
291+                 ref  byte  remainderStart  =  ref  MemoryMarshal . GetReference ( remainder ) ; 
292+                 ref  byte  remainderEnd  =  ref  Unsafe . Add ( ref  remainderStart ,  remainder . Length ) ; 
293+ 
294+                 while  ( Unsafe . IsAddressLessThan ( ref  remainderStart ,  ref  remainderEnd ) ) 
294295                { 
295-                     ref  byte  v  =  ref  remainder [ i ] ; 
296-                     v  =  Clamp ( v ,  min ,  max ) ; 
296+                     remainderStart  =  Clamp ( remainderStart ,  min ,  max ) ; 
297+ 
298+                     remainderStart  =  ref  Unsafe . Add ( ref  remainderStart ,  1 ) ; 
297299                } 
298300            } 
299301        } 
@@ -311,10 +313,14 @@ public static void Clamp(Span<uint> span, uint min, uint max)
311313
312314            if  ( remainder . Length  >  0 ) 
313315            { 
314-                 for  ( int  i  =  0 ;  i  <  remainder . Length ;  i ++ ) 
316+                 ref  uint  remainderStart  =  ref  MemoryMarshal . GetReference ( remainder ) ; 
317+                 ref  uint  remainderEnd  =  ref  Unsafe . Add ( ref  remainderStart ,  remainder . Length ) ; 
318+ 
319+                 while  ( Unsafe . IsAddressLessThan ( ref  remainderStart ,  ref  remainderEnd ) ) 
315320                { 
316-                     ref  uint  v  =  ref  remainder [ i ] ; 
317-                     v  =  Clamp ( v ,  min ,  max ) ; 
321+                     remainderStart  =  Clamp ( remainderStart ,  min ,  max ) ; 
322+ 
323+                     remainderStart  =  ref  Unsafe . Add ( ref  remainderStart ,  1 ) ; 
318324                } 
319325            } 
320326        } 
@@ -332,10 +338,14 @@ public static void Clamp(Span<int> span, int min, int max)
332338
333339            if  ( remainder . Length  >  0 ) 
334340            { 
335-                 for  ( int  i  =  0 ;  i  <  remainder . Length ;  i ++ ) 
341+                 ref  int  remainderStart  =  ref  MemoryMarshal . GetReference ( remainder ) ; 
342+                 ref  int  remainderEnd  =  ref  Unsafe . Add ( ref  remainderStart ,  remainder . Length ) ; 
343+ 
344+                 while  ( Unsafe . IsAddressLessThan ( ref  remainderStart ,  ref  remainderEnd ) ) 
336345                { 
337-                     ref  int  v  =  ref  remainder [ i ] ; 
338-                     v  =  Clamp ( v ,  min ,  max ) ; 
346+                     remainderStart  =  Clamp ( remainderStart ,  min ,  max ) ; 
347+ 
348+                     remainderStart  =  ref  Unsafe . Add ( ref  remainderStart ,  1 ) ; 
339349                } 
340350            } 
341351        } 
@@ -353,10 +363,14 @@ public static void Clamp(Span<float> span, float min, float max)
353363
354364            if  ( remainder . Length  >  0 ) 
355365            { 
356-                 for  ( int  i  =  0 ;  i  <  remainder . Length ;  i ++ ) 
366+                 ref  float  remainderStart  =  ref  MemoryMarshal . GetReference ( remainder ) ; 
367+                 ref  float  remainderEnd  =  ref  Unsafe . Add ( ref  remainderStart ,  remainder . Length ) ; 
368+ 
369+                 while  ( Unsafe . IsAddressLessThan ( ref  remainderStart ,  ref  remainderEnd ) ) 
357370                { 
358-                     ref  float  v  =  ref  remainder [ i ] ; 
359-                     v  =  Clamp ( v ,  min ,  max ) ; 
371+                     remainderStart  =  Clamp ( remainderStart ,  min ,  max ) ; 
372+ 
373+                     remainderStart  =  ref  Unsafe . Add ( ref  remainderStart ,  1 ) ; 
360374                } 
361375            } 
362376        } 
@@ -374,10 +388,14 @@ public static void Clamp(Span<double> span, double min, double max)
374388
375389            if  ( remainder . Length  >  0 ) 
376390            { 
377-                 for  ( int  i  =  0 ;  i  <  remainder . Length ;  i ++ ) 
391+                 ref  double  remainderStart  =  ref  MemoryMarshal . GetReference ( remainder ) ; 
392+                 ref  double  remainderEnd  =  ref  Unsafe . Add ( ref  remainderStart ,  remainder . Length ) ; 
393+ 
394+                 while  ( Unsafe . IsAddressLessThan ( ref  remainderStart ,  ref  remainderEnd ) ) 
378395                { 
379-                     ref  double  v  =  ref  remainder [ i ] ; 
380-                     v  =  Clamp ( v ,  min ,  max ) ; 
396+                     remainderStart  =  Clamp ( remainderStart ,  min ,  max ) ; 
397+ 
398+                     remainderStart  =  ref  Unsafe . Add ( ref  remainderStart ,  1 ) ; 
381399                } 
382400            } 
383401        } 
@@ -407,33 +425,42 @@ private static void ClampImpl<T>(Span<T> span, T min, T max)
407425            where  T  :  unmanaged
408426        { 
409427            ref  T  sRef  =  ref  MemoryMarshal . GetReference ( span ) ; 
410-             ref  Vector < T >  vsBase  =  ref  Unsafe . As < T ,  Vector < T > > ( ref  MemoryMarshal . GetReference ( span ) ) ; 
411428            var  vmin  =  new  Vector < T > ( min ) ; 
412429            var  vmax  =  new  Vector < T > ( max ) ; 
413430
414431            int  n  =  span . Length  /  Vector < T > . Count ; 
415432            int  m  =  Modulo4 ( n ) ; 
416433            int  u  =  n  -  m ; 
417434
418-             for  ( int  i  =  0 ;  i  <  u ;  i  +=  4 ) 
419-             { 
420-                 ref  Vector < T >  vs0  =  ref  Unsafe . Add ( ref  vsBase ,  i ) ; 
421-                 ref  Vector < T >  vs1  =  ref  Unsafe . Add ( ref  vs0 ,  1 ) ; 
422-                 ref  Vector < T >  vs2  =  ref  Unsafe . Add ( ref  vs0 ,  2 ) ; 
423-                 ref  Vector < T >  vs3  =  ref  Unsafe . Add ( ref  vs0 ,  3 ) ; 
435+             ref  Vector < T >  vs0  =  ref  Unsafe . As < T ,  Vector < T > > ( ref  MemoryMarshal . GetReference ( span ) ) ; 
436+             ref  Vector < T >  vs1  =  ref  Unsafe . Add ( ref  vs0 ,  1 ) ; 
437+             ref  Vector < T >  vs2  =  ref  Unsafe . Add ( ref  vs0 ,  2 ) ; 
438+             ref  Vector < T >  vs3  =  ref  Unsafe . Add ( ref  vs0 ,  3 ) ; 
439+             ref  Vector < T >  vsEnd  =  ref  Unsafe . Add ( ref  vs0 ,  u ) ; 
424440
441+             while  ( Unsafe . IsAddressLessThan ( ref  vs0 ,  ref  vsEnd ) ) 
442+             { 
425443                vs0  =  Vector . Min ( Vector . Max ( vmin ,  vs0 ) ,  vmax ) ; 
426444                vs1  =  Vector . Min ( Vector . Max ( vmin ,  vs1 ) ,  vmax ) ; 
427445                vs2  =  Vector . Min ( Vector . Max ( vmin ,  vs2 ) ,  vmax ) ; 
428446                vs3  =  Vector . Min ( Vector . Max ( vmin ,  vs3 ) ,  vmax ) ; 
447+ 
448+                 vs0  =  ref  Unsafe . Add ( ref  vs0 ,  4 ) ; 
449+                 vs1  =  ref  Unsafe . Add ( ref  vs1 ,  4 ) ; 
450+                 vs2  =  ref  Unsafe . Add ( ref  vs2 ,  4 ) ; 
451+                 vs3  =  ref  Unsafe . Add ( ref  vs3 ,  4 ) ; 
429452            } 
430453
431454            if  ( m  >  0 ) 
432455            { 
433-                 for  ( int  i  =  u ;  i  <  n ;  i ++ ) 
456+                 vs0  =  ref  vsEnd ; 
457+                 vsEnd  =  ref  Unsafe . Add ( ref  vsEnd ,  m ) ; 
458+ 
459+                 while  ( Unsafe . IsAddressLessThan ( ref  vs0 ,  ref  vsEnd ) ) 
434460                { 
435-                     ref  Vector < T >  vs0  =  ref  Unsafe . Add ( ref  vsBase ,  i ) ; 
436461                    vs0  =  Vector . Min ( Vector . Max ( vmin ,  vs0 ) ,  vmax ) ; 
462+ 
463+                     vs0  =  ref  Unsafe . Add ( ref  vs0 ,  1 ) ; 
437464                } 
438465            } 
439466        } 
@@ -472,10 +499,8 @@ public static void Premultiply(Span<Vector4> vectors)
472499#if SUPPORTS_RUNTIME_INTRINSICS 
473500            if  ( Avx2 . IsSupported  &&  vectors . Length  >=  2 ) 
474501            { 
475-                 ref  Vector256 < float >  vectorsBase  = 
476-                     ref  Unsafe . As < Vector4 ,  Vector256 < float > > ( ref  MemoryMarshal . GetReference ( vectors ) ) ; 
477- 
478502                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float> 
503+                 ref  Vector256 < float >  vectorsBase  =  ref  Unsafe . As < Vector4 ,  Vector256 < float > > ( ref  MemoryMarshal . GetReference ( vectors ) ) ; 
479504                ref  Vector256 < float >  vectorsLast  =  ref  Unsafe . Add ( ref  vectorsBase ,  ( IntPtr ) ( ( uint ) vectors . Length  /  2u ) ) ; 
480505
481506                while  ( Unsafe . IsAddressLessThan ( ref  vectorsBase ,  ref  vectorsLast ) ) 
@@ -495,12 +520,14 @@ public static void Premultiply(Span<Vector4> vectors)
495520            else 
496521#endif
497522            { 
498-                 ref  Vector4  baseRef  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
523+                 ref  Vector4  vectorsStart  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
524+                 ref  Vector4  vectorsEnd  =  ref  Unsafe . Add ( ref  vectorsStart ,  vectors . Length ) ; 
499525
500-                 for   ( int   i   =   0 ;   i   <   vectors . Length ;   i ++ ) 
526+                 while   ( Unsafe . IsAddressLessThan ( ref   vectorsStart ,   ref   vectorsEnd ) ) 
501527                { 
502-                     ref  Vector4  v  =  ref  Unsafe . Add ( ref  baseRef ,  i ) ; 
503-                     Premultiply ( ref  v ) ; 
528+                     Premultiply ( ref  vectorsStart ) ; 
529+ 
530+                     vectorsStart  =  ref  Unsafe . Add ( ref  vectorsStart ,  1 ) ; 
504531                } 
505532            } 
506533        } 
@@ -515,10 +542,8 @@ public static void UnPremultiply(Span<Vector4> vectors)
515542#if SUPPORTS_RUNTIME_INTRINSICS 
516543            if  ( Avx2 . IsSupported  &&  vectors . Length  >=  2 ) 
517544            { 
518-                 ref  Vector256 < float >  vectorsBase  = 
519-                     ref  Unsafe . As < Vector4 ,  Vector256 < float > > ( ref  MemoryMarshal . GetReference ( vectors ) ) ; 
520- 
521545                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float> 
546+                 ref  Vector256 < float >  vectorsBase  =  ref  Unsafe . As < Vector4 ,  Vector256 < float > > ( ref  MemoryMarshal . GetReference ( vectors ) ) ; 
522547                ref  Vector256 < float >  vectorsLast  =  ref  Unsafe . Add ( ref  vectorsBase ,  ( IntPtr ) ( ( uint ) vectors . Length  /  2u ) ) ; 
523548
524549                while  ( Unsafe . IsAddressLessThan ( ref  vectorsBase ,  ref  vectorsLast ) ) 
@@ -538,12 +563,14 @@ public static void UnPremultiply(Span<Vector4> vectors)
538563            else 
539564#endif
540565            { 
541-                 ref  Vector4  baseRef  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
566+                 ref  Vector4  vectorsStart  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
567+                 ref  Vector4  vectorsEnd  =  ref  Unsafe . Add ( ref  vectorsStart ,  vectors . Length ) ; 
542568
543-                 for   ( int   i   =   0 ;   i   <   vectors . Length ;   i ++ ) 
569+                 while   ( Unsafe . IsAddressLessThan ( ref   vectorsStart ,   ref   vectorsEnd ) ) 
544570                { 
545-                     ref  Vector4  v  =  ref  Unsafe . Add ( ref  baseRef ,  i ) ; 
546-                     UnPremultiply ( ref  v ) ; 
571+                     UnPremultiply ( ref  vectorsStart ) ; 
572+ 
573+                     vectorsStart  =  ref  Unsafe . Add ( ref  vectorsStart ,  1 ) ; 
547574                } 
548575            } 
549576        } 
@@ -633,53 +660,54 @@ public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
633660                    vectors128Ref  =  y4 ; 
634661                    vectors128Ref  =  ref  Unsafe . Add ( ref  vectors128Ref ,  1 ) ; 
635662                } 
636- 
637-                 return ; 
638663            } 
664+             else 
639665#endif
640-             ref  Vector4  vectorsRef  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
641-             ref  Vector4  vectorsEnd  =  ref  Unsafe . Add ( ref  vectorsRef ,  vectors . Length ) ; 
642- 
643-             // Fallback with scalar preprocessing and vectorized approximation steps 
644-             while  ( Unsafe . IsAddressLessThan ( ref  vectorsRef ,  ref  vectorsEnd ) ) 
645666            { 
646-                 Vector4  v  =  vectorsRef ; 
667+                 ref  Vector4  vectorsRef  =  ref  MemoryMarshal . GetReference ( vectors ) ; 
668+                 ref  Vector4  vectorsEnd  =  ref  Unsafe . Add ( ref  vectorsRef ,  vectors . Length ) ; 
647669
648-                 double 
649-                     x64  =  v . X , 
650-                     y64  =  v . Y , 
651-                     z64  =  v . Z ; 
652-                 float  a  =  v . W ; 
653- 
654-                 ulong 
655-                     xl  =  * ( ulong * ) & x64 , 
656-                     yl =  * ( ulong * ) & y64 , 
657-                     zl =  * ( ulong * ) & z64 ; 
658- 
659-                 // Here we use a trick to compute the starting value x0 for the cube root. This is because doing 
660-                 // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case, 
661-                 // this means what we actually want is to find the cube root of our clamped values. 
662-                 // For more info on the  constant below, see: 
663-                 // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. 
664-                 // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and 
665-                 // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit 
666-                 // register, and use it to accelerate two steps of the Newton approximation using SIMD. 
667-                 xl  =  0x2a9f8a7be393b600  +  ( xl  /  3 ) ; 
668-                 yl  =  0x2a9f8a7be393b600  +  ( yl  /  3 ) ; 
669-                 zl  =  0x2a9f8a7be393b600  +  ( zl  /  3 ) ; 
670- 
671-                 Vector4  y4 ; 
672-                 y4. X =  ( float ) * ( double * ) & xl ; 
673-                 y4. Y  =  ( float ) * ( double * ) & yl ; 
674-                 y4. Z  =  ( float ) * ( double * ) & zl ; 
675-                 y4. W  =  0 ; 
676- 
677-                 y4  =  ( 2  /  3f  *  y4 )  +  ( 1  /  3f  *  ( v  /  ( y4  *  y4 ) ) ) ; 
678-                 y4  =  ( 2  /  3f  *  y4 )  +  ( 1  /  3f  *  ( v  /  ( y4  *  y4 ) ) ) ; 
679-                 y4 . W  =  a ; 
680- 
681-                 vectorsRef =  y4 ; 
682-                 vectorsRef =  ref  Unsafe . Add ( ref  vectorsRef ,  1 ) ; 
670+                 // Fallback with scalar preprocessing and vectorized approximation steps 
671+                 while  ( Unsafe . IsAddressLessThan ( ref  vectorsRef ,  ref  vectorsEnd ) ) 
672+                 { 
673+                     Vector4  v  =  vectorsRef ; 
674+ 
675+                     double 
676+                         x64  =  v . X , 
677+                         y64  =  v . Y , 
678+                         z64  =  v . Z ; 
679+                     float  a  =  v . W ; 
680+ 
681+                     ulong 
682+                         xl  =  * ( ulong * ) & x64 , 
683+                         yl  =  * ( ulong * ) & y64 , 
684+                         zl  =  * ( ulong * ) & z64 ; 
685+ 
686+                     // Here we use a trick to compute the starting value x0 for the cube root. This is because doing 
687+                     // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case, 
688+                     // this means what we actually want is to find the cube root of our clamped values. 
689+                     // For more info on the  constant below, see: 
690+                     // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. 
691+                     // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and 
692+                     // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit 
693+                     // register, and use it to accelerate two steps of the Newton approximation using SIMD. 
694+                     xl  =  0x2a9f8a7be393b600  +  ( xl  /  3 ) ; 
695+                     yl  =  0x2a9f8a7be393b600  +  ( yl  /  3 ) ; 
696+                     zl  =  0x2a9f8a7be393b600  +  ( zl  /  3 ) ; 
697+ 
698+                     Vector4  y4 ; 
699+                     y4 . X  =  ( float ) * ( double * ) & xl ; 
700+                     y4. Y  =  ( float ) * ( double * ) & yl ; 
701+                     y4. Z  =  ( float ) * ( double * ) & zl ; 
702+                     y4. W  =  0 ; 
703+ 
704+                     y4  =  ( 2  /  3f  *  y4 )  +  ( 1  /  3f  *  ( v  /  ( y4  *  y4 ) ) ) ; 
705+                     y4  =  ( 2  /  3f  *  y4 )  +  ( 1  /  3f  *  ( v  /  ( y4  *  y4 ) ) ) ; 
706+                     y4 . W  =  a ; 
707+ 
708+                     vectorsRef =  y4 ; 
709+                     vectorsRef =  ref  Unsafe . Add ( ref  vectorsRef ,  1 ) ; 
710+                 } 
683711            } 
684712        } 
685713    } 
0 commit comments