From 23ed7eb2c1be68e402e702e8af3e3019bfc89a30 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Thu, 8 Aug 2024 16:05:10 -0700
Subject: [PATCH 1/7] Ensure that aggregation is consistent regardless of data
 alignment

---
 .../TensorPrimitives.IAggregationOperator.cs  | 21 +++++++++----
 .../TensorPrimitives.Single.netstandard.cs    | 31 +++----------------
 2 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
index df03ad1634d288..cb9af1edd7f166 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -1227,9 +1227,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1418,9 +1421,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1609,9 +1615,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index c9474cb470fd7f..559a50051eb056 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -467,8 +467,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 Vector<float> end = binaryOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector<float>.Count)),
                                                     AsVector(ref yRef, remainder - (uint)(Vector<float>.Count)));
 
-                nuint misalignment = 0;
-
                 if (remainder > (uint)(Vector<float>.Count * 8))
                 {
                     // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful
@@ -480,29 +478,9 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                         float* xPtr = px;
                         float* yPtr = py;
 
-                        // We need to the ensure the underlying data can be aligned and only align
-                        // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
-
-                        bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0;
-
-                        if (canAlign)
-                        {
-                            // Compute by how many elements we're misaligned and adjust the pointers accordingly
-                            //
-                            // Noting that we are only actually aligning dPtr. This is because unaligned stores
-                            // are more expensive than unaligned loads and aligning both is significantly more
-                            // complex.
-
-                            misalignment = ((uint)(sizeof(Vector<float>)) - ((nuint)(xPtr) % (uint)(sizeof(Vector<float>)))) / sizeof(float);
-
-                            xPtr += misalignment;
-                            yPtr += misalignment;
-
-                            Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector<float>))) == 0);
-
-                            remainder -= misalignment;
-                        }
+                        // Unlike many other vectorization algorithms, we cannot align for aggregation
+                        // because that changes how results compound together and can cause a significant
+                        // difference in the output.
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -564,7 +542,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 // Store the first block. Handling this separately simplifies the latter code as we know
                 // they come after and so we can relegate it to full blocks or the trailing elements
 
-                beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector<float>(aggregationOp.IdentityValue));
                 vresult = aggregationOp.Invoke(vresult, beg);
 
                 // Process the remaining [0, Count * 7] elements via a jump table
@@ -575,7 +552,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
-                blocks -= (misalignment == 0) ? 1u : 0u;
+                blocks -= 1u;
                 remainder -= trailing;
 
                 switch (blocks)

From b3d460ede48262d51a4280bd88b70d89b6e76404 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Aug 2024 09:26:36 -0700
Subject: [PATCH 2/7] Ensure we handle this for all aggregation helpers

---
 .../TensorPrimitives.IAggregationOperator.cs  | 21 +++++++++----
 .../TensorPrimitives.Single.netstandard.cs    | 30 +++----------------
 2 files changed, 19 insertions(+), 32 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
index cb9af1edd7f166..8a1c29d0ddd564 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -141,9 +141,12 @@ static T Vectorized128(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -310,9 +313,12 @@ static T Vectorized256(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -479,9 +485,12 @@ static T Vectorized512(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index 559a50051eb056..b5af49a24fa5f9 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -164,8 +164,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                 Vector<float> beg = transformOp.Invoke(AsVector(ref xRef));
                 Vector<float> end = transformOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector<float>.Count)));
 
-                nuint misalignment = 0;
-
                 if (remainder > (uint)(Vector<float>.Count * 8))
                 {
                     // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful
@@ -175,28 +173,9 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                     {
                         float* xPtr = px;
 
-                        // We need to the ensure the underlying data can be aligned and only align
-                        // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
-
-                        bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0;
-
-                        if (canAlign)
-                        {
-                            // Compute by how many elements we're misaligned and adjust the pointers accordingly
-                            //
-                            // Noting that we are only actually aligning dPtr. This is because unaligned stores
-                            // are more expensive than unaligned loads and aligning both is significantly more
-                            // complex.
-
-                            misalignment = ((uint)(sizeof(Vector<float>)) - ((nuint)(xPtr) % (uint)(sizeof(Vector<float>)))) / sizeof(float);
-
-                            xPtr += misalignment;
-
-                            Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector<float>))) == 0);
-
-                            remainder -= misalignment;
-                        }
+                        // Unlike many other vectorization algorithms, we cannot align for aggregation
+                        // because that changes how results compound together and can cause a significant
+                        // difference in the output.
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -248,7 +227,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                 // Store the first block. Handling this separately simplifies the latter code as we know
                 // they come after and so we can relegate it to full blocks or the trailing elements
 
-                beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector<float>(aggregationOp.IdentityValue));
                 vresult = aggregationOp.Invoke(vresult, beg);
 
                 // Process the remaining [0, Count * 7] elements via a jump table
@@ -259,7 +237,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
-                blocks -= (misalignment == 0) ? 1u : 0u;
+                blocks -= 1u;
                 remainder -= trailing;
 
                 switch (blocks)

From 864920d60d72d0977d36e87d527410ef8062ded8 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Aug 2024 11:17:45 -0700
Subject: [PATCH 3/7] Ensure we don't process beg twice

---
 .../TensorPrimitives.IAggregationOperator.cs  | 63 +++++++++++++++++++
 .../TensorPrimitives.Single.netstandard.cs    | 16 ++++-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
index 8a1c29d0ddd564..a733e37cf64df0 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -164,6 +164,16 @@ static T Vectorized128(ref T xRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector128<T>.Count;
+
+                            remainder -= (uint)Vector128<T>.Count;
+                        }
 
                         Vector128<T> vector1;
                         Vector128<T> vector2;
@@ -336,6 +346,16 @@ static T Vectorized256(ref T xRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector256<T>.Count;
+
+                            remainder -= (uint)Vector256<T>.Count;
+                        }
 
                         Vector256<T> vector1;
                         Vector256<T> vector2;
@@ -508,6 +528,16 @@ static T Vectorized512(ref T xRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector512<T>.Count;
+
+                            remainder -= (uint)Vector512<T>.Count;
+                        }
 
                         Vector512<T> vector1;
                         Vector512<T> vector2;
@@ -1260,6 +1290,17 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector128<T>.Count;
+                            yPtr += (uint)Vector128<T>.Count;
+
+                            remainder -= (uint)Vector128<T>.Count;
+                        }
 
                         Vector128<T> vector1;
                         Vector128<T> vector2;
@@ -1454,6 +1495,17 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector256<T>.Count;
+                            yPtr += (uint)Vector256<T>.Count;
+
+                            remainder -= (uint)Vector256<T>.Count;
+                        }
 
                         Vector256<T> vector1;
                         Vector256<T> vector2;
@@ -1648,6 +1700,17 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means we're processing the full data from beg
+                            // so account for that to ensure we don't double process and include them in the
+                            // aggregate twice.
+
+                            xPtr += (uint)Vector512<T>.Count;
+                            yPtr += (uint)Vector512<T>.Count;
+
+                            remainder -= (uint)Vector512<T>.Count;
+                        }
 
                         Vector512<T> vector1;
                         Vector512<T> vector2;
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index b5af49a24fa5f9..e5fa6ede054617 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -175,7 +175,13 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
 
                         // Unlike many other vectorization algorithms, we cannot align for aggregation
                         // because that changes how results compound together and can cause a significant
-                        // difference in the output.
+                        // difference in the output. This also means we're processing the full data from beg
+                        // so account for that to ensure we don't double process and include them in the
+                        // aggregate twice.
+
+                        xPtr += (uint)Vector<float>.Count;
+
+                        remainder -= (uint)Vector<float>.Count;
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -458,7 +464,13 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
 
                         // Unlike many other vectorization algorithms, we cannot align for aggregation
                         // because that changes how results compound together and can cause a significant
-                        // difference in the output.
+                        // difference in the output. This also means we're processing the full data from beg
+                        // so account for that to ensure we don't double process and include them in the
+                        // aggregate twice.
+
+                        xPtr += (uint)Vector<float>.Count;
+
+                        remainder -= (uint)Vector<float>.Count;
 
                         Vector<float> vector1;
                         Vector<float> vector2;

From 238c94794ac487c1452e5b713d1468341db2cc9a Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Mon, 12 Aug 2024 10:18:49 -0700
Subject: [PATCH 4/7] Ensure that we properly track misalignment when we can't align

---
 .../TensorPrimitives.IAggregationOperator.cs  | 43 +++++++++++--------
 .../TensorPrimitives.Single.netstandard.cs    |  2 -
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
index a733e37cf64df0..6c88a8eed0a7a2 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -159,7 +159,6 @@ static T Vectorized128(ref T xRef, nuint remainder)
                             misalignment = ((uint)sizeof(Vector128<T>) - ((nuint)xPtr % (uint)sizeof(Vector128<T>))) / (uint)sizeof(T);
 
                             xPtr += misalignment;
-
                             Debug.Assert(((nuint)xPtr % (uint)sizeof(Vector128<T>)) == 0);
 
                             remainder -= misalignment;
@@ -170,9 +169,9 @@ static T Vectorized128(ref T xRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector128<T>.Count;
-
-                            remainder -= (uint)Vector128<T>.Count;
+                            misalignment = (uint)Vector128<T>.Count;
+                            xPtr += misalignment;
+                            remainder -= misalignment;
                         }
 
                         Vector128<T> vector1;
@@ -352,9 +351,9 @@ static T Vectorized256(ref T xRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector256<T>.Count;
-
-                            remainder -= (uint)Vector256<T>.Count;
+                            misalignment = (uint)Vector256<T>.Count
+                            xPtr += misalignment;
+                            remainder -= misalignment;
                         }
 
                         Vector256<T> vector1;
@@ -534,9 +533,9 @@ static T Vectorized512(ref T xRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector512<T>.Count;
-
-                            remainder -= (uint)Vector512<T>.Count;
+                            misalignment = (uint)Vector512<T>.Count
+                            xPtr += misalignment;
+                            remainder -= misalignment;
                         }
 
                         Vector512<T> vector1;
@@ -1296,10 +1295,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector128<T>.Count;
-                            yPtr += (uint)Vector128<T>.Count;
+                            misalignment = (uint)Vector128<T>.Count
+
+                            xPtr += misalignment;
+                            yPtr += misalignment;
 
-                            remainder -= (uint)Vector128<T>.Count;
+                            remainder -= misalignment;
                         }
 
                         Vector128<T> vector1;
@@ -1501,10 +1502,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector256<T>.Count;
-                            yPtr += (uint)Vector256<T>.Count;
+                            misalignment = (uint)Vector256<T>.Count
+
+                            xPtr += misalignment;
+                            yPtr += misalignment;
 
-                            remainder -= (uint)Vector256<T>.Count;
+                            remainder -= misalignment;
                         }
 
                         Vector256<T> vector1;
@@ -1706,10 +1709,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            xPtr += (uint)Vector512<T>.Count;
-                            yPtr += (uint)Vector512<T>.Count;
+                            misalignment = (uint)Vector512<T>.Count
 
-                            remainder -= (uint)Vector512<T>.Count;
+                            xPtr += misalignment;
+                            yPtr += misalignment;
+
+                            remainder -= misalignment;
                         }
 
                         Vector512<T> vector1;
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index e5fa6ede054617..a22c212e0c455a 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -243,7 +243,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
-                blocks -= 1u;
                 remainder -= trailing;
 
                 switch (blocks)
@@ -542,7 +541,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
-                blocks -= 1u;
                 remainder -= trailing;
 
                 switch (blocks)

From 23c043cf132918c293c232601f3b8822cf453949 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Mon, 12 Aug 2024 12:14:52 -0700
Subject: [PATCH 5/7] Add missing semicolon

---
 .../Common/TensorPrimitives.IAggregationOperator.cs    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
index 6c88a8eed0a7a2..1c8d215bd137d8 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -351,7 +351,7 @@ static T Vectorized256(ref T xRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            misalignment = (uint)Vector256<T>.Count
+                            misalignment = (uint)Vector256<T>.Count;
                             xPtr += misalignment;
                             remainder -= misalignment;
                         }
@@ -533,7 +533,7 @@ static T Vectorized512(ref T xRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            misalignment = (uint)Vector512<T>.Count
+                            misalignment = (uint)Vector512<T>.Count;
                             xPtr += misalignment;
                             remainder -= misalignment;
                         }
@@ -1295,7 +1295,7 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            misalignment = (uint)Vector128<T>.Count
+                            misalignment = (uint)Vector128<T>.Count;
 
                             xPtr += misalignment;
                             yPtr += misalignment;
@@ -1502,7 +1502,7 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            misalignment = (uint)Vector256<T>.Count
+                            misalignment = (uint)Vector256<T>.Count;
 
                             xPtr += misalignment;
                             yPtr += misalignment;
@@ -1709,7 +1709,7 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
                             // so account for that to ensure we don't double process and include them in the
                             // aggregate twice.
 
-                            misalignment = (uint)Vector512<T>.Count
+                            misalignment = (uint)Vector512<T>.Count;
 
                             xPtr += misalignment;
                             yPtr += misalignment;

From 1f8f4a8627dad600ca696384e63df4eacff5b6a6 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Mon, 12 Aug 2024 15:07:15 -0700
Subject: [PATCH 6/7] Fix the handling on .NET Framework

---
 .../TensorPrimitives.Single.netstandard.cs    | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index a22c212e0c455a..0964f18c3d3834 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -164,6 +164,8 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                 Vector<float> beg = transformOp.Invoke(AsVector(ref xRef));
                 Vector<float> end = transformOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector<float>.Count)));
 
+                nuint misalignment = 0;
+
                 if (remainder > (uint)(Vector<float>.Count * 8))
                 {
                     // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful
@@ -179,9 +181,9 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                         // so account for that to ensure we don't double process and include them in the
                         // aggregate twice.
 
-                        xPtr += (uint)Vector<float>.Count;
-
-                        remainder -= (uint)Vector<float>.Count;
+                        misalignment = (uint)Vector<float>.Count;
+                        xPtr += misalignment;
+                        remainder -= misalignment;
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -233,6 +235,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                 // Store the first block. Handling this separately simplifies the latter code as we know
                 // they come after and so we can relegate it to full blocks or the trailing elements
 
+                beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector<float>(aggregationOp.IdentityValue));
                 vresult = aggregationOp.Invoke(vresult, beg);
 
                 // Process the remaining [0, Count * 7] elements via a jump table
@@ -243,6 +246,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
+                blocks -= (misalignment == 0) ? 1u : 0u;
                 remainder -= trailing;
 
                 switch (blocks)
@@ -450,6 +454,8 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 Vector<float> end = binaryOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector<float>.Count)),
                                                     AsVector(ref yRef, remainder - (uint)(Vector<float>.Count)));
 
+                nuint misalignment = 0;
+
                 if (remainder > (uint)(Vector<float>.Count * 8))
                 {
                     // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful
@@ -467,9 +473,9 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                         // so account for that to ensure we don't double process and include them in the
                         // aggregate twice.
 
-                        xPtr += (uint)Vector<float>.Count;
-
-                        remainder -= (uint)Vector<float>.Count;
+                        misalignment = (uint)Vector<float>.Count;
+                        xPtr += misalignment;
+                        remainder -= misalignment;
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -531,6 +537,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 // Store the first block. Handling this separately simplifies the latter code as we know
                 // they come after and so we can relegate it to full blocks or the trailing elements
 
+                beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector<float>(aggregationOp.IdentityValue));
                 vresult = aggregationOp.Invoke(vresult, beg);
 
                 // Process the remaining [0, Count * 7] elements via a jump table
@@ -541,6 +548,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
+                blocks -= (misalignment == 0) ? 1u : 0u;
                 remainder -= trailing;
 
                 switch (blocks)

From 211d70f55b10e487a76619509a3b4f5efdcd3403 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Mon, 12 Aug 2024 18:43:18 -0700
Subject: [PATCH 7/7] Ensure yPtr on .NET Framework is incremented as well

---
 .../Tensors/netstandard/TensorPrimitives.Single.netstandard.cs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
index 0964f18c3d3834..563080bf742c24 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -474,7 +474,10 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                         // aggregate twice.
 
                         misalignment = (uint)Vector<float>.Count;
+
                         xPtr += misalignment;
+                        yPtr += misalignment;
+
                         remainder -= misalignment;
 
                         Vector<float> vector1;