From 23ed7eb2c1be68e402e702e8af3e3019bfc89a30 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 8 Aug 2024 16:05:10 -0700 Subject: [PATCH 1/7] Ensure that aggregation is consistent regardless of data alignment --- .../TensorPrimitives.IAggregationOperator.cs | 21 +++++++++---- .../TensorPrimitives.Single.netstandard.cs | 31 +++---------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs index df03ad1634d288..cb9af1edd7f166 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs @@ -1227,9 +1227,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { @@ -1418,9 +1421,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { @@ -1609,9 +1615,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index c9474cb470fd7f..559a50051eb056 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -467,8 +467,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary Vector end = binaryOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector.Count)), AsVector(ref yRef, remainder - (uint)(Vector.Count))); - nuint misalignment = 0; - if (remainder > (uint)(Vector.Count * 8)) { // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful @@ -480,29 +478,9 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary float* xPtr = px; float* yPtr = py; - // We need to the ensure the underlying data can be aligned and only align - // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. - - bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0; - - if (canAlign) - { - // Compute by how many elements we're misaligned and adjust the pointers accordingly - // - // Noting that we are only actually aligning dPtr. This is because unaligned stores - // are more expensive than unaligned loads and aligning both is significantly more - // complex. - - misalignment = ((uint)(sizeof(Vector)) - ((nuint)(xPtr) % (uint)(sizeof(Vector)))) / sizeof(float); - - xPtr += misalignment; - yPtr += misalignment; - - Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector))) == 0); - - remainder -= misalignment; - } + // Unlike many other vectorization algorithms, we cannot align for aggregation + // because that changes how results compound together and can cause a significant + // difference in the output. Vector vector1; Vector vector2; @@ -564,7 +542,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary // Store the first block. Handling this separately simplifies the latter code as we know // they come after and so we can relegate it to full blocks or the trailing elements - beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector(aggregationOp.IdentityValue)); vresult = aggregationOp.Invoke(vresult, beg); // Process the remaining [0, Count * 7] elements via a jump table @@ -575,7 +552,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); - blocks -= (misalignment == 0) ? 1u : 0u; + blocks -= 1u; remainder -= trailing; switch (blocks) From b3d460ede48262d51a4280bd88b70d89b6e76404 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 9 Aug 2024 09:26:36 -0700 Subject: [PATCH 2/7] Ensure we handle for all aggregation helpers --- .../TensorPrimitives.IAggregationOperator.cs | 21 +++++++++---- .../TensorPrimitives.Single.netstandard.cs | 30 +++---------------- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs index cb9af1edd7f166..8a1c29d0ddd564 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs @@ -141,9 +141,12 @@ static T Vectorized128(ref T xRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { @@ -310,9 +313,12 @@ static T Vectorized256(ref T xRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { @@ -479,9 +485,12 @@ static T Vectorized512(ref T xRef, nuint remainder) // We need to the ensure the underlying data can be aligned and only align // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. + // can never achieve the required SIMD alignment. This cannot be done for + // float or double since that changes how results compound together. - bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0; + bool canAlign = (typeof(T) != typeof(float)) && + (typeof(T) != typeof(double)) && + ((nuint)xPtr % (nuint)sizeof(T)) == 0; if (canAlign) { diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index 559a50051eb056..b5af49a24fa5f9 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -164,8 +164,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran Vector beg = transformOp.Invoke(AsVector(ref xRef)); Vector end = transformOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector.Count))); - nuint misalignment = 0; - if (remainder > (uint)(Vector.Count * 8)) { // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful @@ -175,28 +173,9 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran { float* xPtr = px; - // We need to the ensure the underlying data can be aligned and only align - // it if it can. It is possible we have an unaligned ref, in which case we - // can never achieve the required SIMD alignment. - - bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0; - - if (canAlign) - { - // Compute by how many elements we're misaligned and adjust the pointers accordingly - // - // Noting that we are only actually aligning dPtr. This is because unaligned stores - // are more expensive than unaligned loads and aligning both is significantly more - // complex. - - misalignment = ((uint)(sizeof(Vector)) - ((nuint)(xPtr) % (uint)(sizeof(Vector)))) / sizeof(float); - - xPtr += misalignment; - - Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector))) == 0); - - remainder -= misalignment; - } + // Unlike many other vectorization algorithms, we cannot align for aggregation + // because that changes how results compound together and can cause a significant + // difference in the output. Vector vector1; Vector vector2; @@ -248,7 +227,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran // Store the first block. Handling this separately simplifies the latter code as we know // they come after and so we can relegate it to full blocks or the trailing elements - beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector(aggregationOp.IdentityValue)); vresult = aggregationOp.Invoke(vresult, beg); // Process the remaining [0, Count * 7] elements via a jump table @@ -259,7 +237,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); - blocks -= (misalignment == 0) ? 1u : 0u; + blocks -= 1u; remainder -= trailing; switch (blocks) From 864920d60d72d0977d36e87d527410ef8062ded8 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 9 Aug 2024 11:17:45 -0700 Subject: [PATCH 3/7] Ensure we don't process beg twice --- .../TensorPrimitives.IAggregationOperator.cs | 63 +++++++++++++++++++ .../TensorPrimitives.Single.netstandard.cs | 16 ++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs index 8a1c29d0ddd564..a733e37cf64df0 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs @@ -164,6 +164,16 @@ static T Vectorized128(ref T xRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector128.Count; + + remainder -= (uint)Vector128.Count; + } Vector128 vector1; Vector128 vector2; @@ -336,6 +346,16 @@ static T Vectorized256(ref T xRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector256.Count; + + remainder -= (uint)Vector256.Count; + } Vector256 vector1; Vector256 vector2; @@ -508,6 +528,16 @@ static T Vectorized512(ref T xRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector512.Count; + + remainder -= (uint)Vector512.Count; + } Vector512 vector1; Vector512 vector2; @@ -1260,6 +1290,17 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector128.Count; + yPtr += (uint)Vector128.Count; + + remainder -= (uint)Vector128.Count; + } Vector128 vector1; Vector128 vector2; @@ -1454,6 +1495,17 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector256.Count; + yPtr += (uint)Vector256.Count; + + remainder -= (uint)Vector256.Count; + } Vector256 vector1; Vector256 vector2; @@ -1648,6 +1700,17 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder) remainder -= misalignment; } + else + { + // We can't align, but this also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector512.Count; + yPtr += (uint)Vector512.Count; + + remainder -= (uint)Vector512.Count; + } Vector512 vector1; Vector512 vector2; diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index b5af49a24fa5f9..e5fa6ede054617 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -175,7 +175,13 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran // Unlike many other vectorization algorithms, we cannot align for aggregation // because that changes how results compound together and can cause a significant - // difference in the output. + // difference in the output. This also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector.Count; + + remainder -= (uint)Vector.Count; Vector vector1; Vector vector2; @@ -458,7 +464,13 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary // Unlike many other vectorization algorithms, we cannot align for aggregation // because that changes how results compound together and can cause a significant - // difference in the output. + // difference in the output. This also means we're processing the full data from beg + // so account for that to ensure we don't double process and include them in the + // aggregate twice. + + xPtr += (uint)Vector.Count; + + remainder -= (uint)Vector.Count; Vector vector1; Vector vector2; From 238c94794ac487c1452e5b713d1468341db2cc9a Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 12 Aug 2024 10:18:49 -0700 Subject: [PATCH 4/7] Ensure that we properly track in the case we can't align --- .../TensorPrimitives.IAggregationOperator.cs | 43 +++++++++++-------- .../TensorPrimitives.Single.netstandard.cs | 2 - 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs index a733e37cf64df0..6c88a8eed0a7a2 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs @@ -159,7 +159,6 @@ static T Vectorized128(ref T xRef, nuint remainder) misalignment = ((uint)sizeof(Vector128) - ((nuint)xPtr % (uint)sizeof(Vector128))) / (uint)sizeof(T); xPtr += misalignment; - Debug.Assert(((nuint)xPtr % (uint)sizeof(Vector128)) == 0); remainder -= misalignment; @@ -170,9 +169,9 @@ static T Vectorized128(ref T xRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector128.Count; - - remainder -= (uint)Vector128.Count; + misalignment = (uint)Vector128.Count; + xPtr += misalignment; + remainder -= misalignment; } Vector128 vector1; @@ -352,9 +351,9 @@ static T Vectorized256(ref T xRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector256.Count; - - remainder -= (uint)Vector256.Count; + misalignment = (uint)Vector256.Count + xPtr += misalignment; + remainder -= misalignment; } Vector256 vector1; @@ -534,9 +533,9 @@ static T Vectorized512(ref T xRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector512.Count; - - remainder -= (uint)Vector512.Count; + misalignment = (uint)Vector512.Count + xPtr += misalignment; + remainder -= misalignment; } Vector512 vector1; @@ -1296,10 +1295,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector128.Count; - yPtr += (uint)Vector128.Count; + misalignment = (uint)Vector128.Count + + xPtr += misalignment; + yPtr += misalignment; - remainder -= (uint)Vector128.Count; + remainder -= misalignment; } Vector128 vector1; @@ -1501,10 +1502,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector256.Count; - yPtr += (uint)Vector256.Count; + misalignment = (uint)Vector256.Count + + xPtr += misalignment; + yPtr += misalignment; - remainder -= (uint)Vector256.Count; + remainder -= misalignment; } Vector256 vector1; @@ -1706,10 +1709,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector512.Count; - yPtr += (uint)Vector512.Count; + misalignment = (uint)Vector512.Count - remainder -= (uint)Vector512.Count; + xPtr += misalignment; + yPtr += misalignment; + + remainder -= misalignment; } Vector512 vector1; diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index e5fa6ede054617..a22c212e0c455a 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -243,7 +243,6 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); - blocks -= 1u; remainder -= trailing; switch (blocks) @@ -542,7 +541,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); - blocks -= 1u; remainder -= trailing; switch (blocks) From 23c043cf132918c293c232601f3b8822cf453949 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 12 Aug 2024 12:14:52 -0700 Subject: [PATCH 5/7] Add missing semicolon --- .../Common/TensorPrimitives.IAggregationOperator.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs index 6c88a8eed0a7a2..1c8d215bd137d8 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs @@ -351,7 +351,7 @@ static T Vectorized256(ref T xRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - misalignment = (uint)Vector256.Count + misalignment = (uint)Vector256.Count; xPtr += misalignment; remainder -= misalignment; } @@ -533,7 +533,7 @@ static T Vectorized512(ref T xRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - misalignment = (uint)Vector512.Count + misalignment = (uint)Vector512.Count; xPtr += misalignment; remainder -= misalignment; } @@ -1295,7 +1295,7 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - misalignment = (uint)Vector128.Count + misalignment = (uint)Vector128.Count; xPtr += misalignment; yPtr += misalignment; @@ -1502,7 +1502,7 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - misalignment = (uint)Vector256.Count + misalignment = (uint)Vector256.Count; xPtr += misalignment; yPtr += misalignment; @@ -1709,7 +1709,7 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder) // so account for that to ensure we don't double process and include them in the // aggregate twice. - misalignment = (uint)Vector512.Count + misalignment = (uint)Vector512.Count; xPtr += misalignment; yPtr += misalignment; From 1f8f4a8627dad600ca696384e63df4eacff5b6a6 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 12 Aug 2024 15:07:15 -0700 Subject: [PATCH 6/7] Fix the handling on .NET Framework --- .../TensorPrimitives.Single.netstandard.cs | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index a22c212e0c455a..0964f18c3d3834 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -164,6 +164,8 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran Vector beg = transformOp.Invoke(AsVector(ref xRef)); Vector end = transformOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector.Count))); + nuint misalignment = 0; + if (remainder > (uint)(Vector.Count * 8)) { // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful @@ -179,9 +181,9 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector.Count; - - remainder -= (uint)Vector.Count; + misalignment = (uint)Vector.Count; + xPtr += misalignment; + remainder -= misalignment; Vector vector1; Vector vector2; @@ -233,6 +235,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran // Store the first block. Handling this separately simplifies the latter code as we know // they come after and so we can relegate it to full blocks or the trailing elements + beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector(aggregationOp.IdentityValue)); vresult = aggregationOp.Invoke(vresult, beg); // Process the remaining [0, Count * 7] elements via a jump table @@ -243,6 +246,7 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); + blocks -= (misalignment == 0) ? 1u : 0u; remainder -= trailing; switch (blocks) @@ -450,6 +454,8 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary Vector end = binaryOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector.Count)), AsVector(ref yRef, remainder - (uint)(Vector.Count))); + nuint misalignment = 0; + if (remainder > (uint)(Vector.Count * 8)) { // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful @@ -467,9 +473,9 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary // so account for that to ensure we don't double process and include them in the // aggregate twice. - xPtr += (uint)Vector.Count; - - remainder -= (uint)Vector.Count; + misalignment = (uint)Vector.Count; + xPtr += misalignment; + remainder -= misalignment; Vector vector1; Vector vector2; @@ -531,6 +537,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary // Store the first block. Handling this separately simplifies the latter code as we know // they come after and so we can relegate it to full blocks or the trailing elements + beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector(aggregationOp.IdentityValue)); vresult = aggregationOp.Invoke(vresult, beg); // Process the remaining [0, Count * 7] elements via a jump table @@ -541,6 +548,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary nuint blocks = remainder / (nuint)(Vector.Count); nuint trailing = remainder - (blocks * (nuint)(Vector.Count)); + blocks -= (misalignment == 0) ? 1u : 0u; remainder -= trailing; switch (blocks) From 211d70f55b10e487a76619509a3b4f5efdcd3403 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 12 Aug 2024 18:43:18 -0700 Subject: [PATCH 7/7] Ensure yptr on .NET Framework is incremented as well --- .../Tensors/netstandard/TensorPrimitives.Single.netstandard.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs index 0964f18c3d3834..563080bf742c24 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs @@ -474,7 +474,10 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary // aggregate twice. misalignment = (uint)Vector.Count; + xPtr += misalignment; + yPtr += misalignment; + remainder -= misalignment; Vector vector1;