From 9411ba56f680d5b6678a2e50b50d2a218d3f6f94 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 18 Jun 2025 13:25:01 +0100 Subject: [PATCH 1/7] Arm64 SVE: re-enable use of predicate variants Fixes #101970 in #115566 Adds a simple costing to fgMorphTryUseAllMaskVariant() and assumes nodes can always be converted to masks (using ConvertVectorToMask). --- src/coreclr/jit/compiler.h | 5 +- src/coreclr/jit/gentree.cpp | 8 +- src/coreclr/jit/morph.cpp | 228 ++++++++++++------ src/coreclr/jit/simd.h | 100 +++++++- .../JIT/opt/SVE/PredicateInstructions.cs | 196 ++++++++++++++- 5 files changed, 447 insertions(+), 90 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9629ca8725dbc9..85a5f945b7f276 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3205,7 +3205,7 @@ class Compiler unsigned simdSize); #if defined(FEATURE_MASKED_HW_INTRINSICS) - GenTree* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); + GenTreeHWIntrinsic* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); #endif // FEATURE_MASKED_HW_INTRINSICS GenTree* gtNewSimdCreateBroadcastNode( @@ -6713,8 +6713,7 @@ class Compiler GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); #endif // FEATURE_MASKED_HW_INTRINSICS #ifdef TARGET_ARM64 - bool canMorphVectorOperandToMask(GenTree* node); - bool canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node); + void costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent); GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node); #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 75ddbddb0fcf24..18c19bc54a382d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21947,10 +21947,10 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, // Return Value: // The node converted to the a mask type // -GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize) +GenTreeHWIntrinsic* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize) { assert(varTypeIsMask(type)); assert(varTypeIsSIMD(op1)); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 1b15e34a7d8bf6..65face0edfc7a5 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9727,10 +9727,7 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n return node; } #elif defined(TARGET_ARM64) - // TODO-SVE: This optimisation is too naive. It needs to calculate the full cost of the instruction - // vs using the predicate version, taking into account all input arguements and all uses - // of the result. - // return fgMorphTryUseAllMaskVariant(node); + return fgMorphTryUseAllMaskVariant(node); #else #error Unsupported platform #endif @@ -9738,37 +9735,67 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n } #ifdef TARGET_ARM64 -//------------------------------------------------------------------------ -// canMorphVectorOperandToMask: Can this vector operand be converted to a -// node with type TYP_MASK easily? 
-// -bool Compiler::canMorphVectorOperandToMask(GenTree* node) -{ - return varTypeIsMask(node) || node->OperIsConvertMaskToVector() || node->IsVectorZero(); -} + +// Conversion of mask to vector is one instruction. +static constexpr const weight_t costOfConvertMaskToVector = 1.0; + +// Conversion of vector to mask is two instructions. +static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// canMorphAllVectorOperandsToMasks: Can all vector operands to this node -// be converted to a node with type -// TYP_MASK easily? +// costMorphVectorOperandToMask: Calculate the cost of the converting a node +// to a mask and the current cost. // -bool Compiler::canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node) +// Arguments: +// node - The node to convert to a mask +// parent - The parent of the node +// currentCost - (IN/OUT) incremented by the current cost of the node +// switchCost - (IN/OUT) incremented by the cost of switching the node to a mask +// +void Compiler::costMorphVectorOperandToMask(GenTree* node, + GenTreeHWIntrinsic* parent, + weight_t* currentCost, + weight_t* switchCost) { - bool allMaskConversions = true; - for (size_t i = 1; i <= node->GetOperandCount() && allMaskConversions; i++) + if (varTypeIsMask(node)) + { + // Node is already a mask + return; + } + + if (node->OperIsConvertMaskToVector()) + { + // Currently there is the cost of the convert + *currentCost += costOfConvertMaskToVector; + return; + } + + if (node->IsCnsVec()) { - allMaskConversions &= canMorphVectorOperandToMask(node->Op(i)); + // Check if the constant vector can be converted to a pattern during codegen + assert(node->TypeGet() == TYP_SIMD16); + var_types simdBaseType = parent->GetSimdBaseType(); + + if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + { + // Assume a constant vector is the same cost as a constant mask + return; + } } - return allMaskConversions; + // Switching will require wrapping in a convert vector to mask + *switchCost += costOfConvertVectorToMask; } //------------------------------------------------------------------------ -// doMorphVectorOperandToMask: Morph a vector node that is close to a mask -// node into a mask node. +// doMorphVectorOperandToMask: Morph a vector node into a mask node. +// +// Arguments: +// node - The node to convert to a mask +// parent - The parent of the node // // Return value: -// The morphed tree, or nullptr if the transform is not applicable. +// The morphed tree // GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent) { @@ -9777,76 +9804,139 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* // Already a mask, nothing to do. return node; } - else if (node->OperIsConvertMaskToVector()) + + if (node->OperIsConvertMaskToVector()) { // Replace node with op1. return node->AsHWIntrinsic()->Op(1); } - else if (node->IsVectorZero()) + + if (node->IsCnsVec()) { - // Morph the vector of zeroes into mask of zeroes. 
- GenTree* mask = gtNewSimdFalseMaskByteNode(); - mask->SetMorphed(this); - return mask; + // Morph the constant vector to mask + assert(node->TypeGet() == TYP_SIMD16); + var_types simdBaseType = parent->GetSimdBaseType(); + + if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, node->AsVecCon()->gtSimd16Val); + mskCon->SetMorphed(this); + return mskCon; + } } - return nullptr; + // Wrap in a convert vector to mask + CorInfoType simdBaseJitType = parent->GetSimdBaseJitType(); + unsigned simdSize = parent->GetSimdSize(); + GenTreeHWIntrinsic* cvt = gtNewSimdCvtVectorToMaskNode(TYP_MASK, node, simdBaseJitType, simdSize); + cvt->SetMorphed(this); + cvt->Op(1)->SetMorphed(this); + return cvt; } //----------------------------------------------------------------------------------------------------- // fgMorphTryUseAllMaskVariant: For NamedIntrinsics that have a variant where all operands are -// mask nodes. If all operands to this node are 'suggesting' that they -// originate closely from a mask, but are of vector types, then morph the -// operands as appropriate to use mask types instead. 'Suggesting' -// is defined by the canMorphVectorOperandToMask function. +// mask nodes. If the NamedIntrinsic is used by converting it to a mask +// then it can be replaced with the mask variant. Use simple costing to +// decide if the variant is preferred. +// +// Notes: +// It is not safe to convert to a mask and then back to vector again as infromation will be lost. +// Therefore only consider NamedIntrinsics that are used as masks. // // Arguments: -// tree - The HWIntrinsic to try and optimize. +// node - The conversion to mask of the HWIntrinsic to try and optimize. // // Return Value: // The fully morphed tree if a change was made, else nullptr. // GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node) { - if (HWIntrinsicInfo::HasAllMaskVariant(node->GetHWIntrinsicId())) + // We only care about vectors being used as masks. + if (!node->OperIsConvertVectorToMask()) { - NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(node->GetHWIntrinsicId()); + return nullptr; + } - // As some intrinsics have many variants, check that the count of operands on the node - // matches the number of operands required for the mask variant of the intrinsic. The mask - // variant of the intrinsic must have a fixed number of operands. - int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant); - assert(numArgs >= 0); - if (node->GetOperandCount() == (size_t)numArgs) - { - // We're sure it will work at this point, so perform the pattern match on operands. - if (canMorphAllVectorOperandsToMasks(node)) - { - switch (node->GetOperandCount()) - { - case 1: - node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node)); - break; - case 2: - node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node), - doMorphVectorOperandToMask(node->Op(2), node)); - break; - case 3: - node->ResetHWIntrinsicId(maskVariant, this, doMorphVectorOperandToMask(node->Op(1), node), - doMorphVectorOperandToMask(node->Op(2), node), - doMorphVectorOperandToMask(node->Op(3), node)); - break; - default: - unreached(); - } + // Check the converted node. Is there a masked variant. 
- node->gtType = TYP_MASK; - return node; - } - } + if (!node->Op(2)->OperIsHWIntrinsic()) + { + return nullptr; } - return nullptr; + GenTreeHWIntrinsic* convertedNode = node->Op(2)->AsHWIntrinsic(); + + if (!HWIntrinsicInfo::HasAllMaskVariant(convertedNode->GetHWIntrinsicId())) + { + return nullptr; + } + NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(convertedNode->GetHWIntrinsicId()); + + // As some intrinsics have many variants, check that the count of operands on the node + // matches the number of operands required for the mask variant of the intrinsic. The mask + // variant of the intrinsic must have a fixed number of operands. + int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant); + assert(numArgs >= 0); + if (convertedNode->GetOperandCount() != (size_t)numArgs) + { + return nullptr; + } + + weight_t currentCost = 0.0; + weight_t switchCost = 0.0; + + // Take into account the cost of the conversion of the result + currentCost += costOfConvertVectorToMask; + + // Take into account the cost of the children + for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) + { + costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, ¤tCost, &switchCost); + } + + JITDUMP("Attempting Mask variant morph for [%06u]. Current cost: %.2f, Conversion cost: %.2f. Will %smorph.\n", + dspTreeID(convertedNode), currentCost, switchCost, (switchCost > currentCost) ? "not " : ""); + + // If the costs are identical, then prefer morphed as it matches the output type. + if (switchCost > currentCost) + { + return nullptr; + } + + // Do the morph + switch (convertedNode->GetOperandCount()) + { + case 1: + convertedNode->ResetHWIntrinsicId(maskVariant, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode)); + + break; + case 2: + convertedNode->ResetHWIntrinsicId(maskVariant, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode)); + break; + case 3: + convertedNode->ResetHWIntrinsicId(maskVariant, this, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(3), convertedNode)); + break; + default: + unreached(); + } + + // Morph the new children. + for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) + { + convertedNode->Op(i) = fgMorphTree(convertedNode->Op(i)); + } + + convertedNode->gtType = TYP_MASK; + DEBUG_DESTROY_NODE(node); + return convertedNode; } #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9841bdeb38c93c..9c297347f2ece6 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1906,7 +1906,7 @@ SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0) memcpy(&mask, &arg0.u8[0], sizeof(uint64_t)); uint32_t finalOne = count; - // A mask pattern starts with zero of more 1s and then the rest of the mask is filled with 0s. + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -1993,6 +1993,104 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0) } } } + +template +SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) +{ + uint32_t count = sizeof(TSimd) / sizeof(TBase); + uint32_t finalOne = count; + + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. 
+ // This pattern is extracted using the least significant bits of the vector elements. + + // Find an unbroken sequence of 1s. + for (uint32_t i = 0; i < count; i++) + { + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); + + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (!isSet) + { + finalOne = i; + break; + } + } + + // Find an unbroken sequence of 0s. + for (uint32_t i = finalOne; i < count; i++) + { + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (isSet) + { + // Invalid sequence + return SveMaskPatternNone; + } + } + + if (finalOne == count) + { + return SveMaskPatternAll; + } + else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8) + { + return (SveMaskPattern)finalOne; + } + else + { + // TODO: Add other patterns as required. These probably won't be seen until we get + // to wider vector lengths. + return SveMaskPatternNone; + } +} + +template +SveMaskPattern EvaluateSimdVectorToPattern(var_types baseType, TSimd arg0) +{ + switch (baseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_BYTE: + case TYP_UBYTE: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + default: + { + unreached(); + } + } +} #endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs index b1336674f1638b..c5a1e75199c4a9 100644 --- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs @@ -17,66 +17,98 @@ public static void TestPredicateInstructions() { if (Sve.IsSupported) { + Vector vecs = Vector.Create(2); + Vector veci = Vector.Create(3); + Vector vecui = Vector.Create(5); + Vector vecl = Vector.Create(7); + ZipLow(); - ZipHigh(); + ZipHigh(vecui); UnzipOdd(); UnzipEven(); TransposeOdd(); - TransposeEven(); + TransposeEven(vecl); ReverseElement(); And(); BitwiseClear(); Xor(); Or(); - ConditionalSelect(); + ConditionalSelect(veci); + + ZipLowMask(); + ZipHighMask(vecui); + UnzipOddMask(); + UnzipEvenMask(); + TransposeOddMask(); + TransposeEvenMask(vecl); + ReverseElementMask(); + AndMask(); + BitwiseClearMask(); + XorMask(); + OrMask(); + ConditionalSelectMask(veci); + + UnzipEvenZipLowMask(); + TransposeEvenAndMask(vecs); + } } + // These should not use the predicate variants + [MethodImpl(MethodImplOptions.NoInlining)] static Vector ZipLow() { + //ARM64-FULL-LINE: zip1 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector ZipHigh() + static Vector ZipHigh(Vector v) { - return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32()); + //ARM64-FULL-LINE: zip2 {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s + return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v); } 
[MethodImpl(MethodImplOptions.NoInlining)] static Vector UnzipEven() { + //ARM64-FULL-LINE: uzp1 {{z[0-9]+}}.b, {{z[0-9]+}}.b, {{z[0-9]+}}.b return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector.Zero); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector UnzipOdd() { + //ARM64-FULL-LINE: uzp2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector TransposeEven() + static Vector TransposeEven(Vector v) { - return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64()); + //ARM64-FULL-LINE: trn1 {{z[0-9]+}}.d, {{z[0-9]+}}.d, {{z[0-9]+}}.d + return Sve.TransposeEven(v, Sve.CreateTrueMaskInt64()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector TransposeOdd() { + //ARM64-FULL-LINE: trn2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.TransposeOdd(Vector.Zero, Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector ReverseElement() { + //ARM64-FULL-LINE: rev {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ReverseElement(Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector And() { + //ARM64-FULL-LINE: and {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( Sve.CreateTrueMaskInt16(), Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), @@ -87,8 +119,9 @@ static Vector And() [MethodImpl(MethodImplOptions.NoInlining)] static Vector BitwiseClear() { + //ARM64-FULL-LINE: bic {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( - Sve.CreateFalseMaskInt16(), + Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1), Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), Vector.Zero ); @@ -97,8 +130,9 @@ static Vector BitwiseClear() [MethodImpl(MethodImplOptions.NoInlining)] static Vector Xor() { + //ARM64-FULL-LINE: eor {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s return Sve.ConditionalSelect( - Sve.CreateTrueMaskInt32(), + Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount3), Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()), Vector.Zero ); @@ -107,20 +141,156 @@ static Vector Xor() [MethodImpl(MethodImplOptions.NoInlining)] static Vector Or() { + //ARM64-FULL-LINE: orr {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( - Sve.CreateTrueMaskInt16(), + Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1), Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), Vector.Zero ); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector ConditionalSelect() + static Vector ConditionalSelect(Vector v) { + // Use a passed in vector for the mask to prevent optimising away the select + //ARM64-FULL-LINE: sel {{z[0-9]+}}.s, {{p[0-9]+}}, {{z[0-9]+}}.s, {{z[0-9]+}}.s return Sve.ConditionalSelect( - Vector.Zero, + v, Sve.CreateFalseMaskInt32(), Sve.CreateTrueMaskInt32() ); } -} \ No newline at end of file + + + // These should use the predicate variants. + // CreateBreakAfterMask is used as the first argument is a predicate. 
+ + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ZipLowMask() + { + //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ZipHighMask(Vector v) + { + //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s + return Sve.CreateBreakAfterMask(Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v), Sve.CreateTrueMaskUInt32()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipEvenMask() + { + //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector.Zero), Sve.CreateTrueMaskSByte()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipOddMask() + { + //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeEvenMask(Vector v) + { + //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d + return Sve.CreateBreakAfterMask(Sve.TransposeEven(v, Sve.CreateTrueMaskInt64()), Sve.CreateFalseMaskInt64()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeOddMask() + { + //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.TransposeOdd(Vector.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ReverseElementMask() + { + //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.ReverseElement(Sve.CreateTrueMaskInt16()), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector AndMask() + { + //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector BitwiseClearMask() + { + //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector XorMask() + { + //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(), + Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()), + Vector.Zero + ), Sve.CreateFalseMaskInt32()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector OrMask() + { + //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelectMask(Vector v) + { + // Use a passed in vector for the mask to prevent optimising 
away the select + //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + v, + Sve.CreateFalseMaskInt32(), + Sve.CreateTrueMaskInt32() + ), Sve.CreateFalseMaskInt32()); + } + + // These have multiple uses of the predicate variants + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipEvenZipLowMask() + { + //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()), Vector.Zero), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeEvenAndMask(Vector v) + { + //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.TransposeEven(Sve.CreateTrueMaskInt16(), Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.And(v, Sve.CreateTrueMaskInt16()), + Vector.Zero + )), Sve.CreateFalseMaskInt16()); + + } +} From 73dc29aba2db81922cf9f2d619c17c0306e8f2d5 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 11:37:30 +0100 Subject: [PATCH 2/7] Check significantBit in EvaluateSimdVectorToPattern() --- src/coreclr/jit/simd.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9c297347f2ece6..7c5740762971fa 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2000,22 +2000,21 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; + TBase significantBit = 1; + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. + // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element has its least significant bit set + // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) { TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (!isSet) + if ((input0 & significantBit) != 0) { finalOne = i; break; @@ -2025,13 +2024,10 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) // Find an unbroken sequence of 0s. for (uint32_t i = finalOne; i < count; i++) { - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. 
We set - // depending on if the corresponding input element - // has its least significant bit set + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (isSet) + if ((input0 & significantBit) != 0) { // Invalid sequence return SveMaskPatternNone; From 3ed275b19199e7bbf2d951f31dcb38b917cfd7b3 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 11:45:22 +0100 Subject: [PATCH 3/7] Rename costing variables --- src/coreclr/jit/morph.cpp | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 65face0edfc7a5..421c56a3bde9e4 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9736,26 +9736,26 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n #ifdef TARGET_ARM64 -// Conversion of mask to vector is one instruction. +// Conversion of mask to vector is one instruction (a mov) static constexpr const weight_t costOfConvertMaskToVector = 1.0; -// Conversion of vector to mask is two instructions. +// Conversion of vector to mask is two instructions (a compare and a ptrue) static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// costMorphVectorOperandToMask: Calculate the cost of the converting a node -// to a mask and the current cost. +// costMorphVectorOperandToMask: Calculate the cost of using the vector +// and mask variants // // Arguments: -// node - The node to convert to a mask -// parent - The parent of the node -// currentCost - (IN/OUT) incremented by the current cost of the node -// switchCost - (IN/OUT) incremented by the cost of switching the node to a mask +// node - The node to convert to a mask +// parent - The parent of the node +// vectorVariantCost - (IN/OUT) incremented by the cost of using the vector variant +// maskVariantCost - (IN/OUT) incremented by the cost of using the mask variant // void Compiler::costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, - weight_t* currentCost, - weight_t* switchCost) + weight_t* vectorVariantCost, + weight_t* maskVariantCost) { if (varTypeIsMask(node)) { @@ -9766,7 +9766,7 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, if (node->OperIsConvertMaskToVector()) { // Currently there is the cost of the convert - *currentCost += costOfConvertMaskToVector; + *vectorVariantCost += costOfConvertMaskToVector; return; } @@ -9783,8 +9783,8 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, } } - // Switching will require wrapping in a convert vector to mask - *switchCost += costOfConvertVectorToMask; + // mask variant will require wrapping in a convert vector to mask + *maskVariantCost += costOfConvertVectorToMask; } //------------------------------------------------------------------------ @@ -9884,23 +9884,23 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no return nullptr; } - weight_t currentCost = 0.0; - weight_t switchCost = 0.0; + weight_t vectorVariantCost = 0.0; + weight_t maskVariantCost = 0.0; // Take into account the cost of the conversion of the result - currentCost += costOfConvertVectorToMask; + vectorVariantCost += costOfConvertVectorToMask; // Take into account the cost of the children for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) { - costMorphVectorOperandToMask(convertedNode->Op(i), 
convertedNode, ¤tCost, &switchCost); + costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); } - JITDUMP("Attempting Mask variant morph for [%06u]. Current cost: %.2f, Conversion cost: %.2f. Will %smorph.\n", - dspTreeID(convertedNode), currentCost, switchCost, (switchCost > currentCost) ? "not " : ""); + JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. Will %smorph.\n", + dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, (maskVariantCost > vectorVariantCost) ? "not " : ""); // If the costs are identical, then prefer morphed as it matches the output type. - if (switchCost > currentCost) + if (maskVariantCost > vectorVariantCost) { return nullptr; } From e8aa98bc4f0543d64aaaefc26d17b735ffbfa80e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 12:26:38 +0100 Subject: [PATCH 4/7] fix set checks in EvaluateSimdVectorToPattern --- src/coreclr/jit/simd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 7c5740762971fa..c1d47536c1dcb9 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2014,7 +2014,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (!isSet) { finalOne = i; break; @@ -2027,7 +2028,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (isSet) { // Invalid sequence return SveMaskPatternNone; From b05e9e371268d12e9398e9a5302bbf63bb518c67 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 12:27:01 +0100 Subject: [PATCH 5/7] Add vectorZero check --- src/coreclr/jit/morph.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 421c56a3bde9e4..10b9f72625fc2d 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9776,7 +9776,8 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); - if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + if (node->IsVectorZero() || + SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { // Assume a constant vector is the same cost as a constant mask return; @@ -9817,7 +9818,15 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); - if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + if (node->IsVectorZero()) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + mskCon->gtSimdMaskVal = simdmask_t::Zero(); + mskCon->SetMorphed(this); + return mskCon; + } + else if (SveMaskPatternNone != + EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, node->AsVecCon()->gtSimd16Val); @@ -9885,7 +9894,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no } weight_t 
vectorVariantCost = 0.0; - weight_t maskVariantCost = 0.0; + weight_t maskVariantCost = 0.0; // Take into account the cost of the conversion of the result vectorVariantCost += costOfConvertVectorToMask; @@ -9897,7 +9906,8 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no } JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. Will %smorph.\n", - dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, (maskVariantCost > vectorVariantCost) ? "not " : ""); + dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, + (maskVariantCost > vectorVariantCost) ? "not " : ""); // If the costs are identical, then prefer morphed as it matches the output type. if (maskVariantCost > vectorVariantCost) From 303e3623ae0c50fa2ecfb0a277961c48f841c67d Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 16:57:53 +0100 Subject: [PATCH 6/7] rename costing function --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/morph.cpp | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 85a5f945b7f276..e5d1db90f2a271 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6713,7 +6713,7 @@ class Compiler GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); #endif // FEATURE_MASKED_HW_INTRINSICS #ifdef TARGET_ARM64 - void costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); + void getVectorMaskVariantWeights(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent); GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node); #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 10b9f72625fc2d..af7b4631f667c3 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9743,8 +9743,8 @@ static constexpr const weight_t costOfConvertMaskToVector = 1.0; static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// costMorphVectorOperandToMask: Calculate the cost of using the vector -// and mask variants +// getVectorMaskVariantWeights: Calculate the cost of using the vector +// and mask variants // // Arguments: // node - The node to convert to a mask @@ -9752,10 +9752,10 @@ static constexpr const weight_t costOfConvertVectorToMask = 2.0; // vectorVariantCost - (IN/OUT) incremented by the cost of using the vector variant // maskVariantCost - (IN/OUT) incremented by the cost of using the mask variant // -void Compiler::costMorphVectorOperandToMask(GenTree* node, - GenTreeHWIntrinsic* parent, - weight_t* vectorVariantCost, - weight_t* maskVariantCost) +void Compiler::getVectorMaskVariantWeights(GenTree* node, + GenTreeHWIntrinsic* parent, + weight_t* vectorVariantCost, + weight_t* maskVariantCost) { if (varTypeIsMask(node)) { @@ -9772,14 +9772,15 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, if (node->IsCnsVec()) { - // Check if the constant vector can be converted to a pattern during codegen + // If the constant vector can be converted to a mask pattern during codegen, then the morph is free. + // Otherwise it will cost one vector-to-mask conversion. 
+ assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); if (node->IsVectorZero() || SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { - // Assume a constant vector is the same cost as a constant mask return; } } @@ -9851,8 +9852,8 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* // decide if the variant is preferred. // // Notes: -// It is not safe to convert to a mask and then back to vector again as infromation will be lost. -// Therefore only consider NamedIntrinsics that are used as masks. +// It is not safe to convert to a mask and then back to vector again as information will be lost. +// Therefore only consider NamedIntrinsics that operate on masks. // // Arguments: // node - The conversion to mask of the HWIntrinsic to try and optimize. @@ -9868,7 +9869,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no return nullptr; } - // Check the converted node. Is there a masked variant. + // Check the converted node. Is there a masked variant? if (!node->Op(2)->OperIsHWIntrinsic()) { @@ -9881,6 +9882,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no { return nullptr; } + NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(convertedNode->GetHWIntrinsicId()); // As some intrinsics have many variants, check that the count of operands on the node @@ -9902,7 +9904,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no // Take into account the cost of the children for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) { - costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); + getVectorMaskVariantWeights(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); } JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. 
Will %smorph.\n", From 73cc62c5d9317ec85b8c13792c3339f3aa903ed1 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 17:10:08 +0100 Subject: [PATCH 7/7] Check all of a vector lane when converting to mask --- src/coreclr/jit/simd.h | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index c1d47536c1dcb9..a3e5b754f1fcea 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1598,9 +1598,8 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint64_t mask = 0; - TBase significantBit = 1; #if defined(TARGET_XARCH) - significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); + TBase significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); #endif for (uint32_t i = 0; i < count; i++) @@ -1608,25 +1607,24 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); +#if defined(TARGET_XARCH) + // For xarch we have count sequential bits to write depending on if the + // corresponding the input element has its most significant bit set if ((input0 & significantBit) != 0) { -#if defined(TARGET_XARCH) - // For xarch we have count sequential bits to write - // depending on if the corresponding the input element - // has its most significant bit set - mask |= static_cast(1) << i; + } #elif defined(TARGET_ARM64) - // For Arm64 we have count total bits to write, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - + // For Arm64 we have count total bits to write, but they are sizeof(TBase) + // bits apart. We set depending on if the corresponding input element has + // any bit set (this matches the use of cmpne in outputted assembly). + if (input0 != 0) + { mask |= static_cast(1) << (i * sizeof(TBase)); + } #else - unreached(); + unreached(); #endif - } } memcpy(&result->u8[0], &mask, sizeof(uint64_t)); @@ -2000,13 +1998,12 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; - TBase significantBit = 1; - // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element has its least significant bit set + // depending on if the corresponding input element has any bit set (this matches the use + // of cmpne in outputted assembly) // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -2014,7 +2011,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (!isSet) { finalOne = i; @@ -2028,7 +2025,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (isSet) { // Invalid sequence
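
The lane scan in EvaluateSimdVectorToPattern above reduces to a simple rule: a constant vector corresponds to an SVE predicate pattern only when some prefix of its lanes is non-zero and every later lane is zero. Below is a minimal standalone sketch of that check, not part of the patch itself; the array-based signature, the -1 "no pattern" result, and the main() driver are simplifications standing in for the JIT's simd16_t and SveMaskPattern types.

    #include <cstdint>
    #include <cstddef>

    // Returns -1 when the lanes do not form a valid pattern, otherwise the number of
    // leading "set" lanes (N meaning all lanes are set). A lane counts as set when any
    // of its bits are set, matching the cmpne-based conversion described in the patch.
    template <typename TBase, size_t N>
    int CountLeadingSetLanes(const TBase (&lanes)[N])
    {
        size_t leadingOnes = N;

        // A valid pattern is zero or more set lanes followed only by clear lanes.
        for (size_t i = 0; i < N; i++)
        {
            if (lanes[i] == 0)
            {
                leadingOnes = i;
                break;
            }
        }

        for (size_t i = leadingOnes; i < N; i++)
        {
            if (lanes[i] != 0)
            {
                return -1; // a set lane after a clear lane: not a pattern
            }
        }

        return (int)leadingOnes;
    }

    // {1, 1, 0, 0} in four 32-bit lanes maps to a "first two lanes" predicate,
    // analogous to SVE's VL2 pattern.
    int main()
    {
        uint32_t lanes[4] = {1, 1, 0, 0};
        return (CountLeadingSetLanes(lanes) == 2) ? 0 : 1;
    }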