From 9411ba56f680d5b6678a2e50b50d2a218d3f6f94 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 18 Jun 2025 13:25:01 +0100 Subject: [PATCH 1/7] Arm64 SVE: re-enable use of predicate variants Fixes #101970 in #115566 Adds a simple costing to fgMorphTryUseAllMaskVariant() and assumes nodes can always be converted to masks (using ConvertVectorToMask). --- src/coreclr/jit/compiler.h | 5 +- src/coreclr/jit/gentree.cpp | 8 +- src/coreclr/jit/morph.cpp | 228 ++++++++++++------ src/coreclr/jit/simd.h | 100 +++++++- .../JIT/opt/SVE/PredicateInstructions.cs | 196 ++++++++++++++- 5 files changed, 447 insertions(+), 90 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9629ca8725dbc9..85a5f945b7f276 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3205,7 +3205,7 @@ class Compiler unsigned simdSize); #if defined(FEATURE_MASKED_HW_INTRINSICS) - GenTree* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); + GenTreeHWIntrinsic* gtNewSimdCvtVectorToMaskNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); #endif // FEATURE_MASKED_HW_INTRINSICS GenTree* gtNewSimdCreateBroadcastNode( @@ -6713,8 +6713,7 @@ class Compiler GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); #endif // FEATURE_MASKED_HW_INTRINSICS #ifdef TARGET_ARM64 - bool canMorphVectorOperandToMask(GenTree* node); - bool canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node); + void costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent); GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node); #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 75ddbddb0fcf24..18c19bc54a382d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21947,10 +21947,10 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, // Return Value: // The node converted to the a mask type // -GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize) +GenTreeHWIntrinsic* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize) { assert(varTypeIsMask(type)); assert(varTypeIsSIMD(op1)); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 1b15e34a7d8bf6..65face0edfc7a5 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9727,10 +9727,7 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n return node; } #elif defined(TARGET_ARM64) - // TODO-SVE: This optimisation is too naive. It needs to calculate the full cost of the instruction - // vs using the predicate version, taking into account all input arguements and all uses - // of the result. - // return fgMorphTryUseAllMaskVariant(node); + return fgMorphTryUseAllMaskVariant(node); #else #error Unsupported platform #endif @@ -9738,37 +9735,67 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n } #ifdef TARGET_ARM64 -//------------------------------------------------------------------------ -// canMorphVectorOperandToMask: Can this vector operand be converted to a -// node with type TYP_MASK easily? 
-// -bool Compiler::canMorphVectorOperandToMask(GenTree* node) -{ - return varTypeIsMask(node) || node->OperIsConvertMaskToVector() || node->IsVectorZero(); -} + +// Conversion of mask to vector is one instruction. +static constexpr const weight_t costOfConvertMaskToVector = 1.0; + +// Conversion of vector to mask is two instructions. +static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// canMorphAllVectorOperandsToMasks: Can all vector operands to this node -// be converted to a node with type -// TYP_MASK easily? +// costMorphVectorOperandToMask: Calculate the cost of the converting a node +// to a mask and the current cost. // -bool Compiler::canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node) +// Arguments: +// node - The node to convert to a mask +// parent - The parent of the node +// currentCost - (IN/OUT) incremented by the current cost of the node +// switchCost - (IN/OUT) incremented by the cost of switching the node to a mask +// +void Compiler::costMorphVectorOperandToMask(GenTree* node, + GenTreeHWIntrinsic* parent, + weight_t* currentCost, + weight_t* switchCost) { - bool allMaskConversions = true; - for (size_t i = 1; i <= node->GetOperandCount() && allMaskConversions; i++) + if (varTypeIsMask(node)) + { + // Node is already a mask + return; + } + + if (node->OperIsConvertMaskToVector()) + { + // Currently there is the cost of the convert + *currentCost += costOfConvertMaskToVector; + return; + } + + if (node->IsCnsVec()) { - allMaskConversions &= canMorphVectorOperandToMask(node->Op(i)); + // Check if the constant vector can be converted to a pattern during codegen + assert(node->TypeGet() == TYP_SIMD16); + var_types simdBaseType = parent->GetSimdBaseType(); + + if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + { + // Assume a constant vector is the same cost as a constant mask + return; + } } - return allMaskConversions; + // Switching will require wrapping in a convert vector to mask + *switchCost += costOfConvertVectorToMask; } //------------------------------------------------------------------------ -// doMorphVectorOperandToMask: Morph a vector node that is close to a mask -// node into a mask node. +// doMorphVectorOperandToMask: Morph a vector node into a mask node. +// +// Arguments: +// node - The node to convert to a mask +// parent - The parent of the node // // Return value: -// The morphed tree, or nullptr if the transform is not applicable. +// The morphed tree // GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent) { @@ -9777,76 +9804,139 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* // Already a mask, nothing to do. return node; } - else if (node->OperIsConvertMaskToVector()) + + if (node->OperIsConvertMaskToVector()) { // Replace node with op1. return node->AsHWIntrinsic()->Op(1); } - else if (node->IsVectorZero()) + + if (node->IsCnsVec()) { - // Morph the vector of zeroes into mask of zeroes. 
- GenTree* mask = gtNewSimdFalseMaskByteNode(); - mask->SetMorphed(this); - return mask; + // Morph the constant vector to mask + assert(node->TypeGet() == TYP_SIMD16); + var_types simdBaseType = parent->GetSimdBaseType(); + + if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, node->AsVecCon()->gtSimd16Val); + mskCon->SetMorphed(this); + return mskCon; + } } - return nullptr; + // Wrap in a convert vector to mask + CorInfoType simdBaseJitType = parent->GetSimdBaseJitType(); + unsigned simdSize = parent->GetSimdSize(); + GenTreeHWIntrinsic* cvt = gtNewSimdCvtVectorToMaskNode(TYP_MASK, node, simdBaseJitType, simdSize); + cvt->SetMorphed(this); + cvt->Op(1)->SetMorphed(this); + return cvt; } //----------------------------------------------------------------------------------------------------- // fgMorphTryUseAllMaskVariant: For NamedIntrinsics that have a variant where all operands are -// mask nodes. If all operands to this node are 'suggesting' that they -// originate closely from a mask, but are of vector types, then morph the -// operands as appropriate to use mask types instead. 'Suggesting' -// is defined by the canMorphVectorOperandToMask function. +// mask nodes. If the NamedIntrinsic is used by converting it to a mask +// then it can be replaced with the mask variant. Use simple costing to +// decide if the variant is preferred. +// +// Notes: +// It is not safe to convert to a mask and then back to vector again as infromation will be lost. +// Therefore only consider NamedIntrinsics that are used as masks. // // Arguments: -// tree - The HWIntrinsic to try and optimize. +// node - The conversion to mask of the HWIntrinsic to try and optimize. // // Return Value: // The fully morphed tree if a change was made, else nullptr. // GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node) { - if (HWIntrinsicInfo::HasAllMaskVariant(node->GetHWIntrinsicId())) + // We only care about vectors being used as masks. + if (!node->OperIsConvertVectorToMask()) { - NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(node->GetHWIntrinsicId()); + return nullptr; + } - // As some intrinsics have many variants, check that the count of operands on the node - // matches the number of operands required for the mask variant of the intrinsic. The mask - // variant of the intrinsic must have a fixed number of operands. - int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant); - assert(numArgs >= 0); - if (node->GetOperandCount() == (size_t)numArgs) - { - // We're sure it will work at this point, so perform the pattern match on operands. - if (canMorphAllVectorOperandsToMasks(node)) - { - switch (node->GetOperandCount()) - { - case 1: - node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node)); - break; - case 2: - node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node), - doMorphVectorOperandToMask(node->Op(2), node)); - break; - case 3: - node->ResetHWIntrinsicId(maskVariant, this, doMorphVectorOperandToMask(node->Op(1), node), - doMorphVectorOperandToMask(node->Op(2), node), - doMorphVectorOperandToMask(node->Op(3), node)); - break; - default: - unreached(); - } + // Check the converted node. Is there a masked variant. 
- node->gtType = TYP_MASK; - return node; - } - } + if (!node->Op(2)->OperIsHWIntrinsic()) + { + return nullptr; } - return nullptr; + GenTreeHWIntrinsic* convertedNode = node->Op(2)->AsHWIntrinsic(); + + if (!HWIntrinsicInfo::HasAllMaskVariant(convertedNode->GetHWIntrinsicId())) + { + return nullptr; + } + NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(convertedNode->GetHWIntrinsicId()); + + // As some intrinsics have many variants, check that the count of operands on the node + // matches the number of operands required for the mask variant of the intrinsic. The mask + // variant of the intrinsic must have a fixed number of operands. + int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant); + assert(numArgs >= 0); + if (convertedNode->GetOperandCount() != (size_t)numArgs) + { + return nullptr; + } + + weight_t currentCost = 0.0; + weight_t switchCost = 0.0; + + // Take into account the cost of the conversion of the result + currentCost += costOfConvertVectorToMask; + + // Take into account the cost of the children + for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) + { + costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, ¤tCost, &switchCost); + } + + JITDUMP("Attempting Mask variant morph for [%06u]. Current cost: %.2f, Conversion cost: %.2f. Will %smorph.\n", + dspTreeID(convertedNode), currentCost, switchCost, (switchCost > currentCost) ? "not " : ""); + + // If the costs are identical, then prefer morphed as it matches the output type. + if (switchCost > currentCost) + { + return nullptr; + } + + // Do the morph + switch (convertedNode->GetOperandCount()) + { + case 1: + convertedNode->ResetHWIntrinsicId(maskVariant, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode)); + + break; + case 2: + convertedNode->ResetHWIntrinsicId(maskVariant, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode)); + break; + case 3: + convertedNode->ResetHWIntrinsicId(maskVariant, this, + doMorphVectorOperandToMask(convertedNode->Op(1), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(2), convertedNode), + doMorphVectorOperandToMask(convertedNode->Op(3), convertedNode)); + break; + default: + unreached(); + } + + // Morph the new children. + for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) + { + convertedNode->Op(i) = fgMorphTree(convertedNode->Op(i)); + } + + convertedNode->gtType = TYP_MASK; + DEBUG_DESTROY_NODE(node); + return convertedNode; } #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9841bdeb38c93c..9c297347f2ece6 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1906,7 +1906,7 @@ SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0) memcpy(&mask, &arg0.u8[0], sizeof(uint64_t)); uint32_t finalOne = count; - // A mask pattern starts with zero of more 1s and then the rest of the mask is filled with 0s. + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -1993,6 +1993,104 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0) } } } + +template +SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) +{ + uint32_t count = sizeof(TSimd) / sizeof(TBase); + uint32_t finalOne = count; + + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. 
+ // This pattern is extracted using the least significant bits of the vector elements. + + // Find an unbroken sequence of 1s. + for (uint32_t i = 0; i < count; i++) + { + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); + + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (!isSet) + { + finalOne = i; + break; + } + } + + // Find an unbroken sequence of 0s. + for (uint32_t i = finalOne; i < count; i++) + { + // For Arm64 we have count total bits to read, but + // they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element + // has its least significant bit set + + bool isSet = static_cast(1) << (i * sizeof(TBase)); + if (isSet) + { + // Invalid sequence + return SveMaskPatternNone; + } + } + + if (finalOne == count) + { + return SveMaskPatternAll; + } + else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8) + { + return (SveMaskPattern)finalOne; + } + else + { + // TODO: Add other patterns as required. These probably won't be seen until we get + // to wider vector lengths. + return SveMaskPatternNone; + } +} + +template +SveMaskPattern EvaluateSimdVectorToPattern(var_types baseType, TSimd arg0) +{ + switch (baseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_BYTE: + case TYP_UBYTE: + { + return EvaluateSimdVectorToPattern(arg0); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return EvaluateSimdVectorToPattern(arg0); + } + + default: + { + unreached(); + } + } +} #endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs index b1336674f1638b..c5a1e75199c4a9 100644 --- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs @@ -17,66 +17,98 @@ public static void TestPredicateInstructions() { if (Sve.IsSupported) { + Vector vecs = Vector.Create(2); + Vector veci = Vector.Create(3); + Vector vecui = Vector.Create(5); + Vector vecl = Vector.Create(7); + ZipLow(); - ZipHigh(); + ZipHigh(vecui); UnzipOdd(); UnzipEven(); TransposeOdd(); - TransposeEven(); + TransposeEven(vecl); ReverseElement(); And(); BitwiseClear(); Xor(); Or(); - ConditionalSelect(); + ConditionalSelect(veci); + + ZipLowMask(); + ZipHighMask(vecui); + UnzipOddMask(); + UnzipEvenMask(); + TransposeOddMask(); + TransposeEvenMask(vecl); + ReverseElementMask(); + AndMask(); + BitwiseClearMask(); + XorMask(); + OrMask(); + ConditionalSelectMask(veci); + + UnzipEvenZipLowMask(); + TransposeEvenAndMask(vecs); + } } + // These should not use the predicate variants + [MethodImpl(MethodImplOptions.NoInlining)] static Vector ZipLow() { + //ARM64-FULL-LINE: zip1 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector ZipHigh() + static Vector ZipHigh(Vector v) { - return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32()); + //ARM64-FULL-LINE: zip2 {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s + return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v); } 
[MethodImpl(MethodImplOptions.NoInlining)] static Vector UnzipEven() { + //ARM64-FULL-LINE: uzp1 {{z[0-9]+}}.b, {{z[0-9]+}}.b, {{z[0-9]+}}.b return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector.Zero); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector UnzipOdd() { + //ARM64-FULL-LINE: uzp2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector TransposeEven() + static Vector TransposeEven(Vector v) { - return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64()); + //ARM64-FULL-LINE: trn1 {{z[0-9]+}}.d, {{z[0-9]+}}.d, {{z[0-9]+}}.d + return Sve.TransposeEven(v, Sve.CreateTrueMaskInt64()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector TransposeOdd() { + //ARM64-FULL-LINE: trn2 {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.TransposeOdd(Vector.Zero, Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector ReverseElement() { + //ARM64-FULL-LINE: rev {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ReverseElement(Sve.CreateTrueMaskInt16()); } [MethodImpl(MethodImplOptions.NoInlining)] static Vector And() { + //ARM64-FULL-LINE: and {{z[0-9]+}}.h, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( Sve.CreateTrueMaskInt16(), Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), @@ -87,8 +119,9 @@ static Vector And() [MethodImpl(MethodImplOptions.NoInlining)] static Vector BitwiseClear() { + //ARM64-FULL-LINE: bic {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( - Sve.CreateFalseMaskInt16(), + Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1), Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), Vector.Zero ); @@ -97,8 +130,9 @@ static Vector BitwiseClear() [MethodImpl(MethodImplOptions.NoInlining)] static Vector Xor() { + //ARM64-FULL-LINE: eor {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s return Sve.ConditionalSelect( - Sve.CreateTrueMaskInt32(), + Sve.CreateTrueMaskInt32(SveMaskPattern.VectorCount3), Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()), Vector.Zero ); @@ -107,20 +141,156 @@ static Vector Xor() [MethodImpl(MethodImplOptions.NoInlining)] static Vector Or() { + //ARM64-FULL-LINE: orr {{z[0-9]+}}.h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h return Sve.ConditionalSelect( - Sve.CreateTrueMaskInt16(), + Sve.CreateTrueMaskInt16(SveMaskPattern.VectorCount1), Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), Vector.Zero ); } [MethodImpl(MethodImplOptions.NoInlining)] - static Vector ConditionalSelect() + static Vector ConditionalSelect(Vector v) { + // Use a passed in vector for the mask to prevent optimising away the select + //ARM64-FULL-LINE: sel {{z[0-9]+}}.s, {{p[0-9]+}}, {{z[0-9]+}}.s, {{z[0-9]+}}.s return Sve.ConditionalSelect( - Vector.Zero, + v, Sve.CreateFalseMaskInt32(), Sve.CreateTrueMaskInt32() ); } -} \ No newline at end of file + + + // These should use the predicate variants. + // CreateBreakAfterMask is used as the first argument is a predicate. 
+ + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ZipLowMask() + { + //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ZipHighMask(Vector v) + { + //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s + return Sve.CreateBreakAfterMask(Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), v), Sve.CreateTrueMaskUInt32()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipEvenMask() + { + //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector.Zero), Sve.CreateTrueMaskSByte()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipOddMask() + { + //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeEvenMask(Vector v) + { + //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d + return Sve.CreateBreakAfterMask(Sve.TransposeEven(v, Sve.CreateTrueMaskInt64()), Sve.CreateFalseMaskInt64()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeOddMask() + { + //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.TransposeOdd(Vector.Zero, Sve.CreateTrueMaskInt16()), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ReverseElementMask() + { + //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.ReverseElement(Sve.CreateTrueMaskInt16()), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector AndMask() + { + //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector BitwiseClearMask() + { + //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector XorMask() + { + //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(), + Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()), + Vector.Zero + ), Sve.CreateFalseMaskInt32()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector OrMask() + { + //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector.Zero + ), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector ConditionalSelectMask(Vector v) + { + // Use a passed in vector for the mask to prevent optimising 
away the select + //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.CreateBreakAfterMask(Sve.ConditionalSelect( + v, + Sve.CreateFalseMaskInt32(), + Sve.CreateTrueMaskInt32() + ), Sve.CreateFalseMaskInt32()); + } + + // These have multiple uses of the predicate variants + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector UnzipEvenZipLowMask() + { + //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.UnzipEven(Sve.ZipLow(Vector.Zero, Sve.CreateTrueMaskInt16()), Vector.Zero), Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector TransposeEvenAndMask(Vector v) + { + //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.CreateBreakAfterMask(Sve.TransposeEven(Sve.CreateTrueMaskInt16(), Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.And(v, Sve.CreateTrueMaskInt16()), + Vector.Zero + )), Sve.CreateFalseMaskInt16()); + + } +} From 73dc29aba2db81922cf9f2d619c17c0306e8f2d5 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 11:37:30 +0100 Subject: [PATCH 2/7] Check significantBit in EvaluateSimdVectorToPattern() --- src/coreclr/jit/simd.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9c297347f2ece6..7c5740762971fa 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2000,22 +2000,21 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; + TBase significantBit = 1; + // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. + // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set + // depending on if the corresponding input element has its least significant bit set + // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) { TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (!isSet) + if ((input0 & significantBit) != 0) { finalOne = i; break; @@ -2025,13 +2024,10 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) // Find an unbroken sequence of 0s. for (uint32_t i = finalOne; i < count; i++) { - // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. 
We set - // depending on if the corresponding input element - // has its least significant bit set + TBase input0; + memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = static_cast(1) << (i * sizeof(TBase)); - if (isSet) + if ((input0 & significantBit) != 0) { // Invalid sequence return SveMaskPatternNone; From 3ed275b19199e7bbf2d951f31dcb38b917cfd7b3 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 11:45:22 +0100 Subject: [PATCH 3/7] Rename costing variables --- src/coreclr/jit/morph.cpp | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 65face0edfc7a5..421c56a3bde9e4 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9736,26 +9736,26 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n #ifdef TARGET_ARM64 -// Conversion of mask to vector is one instruction. +// Conversion of mask to vector is one instruction (a mov) static constexpr const weight_t costOfConvertMaskToVector = 1.0; -// Conversion of vector to mask is two instructions. +// Conversion of vector to mask is two instructions (a compare and a ptrue) static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// costMorphVectorOperandToMask: Calculate the cost of the converting a node -// to a mask and the current cost. +// costMorphVectorOperandToMask: Calculate the cost of using the vector +// and mask variants // // Arguments: -// node - The node to convert to a mask -// parent - The parent of the node -// currentCost - (IN/OUT) incremented by the current cost of the node -// switchCost - (IN/OUT) incremented by the cost of switching the node to a mask +// node - The node to convert to a mask +// parent - The parent of the node +// vectorVariantCost - (IN/OUT) incremented by the cost of using the vector variant +// maskVariantCost - (IN/OUT) incremented by the cost of using the mask variant // void Compiler::costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, - weight_t* currentCost, - weight_t* switchCost) + weight_t* vectorVariantCost, + weight_t* maskVariantCost) { if (varTypeIsMask(node)) { @@ -9766,7 +9766,7 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, if (node->OperIsConvertMaskToVector()) { // Currently there is the cost of the convert - *currentCost += costOfConvertMaskToVector; + *vectorVariantCost += costOfConvertMaskToVector; return; } @@ -9783,8 +9783,8 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, } } - // Switching will require wrapping in a convert vector to mask - *switchCost += costOfConvertVectorToMask; + // mask variant will require wrapping in a convert vector to mask + *maskVariantCost += costOfConvertVectorToMask; } //------------------------------------------------------------------------ @@ -9884,23 +9884,23 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no return nullptr; } - weight_t currentCost = 0.0; - weight_t switchCost = 0.0; + weight_t vectorVariantCost = 0.0; + weight_t maskVariantCost = 0.0; // Take into account the cost of the conversion of the result - currentCost += costOfConvertVectorToMask; + vectorVariantCost += costOfConvertVectorToMask; // Take into account the cost of the children for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) { - costMorphVectorOperandToMask(convertedNode->Op(i), 
convertedNode, ¤tCost, &switchCost); + costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); } - JITDUMP("Attempting Mask variant morph for [%06u]. Current cost: %.2f, Conversion cost: %.2f. Will %smorph.\n", - dspTreeID(convertedNode), currentCost, switchCost, (switchCost > currentCost) ? "not " : ""); + JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. Will %smorph.\n", + dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, (maskVariantCost > vectorVariantCost) ? "not " : ""); // If the costs are identical, then prefer morphed as it matches the output type. - if (switchCost > currentCost) + if (maskVariantCost > vectorVariantCost) { return nullptr; } From e8aa98bc4f0543d64aaaefc26d17b735ffbfa80e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 12:26:38 +0100 Subject: [PATCH 4/7] fix set checks in EvaluateSimdVectorToPattern --- src/coreclr/jit/simd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 7c5740762971fa..c1d47536c1dcb9 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -2014,7 +2014,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (!isSet) { finalOne = i; break; @@ -2027,7 +2028,8 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - if ((input0 & significantBit) != 0) + bool isSet = (input0 & significantBit) != 0; + if (isSet) { // Invalid sequence return SveMaskPatternNone; From b05e9e371268d12e9398e9a5302bbf63bb518c67 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 12:27:01 +0100 Subject: [PATCH 5/7] Add vectorZero check --- src/coreclr/jit/morph.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 421c56a3bde9e4..10b9f72625fc2d 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9776,7 +9776,8 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); - if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + if (node->IsVectorZero() || + SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { // Assume a constant vector is the same cost as a constant mask return; @@ -9817,7 +9818,15 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); - if (SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) + if (node->IsVectorZero()) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + mskCon->gtSimdMaskVal = simdmask_t::Zero(); + mskCon->SetMorphed(this); + return mskCon; + } + else if (SveMaskPatternNone != + EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, node->AsVecCon()->gtSimd16Val); @@ -9885,7 +9894,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no } weight_t 
vectorVariantCost = 0.0; - weight_t maskVariantCost = 0.0; + weight_t maskVariantCost = 0.0; // Take into account the cost of the conversion of the result vectorVariantCost += costOfConvertVectorToMask; @@ -9897,7 +9906,8 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no } JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. Will %smorph.\n", - dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, (maskVariantCost > vectorVariantCost) ? "not " : ""); + dspTreeID(convertedNode), vectorVariantCost, maskVariantCost, + (maskVariantCost > vectorVariantCost) ? "not " : ""); // If the costs are identical, then prefer morphed as it matches the output type. if (maskVariantCost > vectorVariantCost) From 303e3623ae0c50fa2ecfb0a277961c48f841c67d Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 16:57:53 +0100 Subject: [PATCH 6/7] rename costing function --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/morph.cpp | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 85a5f945b7f276..e5d1db90f2a271 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6713,7 +6713,7 @@ class Compiler GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); #endif // FEATURE_MASKED_HW_INTRINSICS #ifdef TARGET_ARM64 - void costMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); + void getVectorMaskVariantWeights(GenTree* node, GenTreeHWIntrinsic* parent, weight_t *currentCost, weight_t *switchCost); GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent); GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node); #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 10b9f72625fc2d..af7b4631f667c3 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9743,8 +9743,8 @@ static constexpr const weight_t costOfConvertMaskToVector = 1.0; static constexpr const weight_t costOfConvertVectorToMask = 2.0; //------------------------------------------------------------------------ -// costMorphVectorOperandToMask: Calculate the cost of using the vector -// and mask variants +// getVectorMaskVariantWeights: Calculate the cost of using the vector +// and mask variants // // Arguments: // node - The node to convert to a mask @@ -9752,10 +9752,10 @@ static constexpr const weight_t costOfConvertVectorToMask = 2.0; // vectorVariantCost - (IN/OUT) incremented by the cost of using the vector variant // maskVariantCost - (IN/OUT) incremented by the cost of using the mask variant // -void Compiler::costMorphVectorOperandToMask(GenTree* node, - GenTreeHWIntrinsic* parent, - weight_t* vectorVariantCost, - weight_t* maskVariantCost) +void Compiler::getVectorMaskVariantWeights(GenTree* node, + GenTreeHWIntrinsic* parent, + weight_t* vectorVariantCost, + weight_t* maskVariantCost) { if (varTypeIsMask(node)) { @@ -9772,14 +9772,15 @@ void Compiler::costMorphVectorOperandToMask(GenTree* node, if (node->IsCnsVec()) { - // Check if the constant vector can be converted to a pattern during codegen + // If the constant vector can be converted to a mask pattern during codegen, then the morph is free. + // Otherwise it will cost one vector-to-mask conversion. 
+ assert(node->TypeGet() == TYP_SIMD16); var_types simdBaseType = parent->GetSimdBaseType(); if (node->IsVectorZero() || SveMaskPatternNone != EvaluateSimdVectorToPattern(simdBaseType, node->AsVecCon()->gtSimd16Val)) { - // Assume a constant vector is the same cost as a constant mask return; } } @@ -9851,8 +9852,8 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* // decide if the variant is preferred. // // Notes: -// It is not safe to convert to a mask and then back to vector again as infromation will be lost. -// Therefore only consider NamedIntrinsics that are used as masks. +// It is not safe to convert to a mask and then back to vector again as information will be lost. +// Therefore only consider NamedIntrinsics that operate on masks. // // Arguments: // node - The conversion to mask of the HWIntrinsic to try and optimize. @@ -9868,7 +9869,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no return nullptr; } - // Check the converted node. Is there a masked variant. + // Check the converted node. Is there a masked variant? if (!node->Op(2)->OperIsHWIntrinsic()) { @@ -9881,6 +9882,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no { return nullptr; } + NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(convertedNode->GetHWIntrinsicId()); // As some intrinsics have many variants, check that the count of operands on the node @@ -9902,7 +9904,7 @@ GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* no // Take into account the cost of the children for (size_t i = 1; i <= convertedNode->GetOperandCount(); i++) { - costMorphVectorOperandToMask(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); + getVectorMaskVariantWeights(convertedNode->Op(i), convertedNode, &vectorVariantCost, &maskVariantCost); } JITDUMP("Attempting Mask variant morph for [%06u]. Vector cost: %.2f, Mask cost: %.2f. 
Will %smorph.\n", From 73cc62c5d9317ec85b8c13792c3339f3aa903ed1 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Jun 2025 17:10:08 +0100 Subject: [PATCH 7/7] Check all of a vector lane when converting to mask --- src/coreclr/jit/simd.h | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index c1d47536c1dcb9..a3e5b754f1fcea 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1598,9 +1598,8 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint64_t mask = 0; - TBase significantBit = 1; #if defined(TARGET_XARCH) - significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); + TBase significantBit = static_cast(1) << ((sizeof(TBase) * 8) - 1); #endif for (uint32_t i = 0; i < count; i++) @@ -1608,25 +1607,24 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); +#if defined(TARGET_XARCH) + // For xarch we have count sequential bits to write depending on if the + // corresponding the input element has its most significant bit set if ((input0 & significantBit) != 0) { -#if defined(TARGET_XARCH) - // For xarch we have count sequential bits to write - // depending on if the corresponding the input element - // has its most significant bit set - mask |= static_cast(1) << i; + } #elif defined(TARGET_ARM64) - // For Arm64 we have count total bits to write, but - // they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element - // has its least significant bit set - + // For Arm64 we have count total bits to write, but they are sizeof(TBase) + // bits apart. We set depending on if the corresponding input element has + // any bit set (this matches the use of cmpne in outputted assembly). + if (input0 != 0) + { mask |= static_cast(1) << (i * sizeof(TBase)); + } #else - unreached(); + unreached(); #endif - } } memcpy(&result->u8[0], &mask, sizeof(uint64_t)); @@ -2000,13 +1998,12 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count; - TBase significantBit = 1; - // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s. // This pattern is extracted using the least significant bits of the vector elements. // For Arm64 we have count total bits to read, but they are sizeof(TBase) bits apart. We set - // depending on if the corresponding input element has its least significant bit set + // depending on if the corresponding input element has any bit set (this matches the use + // of cmpne in outputted assembly) // Find an unbroken sequence of 1s. for (uint32_t i = 0; i < count; i++) @@ -2014,7 +2011,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (!isSet) { finalOne = i; @@ -2028,7 +2025,7 @@ SveMaskPattern EvaluateSimdVectorToPattern(TSimd arg0) TBase input0; memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase)); - bool isSet = (input0 & significantBit) != 0; + bool isSet = input0 != 0; if (isSet) { // Invalid sequence
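
The lane scan in EvaluateSimdVectorToPattern above reduces to a simple rule: a constant vector corresponds to an SVE predicate pattern only when some prefix of its lanes is non-zero and every later lane is zero. Below is a minimal standalone sketch of that check, not part of the patch itself; the array-based signature, the -1 "no pattern" result, and the main() driver are simplifications standing in for the JIT's simd16_t and SveMaskPattern types.

    #include <cstdint>
    #include <cstddef>

    // Returns -1 when the lanes do not form a valid pattern, otherwise the number of
    // leading "set" lanes (N meaning all lanes are set). A lane counts as set when any
    // of its bits are set, matching the cmpne-based conversion described in the patch.
    template <typename TBase, size_t N>
    int CountLeadingSetLanes(const TBase (&lanes)[N])
    {
        size_t leadingOnes = N;

        // A valid pattern is zero or more set lanes followed only by clear lanes.
        for (size_t i = 0; i < N; i++)
        {
            if (lanes[i] == 0)
            {
                leadingOnes = i;
                break;
            }
        }

        for (size_t i = leadingOnes; i < N; i++)
        {
            if (lanes[i] != 0)
            {
                return -1; // a set lane after a clear lane: not a pattern
            }
        }

        return (int)leadingOnes;
    }

    // {1, 1, 0, 0} in four 32-bit lanes maps to a "first two lanes" predicate,
    // analogous to SVE's VL2 pattern.
    int main()
    {
        uint32_t lanes[4] = {1, 1, 0, 0};
        return (CountLeadingSetLanes(lanes) == 2) ? 0 : 1;
    }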