@@ -32083,10 +32083,8 @@ bool GenTree::CanDivOrModPossiblyOverflow(Compiler* comp) const
3208332083#if defined(FEATURE_HW_INTRINSICS)
3208432084GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3208532085{
32086-     if (!opts.Tier0OptimizationEnabled())
32087-     {
32088-         return tree;
32089-     }
32086+     assert(!optValnumCSE_phase);
32087+     assert(opts.Tier0OptimizationEnabled());
3209032088
3209132089    NamedIntrinsic ni              = tree->GetHWIntrinsicId();
3209232090    var_types      retType         = tree->TypeGet();
@@ -32225,6 +32223,133 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3222532223    // We shouldn't find AND_NOT nodes since it should only be produced in lowering
3222632224    assert(oper != GT_AND_NOT);
3222732225
#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_XARCH)
    // Comparisons that produce masks lead to more verbose trees than
    // necessary in many scenarios due to requiring a CvtMaskToVector
    // node to be inserted over them and this can block various opts
    // that are dependent on tree height and similar. So we want to
    // fold the unnecessary back and forth conversions away where possible.
    //
    // Shapes recognized (every mask involved must have the same element size as the node):
    //   * op(CvtMaskToVector(m1), CvtMaskToVector(m2)) => CvtMaskToVector(opMask(m1, m2))
    //   * ~CvtMaskToVector(m1)                         => CvtMaskToVector(NotMask(m1))
    //   * CvtMaskToVector(m1) ^ AllBitsSet             => CvtMaskToVector(NotMask(m1))
    //
    // Any other shape must fall through, rather than return, so that the general
    // folding that follows this block still runs for ordinary vector bitwise nodes.

    if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper) && op1->OperIsConvertMaskToVector())
    {
        genTreeOps effectiveOper = oper;

        // Track the second mask candidate in a local so that `op2` itself is
        // left untouched for the code that follows when no fold is performed.
        GenTree* maskOp2 = op2;

        if (oper == GT_NOT)
        {
            // Unary: treat the single operand as both inputs for the checks below
            assert(op2 == nullptr);
            maskOp2 = op1;
        }
        else if (!op2->OperIsHWIntrinsic() && (oper == GT_XOR) && op2->IsVectorAllBitsSet())
        {
            // We want to explicitly recognize op1 ^ AllBitsSet as
            // some platforms don't have direct support for ~op1

            effectiveOper = GT_NOT;
            maskOp2       = op1;
        }

        // We need both operands to be ConvertMaskToVector in
        // order to optimize this to a direct mask operation

        if (maskOp2->OperIsConvertMaskToVector())
        {
            GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
            GenTreeHWIntrinsic* cvtOp2 = maskOp2->AsHWIntrinsic();

            unsigned simdBaseTypeSize = genTypeSize(simdBaseType);

            // We need both operands to be the same kind of mask; otherwise
            // the bitwise operation can differ in how it performs

            if ((genTypeSize(cvtOp1->GetSimdBaseType()) == simdBaseTypeSize) &&
                (genTypeSize(cvtOp2->GetSimdBaseType()) == simdBaseTypeSize))
            {
                NamedIntrinsic maskIntrinsicId = NI_Illegal;

                switch (effectiveOper)
                {
                    case GT_AND:
                    {
                        maskIntrinsicId = NI_AVX512_AndMask;
                        break;
                    }

                    case GT_NOT:
                    {
                        maskIntrinsicId = NI_AVX512_NotMask;
                        break;
                    }

                    case GT_OR:
                    {
                        maskIntrinsicId = NI_AVX512_OrMask;
                        break;
                    }

                    case GT_XOR:
                    {
                        maskIntrinsicId = NI_AVX512_XorMask;
                        break;
                    }

                    default:
                    {
                        unreached();
                    }
                }

                assert(maskIntrinsicId != NI_Illegal);

                if (effectiveOper == oper)
                {
                    // Same arity as the original node: retarget it in place
                    tree->ChangeHWIntrinsicId(maskIntrinsicId);
                    tree->Op(1) = cvtOp1->Op(1);
                }
                else
                {
                    // Binary `x ^ AllBitsSet` becomes unary NotMask(x)
                    assert(effectiveOper == GT_NOT);
                    tree->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
                    tree->gtFlags &= ~GTF_REVERSE_OPS;
                }

                tree->gtType = TYP_MASK;
                DEBUG_DESTROY_NODE(op1);

                if (effectiveOper != GT_NOT)
                {
                    tree->Op(2) = cvtOp2->Op(1);
                }

                // `op2` is the node's original second operand (null for a unary NOT);
                // it is either the consumed conversion or the AllBitsSet constant.
                if (op2 != nullptr)
                {
                    DEBUG_DESTROY_NODE(op2);
                }
                tree->SetMorphed(this);

                // The consumer still expects a vector, so convert the mask result back
                tree = gtNewSimdCvtMaskToVectorNode(retType, tree, simdBaseJitType, simdSize)->AsHWIntrinsic();
                tree->SetMorphed(this);

                return tree;
            }
        }
    }
#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_XARCH
32352+ 
3222832353    GenTree* cnsNode   = nullptr;
3222932354    GenTree* otherNode = nullptr;
3223032355
@@ -32762,10 +32887,28 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3276232887            oper = GT_NONE;
3276332888        }
3276432889
32890+         // For mask nodes in particular, the foldings below are done under the presumption
32891+         // that we only produce something like `AddMask(op1, op2)` if op1 and op2 are compatible
32892+         // masks. On xarch, for example, this means that it'd be adding 8, 16, 32, or 64-bits
32893+         // together with the same size. We wouldn't ever encounter something like an 8 and 16 bit
32894+         // masks being added. This ensures that we don't end up with a case where folding would
32895+         // cause a different result to be produced, such as because the remaining upper bits are
32896+         // no longer zeroed.
32897+ 
3276532898        switch (oper)
3276632899        {
3276732900            case GT_ADD:
3276832901            {
32902+                 if (varTypeIsMask(retType))
32903+                 {
32904+                     // Handle `x + 0 == x` and `0 + x == x`
32905+                     if (cnsNode->IsMaskZero())
32906+                     {
32907+                         resultNode = otherNode;
32908+                     }
32909+                     break;
32910+                 }
32911+ 
3276932912                if (varTypeIsFloating(simdBaseType))
3277032913                {
3277132914                    // Handle `x + NaN == NaN` and `NaN + x == NaN`
@@ -32799,6 +32942,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3279932942
3280032943            case GT_AND:
3280132944            {
32945+                 if (varTypeIsMask(retType))
32946+                 {
32947+                     // Handle `x & 0 == 0` and `0 & x == 0`
32948+                     if (cnsNode->IsMaskZero())
32949+                     {
32950+                         resultNode = otherNode;
32951+                         break;
32952+                     }
32953+ 
32954+                     // Handle `x & AllBitsSet == x` and `AllBitsSet & x == x`
32955+                     if (cnsNode->IsMaskAllBitsSet())
32956+                     {
32957+                         resultNode = otherNode;
32958+                     }
32959+                     break;
32960+                 }
32961+ 
3280232962                // Handle `x & 0 == 0` and `0 & x == 0`
3280332963                if (cnsNode->IsVectorZero())
3280432964                {
@@ -33032,6 +33192,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3303233192
3303333193            case GT_OR:
3303433194            {
33195+                 if (varTypeIsMask(retType))
33196+                 {
33197+                     // Handle `x | 0 == x` and `0 | x == x`
33198+                     if (cnsNode->IsMaskZero())
33199+                     {
33200+                         resultNode = otherNode;
33201+                         break;
33202+                     }
33203+ 
33204+                     // Handle `x | AllBitsSet == AllBitsSet` and `AllBitsSet | x == AllBitsSet`
33205+                     if (cnsNode->IsMaskAllBitsSet())
33206+                     {
33207+                         resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
33208+                     }
33209+                     break;
33210+                 }
33211+ 
3303533212                // Handle `x | 0 == x` and `0 | x == x`
3303633213                if (cnsNode->IsVectorZero())
3303733214                {
@@ -33059,6 +33236,27 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3305933236                // Handle `x >>  0 == x` and `0 >>  x == 0`
3306033237                // Handle `x >>> 0 == x` and `0 >>> x == 0`
3306133238
33239+                 if (varTypeIsMask(retType))
33240+                 {
33241+                     if (cnsNode->IsMaskZero())
33242+                     {
33243+                         if (cnsNode == op2)
33244+                         {
33245+                             resultNode = otherNode;
33246+                         }
33247+                         else
33248+                         {
33249+                             resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
33250+                         }
33251+                     }
33252+                     else if (cnsNode->IsIntegralConst(0))
33253+                     {
33254+                         assert(cnsNode == op2);
33255+                         resultNode = otherNode;
33256+                     }
33257+                     break;
33258+                 }
33259+ 
3306233260                if (cnsNode->IsVectorZero())
3306333261                {
3306433262                    if (cnsNode == op2)
@@ -33104,7 +33302,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3310433302
3310533303            case GT_XOR:
3310633304            {
33107-                 // Handle `x | 0 == x` and `0 | x == x`
33305+                 if (varTypeIsMask(retType))
33306+                 {
33307+                     // Handle `x ^ 0 == x` and `0 ^ x == x`
33308+                     if (cnsNode->IsMaskZero())
33309+                     {
33310+                         resultNode = otherNode;
33311+                     }
33312+                     break;
33313+                 }
33314+ 
33315+                 // Handle `x ^ 0 == x` and `0 ^ x == x`
3310833316                if (cnsNode->IsVectorZero())
3310933317                {
3311033318                    resultNode = otherNode;
@@ -33273,7 +33481,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3327333481                }
3327433482                else
3327533483                {
33276-                     assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask ());
33484+                     assert(!op1->IsTrueMask(simdBaseType) && !op1->IsMaskZero ());
3327733485                }
3327833486#endif
3327933487
@@ -33291,7 +33499,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3329133499                    return op2;
3329233500                }
3329333501
33294-                 if (op1->IsVectorZero() || op1->IsFalseMask ())
33502+                 if (op1->IsVectorZero() || op1->IsMaskZero ())
3329533503                {
3329633504                    return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT);
3329733505                }
0 commit comments