From 928fea3265056cab692e08e514970f00757d977c Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Mon, 2 Dec 2024 17:12:38 -0800
Subject: [PATCH] Updating xarch to utilize EVEX compares and blending where
 profitable

---
 src/coreclr/jit/gentree.cpp            |   8 +-
 src/coreclr/jit/hwintrinsic.h          |  11 +-
 src/coreclr/jit/hwintrinsiclistxarch.h |  92 ++--
 src/coreclr/jit/hwintrinsicxarch.cpp   | 378 ++++++++++-------
 src/coreclr/jit/lowerxarch.cpp         | 559 ++++++++++++++++++++++++-
 src/coreclr/jit/lsraxarch.cpp          |   2 +-
 6 files changed, 840 insertions(+), 210 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index d8d5752a5f6441..512d6d364ebba0 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -29540,7 +29540,7 @@ var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp(
     {
         case GT_EQ:
         {
-            if (simdSize == 64)
+            if ((simdSize == 64) || (comp->opts.OptimizationEnabled() && comp->canUseEvexEncoding()))
             {
                 lookupType = TYP_MASK;
             }
@@ -29551,7 +29551,8 @@ var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp(
         case GT_LE:
         case GT_NE:
         {
-            if ((simdSize == 64) || (varTypeIsIntegral(simdBaseType) && comp->canUseEvexEncoding()))
+            if ((simdSize == 64) ||
+                ((comp->opts.OptimizationEnabled() || varTypeIsIntegral(simdBaseType)) && comp->canUseEvexEncoding()))
             {
                 lookupType = TYP_MASK;
             }
@@ -29561,7 +29562,8 @@ var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp(
         case GT_GT:
         case GT_LT:
         {
-            if ((simdSize == 64) || (varTypeIsUnsigned(simdBaseType) && comp->canUseEvexEncoding()))
+            if ((simdSize == 64) ||
+                ((comp->opts.OptimizationEnabled() || varTypeIsUnsigned(simdBaseType)) && comp->canUseEvexEncoding()))
             {
                 lookupType = TYP_MASK;
             }
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index d8bf386eb6009d..48c7eec6691125 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -546,12 +546,11 @@ struct HWIntrinsicInfo
     static bool isScalarIsa(CORINFO_InstructionSet isa);

 #ifdef TARGET_XARCH
-    static bool                isAVX2GatherIntrinsic(NamedIntrinsic id);
-    static FloatComparisonMode lookupFloatComparisonModeForSwappedArgs(FloatComparisonMode comparison);
-    static NamedIntrinsic      lookupIdForFloatComparisonMode(NamedIntrinsic intrinsic,
-                                                              FloatComparisonMode comparison,
-                                                              var_types simdBaseType,
-                                                              unsigned simdSize);
+    static bool           isAVX2GatherIntrinsic(NamedIntrinsic id);
+    static NamedIntrinsic lookupIdForFloatComparisonMode(NamedIntrinsic intrinsic,
+                                                         FloatComparisonMode comparison,
+                                                         var_types simdBaseType,
+                                                         unsigned simdSize);
 #endif

     // Member lookup
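The gentree.cpp hunks above make one change three times over: GetLookupTypeForCmpOp now reports TYP_MASK for a vector comparison whenever the JIT is optimizing and the target can use EVEX, rather than only when no legacy encoding exists (64-byte vectors, or the integral/unsigned element types called out per operator). A condensed sketch of the shared predicate; shouldUseMaskCompare is a hypothetical name for illustration, and the real code keeps the check inlined per GT_* case exactly as shown:

    // Sketch only: mirrors the per-case checks in GetLookupTypeForCmpOp above.
    static bool shouldUseMaskCompare(Compiler* comp, unsigned simdSize, bool typeRequiresEvex)
    {
        if (simdSize == 64)
        {
            return true; // TYP_SIMD64 comparisons only exist as EVEX mask compares
        }
        // New in this change: prefer mask compares whenever merely profitable (optimizing
        // with EVEX available), not just when the element type has no legacy encoding.
        return (comp->opts.OptimizationEnabled() || typeRequiresEvex) && comp->canUseEvexEncoding();
    }

The hwintrinsic.h hunk drops the declaration of lookupFloatComparisonModeForSwappedArgs; its definition is deleted wholesale from hwintrinsicxarch.cpp further down, the patch leaving the swapped-operand remapping with no remaining callers once these compares import as EVEX mask operations.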
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index a60b8ffd773cc7..22618629a1dc66 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -387,17 +387,17 @@ HARDWARE_INTRINSIC(SSE, Add,
 HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
-HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
@@ -422,7 +422,7 @@ HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThanOrEqual,
 HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si32, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@ -492,17 +492,17 @@ HARDWARE_INTRINSIC(SSE2, AddScalar,
 HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt)
 HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt)
 HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
-HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
@@ -527,7 +527,7 @@ HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThanOrEqual,
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Int32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2ss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
@@ -647,10 +647,10 @@ HARDWARE_INTRINSIC(SSSE3, Sign,
 // SSE41 Intrinsics
 #define FIRST_NI_SSE41 NI_SSE41_Blend
 HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
@@ -698,8 +698,8 @@ HARDWARE_INTRINSIC(SSE41_X64, Insert,
 // ***************************************************************************************************************************************************************************************************
 // SSE42 Intrinsics
 #define FIRST_NI_SSE42 NI_SSE42_CompareGreaterThan
-HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(SSE42, Crc32, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic)
 #define LAST_NI_SSE42 NI_SSE42_Crc32

@@ -723,25 +723,25 @@ HARDWARE_INTRINSIC(AVX, AddSubtract,
 HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
-HARDWARE_INTRINSIC(AVX, Compare, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
-HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareNotEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareNotGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareNotGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareNotLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, Compare, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareNotEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareNotGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareNotGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareNotLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
-HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
@@ -806,13 +806,13 @@ HARDWARE_INTRINSIC(AVX2, And,
 HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt)
 HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
 HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
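Every pre-AVX512 vector compare and variable blend in the table above picks up HW_Flag_SpecialImport, so those intrinsics now flow through impSpecialIntrinsic in hwintrinsicxarch.cpp (next file), where they can be retargeted to the EVEX mask forms when that is profitable; AVX.Compare also moves from a fixed size of 32 to -1, since it covers both the 128-bit and 256-bit overloads. The importer applies the same rewrite everywhere; a condensed sketch with importAsMaskCompare as an illustrative name (the real code inlines this shape per intrinsic, as the following diff shows):

    // Sketch only: the shape shared by the CompareXxx cases in the next file.
    static GenTree* importAsMaskCompare(Compiler* comp, GenTree* op1, GenTree* op2, NamedIntrinsic maskId,
                                        var_types simdType, CorInfoType simdBaseJitType, unsigned simdSize)
    {
        // Perform the comparison directly into a kmask register...
        GenTree* mask = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, maskId, simdBaseJitType, simdSize);

        // ...then widen back to a vector only because the public API returns one; lowering can
        // drop this conversion again when the consumer (e.g. a blend) accepts the mask directly.
        return comp->gtNewSimdCvtMaskToVectorNode(simdType, mask, simdBaseJitType, simdSize);
    }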
FloatComparisonMode::UnorderedNotGreaterThanSignaling; - case FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling: - return FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling; - case FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling: - return FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling; - case FloatComparisonMode::UnorderedNotGreaterThanSignaling: - return FloatComparisonMode::UnorderedNotLessThanSignaling; - case FloatComparisonMode::OrderedGreaterThanOrEqualSignaling: - return FloatComparisonMode::OrderedLessThanOrEqualSignaling; - case FloatComparisonMode::OrderedGreaterThanSignaling: - return FloatComparisonMode::OrderedLessThanSignaling; - case FloatComparisonMode::OrderedLessThanNonSignaling: - return FloatComparisonMode::OrderedGreaterThanNonSignaling; - case FloatComparisonMode::OrderedLessThanOrEqualNonSignaling: - return FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling; - case FloatComparisonMode::UnorderedNotLessThanNonSignaling: - return FloatComparisonMode::UnorderedNotGreaterThanNonSignaling; - case FloatComparisonMode::UnorderedNotLessThanOrEqualNonSignaling: - return FloatComparisonMode::UnorderedNotGreaterThanOrEqualNonSignaling; - case FloatComparisonMode::UnorderedNotGreaterThanOrEqualNonSignaling: - return FloatComparisonMode::UnorderedNotLessThanOrEqualNonSignaling; - case FloatComparisonMode::UnorderedNotGreaterThanNonSignaling: - return FloatComparisonMode::UnorderedNotLessThanNonSignaling; - case FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling: - return FloatComparisonMode::OrderedLessThanOrEqualNonSignaling; - case FloatComparisonMode::OrderedGreaterThanNonSignaling: - return FloatComparisonMode::OrderedLessThanNonSignaling; - - default: - unreached(); - } -} - //------------------------------------------------------------------------ // lookupIdForFloatComparisonMode: Get the intrinsic ID to use for a given float comparison mode // @@ -2511,14 +2422,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_ExtractMostSignificantBits: + case NI_Vector256_ExtractMostSignificantBits: case NI_Vector512_ExtractMostSignificantBits: { + assert(sig->numArgs == 1); + #if defined(TARGET_X86) - // TODO-XARCH-CQ: It may be beneficial to decompose this operation - break; + if (intrinsic == NI_Vector512_ExtractMostSignificantBits) + { + // TODO-XARCH-CQ: It may be beneficial to decompose this operation + // for byte it requires proper decomposition, but for other types + // it only requires the upper bits to be zero. 
+ break; + } #endif // TARGET_X86 - if (IsBaselineVector512IsaSupportedOpportunistically()) + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) { op1 = impSIMDPopStack(); @@ -2526,15 +2446,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { op1 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize); } + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_EVEX_MoveMask, simdBaseJitType, simdSize); + break; } - break; - } - case NI_Vector128_ExtractMostSignificantBits: - case NI_Vector256_ExtractMostSignificantBits: - { - assert(sig->numArgs == 1); + assert(intrinsic != NI_Vector512_ExtractMostSignificantBits); if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) @@ -4817,6 +4734,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_SSE41_BlendVariable: + case NI_AVX_BlendVariable: + case NI_AVX2_BlendVariable: case NI_AVX512F_BlendVariable: case NI_AVX512BW_BlendVariable: { @@ -4826,26 +4746,32 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - if (!varTypeIsMask(op3)) + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) { - op3 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op3, simdBaseJitType, simdSize); + if (!varTypeIsMask(op3)) + { + op3 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op3, simdBaseJitType, simdSize); + } + intrinsic = NI_EVEX_BlendVariableMask; } - retNode = - gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, NI_EVEX_BlendVariableMask, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); break; } case NI_AVX_Compare: - case NI_AVX_CompareScalar: case NI_AVX512F_Compare: { - assert(sig->numArgs == 3); - - if (intrinsic == NI_AVX512F_Compare) + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) { intrinsic = NI_EVEX_CompareMask; retType = TYP_MASK; } + FALLTHROUGH; + } + + case NI_AVX_CompareScalar: + { + assert(sig->numArgs == 3); int immLowerBound = 0; int immUpperBound = HWIntrinsicInfo::lookupImmUpperBound(intrinsic); @@ -4885,6 +4811,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_SSE_CompareEqual: + case NI_SSE2_CompareEqual: + case NI_SSE41_CompareEqual: + case NI_AVX_CompareEqual: + case NI_AVX2_CompareEqual: case NI_AVX512F_CompareEqual: case NI_AVX512BW_CompareEqual: { @@ -4893,28 +4824,58 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareEqualMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareEqualMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareGreaterThan: + case NI_SSE2_CompareGreaterThan: + case NI_SSE42_CompareGreaterThan: + case NI_AVX_CompareGreaterThan: + case NI_AVX2_CompareGreaterThan: case NI_AVX512F_CompareGreaterThan: case NI_AVX512F_VL_CompareGreaterThan: - case 
NI_AVX10v1_CompareGreaterThan: case NI_AVX512BW_CompareGreaterThan: case NI_AVX512BW_VL_CompareGreaterThan: + case NI_AVX10v1_CompareGreaterThan: { assert(sig->numArgs == 2); op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareGreaterThanMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (intrinsic >= FIRST_NI_AVX512F) || + (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareGreaterThanMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareGreaterThanOrEqual: + case NI_SSE2_CompareGreaterThanOrEqual: + case NI_AVX_CompareGreaterThanOrEqual: case NI_AVX512F_CompareGreaterThanOrEqual: case NI_AVX512F_VL_CompareGreaterThanOrEqual: case NI_AVX512BW_CompareGreaterThanOrEqual: @@ -4926,12 +4887,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareGreaterThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (intrinsic >= FIRST_NI_AVX512F) || + (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareGreaterThanOrEqualMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareLessThan: + case NI_SSE2_CompareLessThan: + case NI_SSE42_CompareLessThan: + case NI_AVX_CompareLessThan: + case NI_AVX2_CompareLessThan: case NI_AVX512F_CompareLessThan: case NI_AVX512F_VL_CompareLessThan: case NI_AVX512BW_CompareLessThan: @@ -4943,12 +4920,26 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareLessThanMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (intrinsic >= FIRST_NI_AVX512F) || + (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareLessThanMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareLessThanOrEqual: + case NI_SSE2_CompareLessThanOrEqual: + case NI_AVX_CompareLessThanOrEqual: case NI_AVX512F_CompareLessThanOrEqual: case NI_AVX512F_VL_CompareLessThanOrEqual: case NI_AVX512BW_CompareLessThanOrEqual: @@ -4960,12 +4951,26 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareLessThanOrEqualMask, simdBaseJitType, - simdSize); - retNode 
= gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (intrinsic >= FIRST_NI_AVX512F) || + (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareLessThanOrEqualMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareNotEqual: + case NI_SSE2_CompareNotEqual: + case NI_AVX_CompareNotEqual: case NI_AVX512F_CompareNotEqual: case NI_AVX512F_VL_CompareNotEqual: case NI_AVX512BW_CompareNotEqual: @@ -4977,12 +4982,26 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareNotEqualMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (intrinsic >= FIRST_NI_AVX512F) || + (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareNotEqualMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareNotGreaterThan: + case NI_SSE2_CompareNotGreaterThan: + case NI_AVX_CompareNotGreaterThan: case NI_AVX512F_CompareNotGreaterThan: { assert(sig->numArgs == 2); @@ -4990,12 +5009,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareNotGreaterThanMask, simdBaseJitType, - simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareNotGreaterThanMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareNotGreaterThanOrEqual: + case NI_SSE2_CompareNotGreaterThanOrEqual: + case NI_AVX_CompareNotGreaterThanOrEqual: case NI_AVX512F_CompareNotGreaterThanOrEqual: { assert(sig->numArgs == 2); @@ -5003,12 +5035,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareNotGreaterThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding())) + { + intrinsic = NI_EVEX_CompareNotGreaterThanOrEqualMask; + retType = TYP_MASK; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); + + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } break; } + case NI_SSE_CompareNotLessThan: + case 
+        case NI_SSE_CompareNotLessThan:
+        case NI_SSE2_CompareNotLessThan:
+        case NI_AVX_CompareNotLessThan:
         case NI_AVX512F_CompareNotLessThan:
         {
             assert(sig->numArgs == 2);
@@ -5016,12 +5061,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             op2 = impSIMDPopStack();
             op1 = impSIMDPopStack();
 
-            retNode =
-                gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareNotLessThanMask, simdBaseJitType, simdSize);
-            retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding()))
+            {
+                intrinsic = NI_EVEX_CompareNotLessThanMask;
+                retType   = TYP_MASK;
+            }
+
+            retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
+
+            if (retType == TYP_MASK)
+            {
+                retType = getSIMDTypeForSize(simdSize);
+                retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            }
             break;
         }
 
+        case NI_SSE_CompareNotLessThanOrEqual:
+        case NI_SSE2_CompareNotLessThanOrEqual:
+        case NI_AVX_CompareNotLessThanOrEqual:
         case NI_AVX512F_CompareNotLessThanOrEqual:
         {
             assert(sig->numArgs == 2);
@@ -5029,12 +5087,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             op2 = impSIMDPopStack();
             op1 = impSIMDPopStack();
 
-            retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareNotLessThanOrEqualMask,
-                                               simdBaseJitType, simdSize);
-            retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding()))
+            {
+                intrinsic = NI_EVEX_CompareNotLessThanOrEqualMask;
+                retType   = TYP_MASK;
+            }
+
+            retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
+
+            if (retType == TYP_MASK)
+            {
+                retType = getSIMDTypeForSize(simdSize);
+                retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            }
             break;
         }
 
+        case NI_SSE_CompareOrdered:
+        case NI_SSE2_CompareOrdered:
+        case NI_AVX_CompareOrdered:
         case NI_AVX512F_CompareOrdered:
         {
             assert(sig->numArgs == 2);
@@ -5042,12 +5113,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             op2 = impSIMDPopStack();
             op1 = impSIMDPopStack();
 
-            retNode =
-                gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareOrderedMask, simdBaseJitType, simdSize);
-            retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding()))
+            {
+                intrinsic = NI_EVEX_CompareOrderedMask;
+                retType   = TYP_MASK;
+            }
+
+            retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
+
+            if (retType == TYP_MASK)
+            {
+                retType = getSIMDTypeForSize(simdSize);
+                retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            }
             break;
         }
 
+        case NI_SSE_CompareUnordered:
+        case NI_SSE2_CompareUnordered:
+        case NI_AVX_CompareUnordered:
         case NI_AVX512F_CompareUnordered:
         {
             assert(sig->numArgs == 2);
@@ -5055,9 +5139,19 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             op2 = impSIMDPopStack();
             op1 = impSIMDPopStack();
 
-            retNode =
-                gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_EVEX_CompareUnorderedMask, simdBaseJitType, simdSize);
-            retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            if ((simdSize == 64) || (opts.OptimizationEnabled() && canUseEvexEncoding()))
+            {
+                intrinsic = NI_EVEX_CompareUnorderedMask;
+                retType   = TYP_MASK;
+            }
+
+            retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
+
+            if (retType == TYP_MASK)
+            {
+                retType = getSIMDTypeForSize(simdSize);
+                retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize);
+            }
             break;
         }
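Before moving to the lowering side, it may help to model the two result shapes the importer now chooses between. A scalar sketch (assumptions: 4 x 32-bit lanes, GreaterThan; this is illustrative C++, not JIT code):

    #include <cstdint>
    #include <cstdio>

    // TYP_MASK holds one bit per element (an EVEX k-register); the legacy
    // vector result holds an all-ones/all-zeros element per lane.
    int main()
    {
        int32_t  a[4]   = {1, 5, 3, 7};
        int32_t  b[4]   = {4, 2, 3, 9};
        uint8_t  mask   = 0;  // mask form: bit i set when a[i] > b[i]
        uint32_t vec[4] = {}; // vector form: lane i all-ones when a[i] > b[i]
        for (int i = 0; i < 4; i++)
        {
            mask |= (uint8_t)((a[i] > b[i]) << i);
            vec[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0;
        }
        printf("mask=0x%X vec={0x%X,0x%X,0x%X,0x%X}\n", mask, vec[0], vec[1], vec[2], vec[3]);
        return 0;
    }

ConvertMaskToVector is exactly the mask-to-vector expansion above; the lowerxarch.cpp changes that follow try to remove it whenever the consumer can use the mask directly.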
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 87528fee51bdfe..7cb8972292f2ec 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -856,7 +856,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
-        assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
+        assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly());
     }
     else if (srcType == TYP_UINT)
     {
@@ -864,7 +864,7 @@
     }
     else if (srcType == TYP_ULONG)
     {
-        assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
+        assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly());
     }
 
 #if defined(TARGET_AMD64)
@@ -963,12 +963,20 @@
                 GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, fixupVal, maxVal, fieldType, 16);
                 BlockRange().InsertAfter(maxValDstType, compMask);
 
+                GenTree* actualCompMask = compMask;
+
+                if (compMask->OperIsConvertMaskToVector())
+                {
+                    actualCompMask = compMask->AsHWIntrinsic()->Op(1);
+                    BlockRange().InsertBefore(compMask, actualCompMask);
+                }
+
                 // convert fixupVal to local variable and clone it for further use
-                LIR::Use fixupValUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask);
+                LIR::Use fixupValUse(BlockRange(), &(actualCompMask->AsHWIntrinsic()->Op(1)), actualCompMask);
                 ReplaceWithLclVar(fixupValUse);
-                fixupVal = compMask->AsHWIntrinsic()->Op(1);
+                fixupVal = actualCompMask->AsHWIntrinsic()->Op(1);
                 GenTree* fixupValClone = comp->gtClone(fixupVal);
-                LowerNode(compMask);
+                LowerNode(actualCompMask);
                 BlockRange().InsertAfter(fixupVal, fixupValClone);
 
                 GenTree* FixupValCloneScalar =
@@ -984,12 +992,15 @@
                 BlockRange().InsertAfter(newCast, newTree);
                 LowerNode(newTree);
 
-                // usage 2 --> use thecompared mask with input value and max value to blend
-                GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
-                BlockRange().InsertAfter(newTree, control);
-                GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree,
-                                                                     control, destFieldType, 16);
-                BlockRange().InsertAfter(control, cndSelect);
+                // usage 2 --> use the compared mask with input value and max value to blend
+                GenTree* cndSelect =
+                    comp->gtNewSimdCndSelNode(TYP_SIMD16, compMask, maxValDstType, newTree, destFieldType, 16);
+                BlockRange().InsertAfter(newTree, cndSelect);
+
+                if (compMask->OperIsConvertMaskToVector())
+                {
+                    LowerNode(compMask);
+                }
                 LowerNode(cndSelect);
 
                 castOutput =
@@ -1064,7 +1075,7 @@
                 BlockRange().InsertAfter(newCast, newTree);
                 LowerNode(newTree);
 
-                // usage 2 --> use thecompared mask with input value and max value to blend
+                // usage 2 --> use the compared mask with input value and max value to blend
                 GenTree* cndSelect = comp->gtNewSimdCndSelNode(TYP_SIMD16, compMask, maxValDup, newTree, destFieldType, 16);
                 BlockRange().InsertAfter(newTree, cndSelect);
                 LowerNode(cndSelect);
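The LowerCast change above retires a hand-built vpternlog select in favor of gtNewSimdCndSelNode, leaving instruction selection to the common conditional-select path. For reference, the removed 0xCA constant is exactly the truth table of (B & A) | (C & ~A) over the canonical ternlog inputs; a small standalone check:

    #include <cstdio>

    // vpternlog control bytes are the truth table of the desired expression
    // evaluated over the canonical inputs A=0xF0, B=0xCC, C=0xAA.
    int main()
    {
        unsigned a = 0xF0, b = 0xCC, c = 0xAA;
        unsigned control = (b & a) | (c & ~a & 0xFFu);
        printf("0x%X\n", control); // prints 0xCA: select B where A, else C
        return 0;
    }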
@@ -1955,6 +1966,30 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
 
             if (!varTypeIsFloating(node->GetSimdBaseType()) && op2->IsVectorZero())
             {
+                LIR::Use use;
+
+                if ((node->GetSimdSize() != 64) && BlockRange().TryGetUse(node, &use))
+                {
+                    GenTree* parentNode = use.User();
+
+                    if (parentNode->OperIsConvertMaskToVector())
+                    {
+                        LIR::Use parentUse;
+
+                        if (BlockRange().TryGetUse(parentNode, &parentUse))
+                        {
+                            if (!parentUse.User()->OperIsHWIntrinsic(NI_EVEX_MoveMask))
+                            {
+                                // For TYP_SIMD16 and TYP_SIMD32 we want to avoid this optimization
+                                // if the user would be just converting the mask back to a vector
+                                // as we can instead rewrite this to a regular CompareEqual and then
+                                // consume the vector directly.
+                                break;
+                            }
+                        }
+                    }
+                }
+
                 NamedIntrinsic testIntrinsicId;
 
                 if (intrinsicId == NI_EVEX_CompareEqualMask)
@@ -2045,6 +2080,506 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
                 break;
             }
 
+            case NI_EVEX_BlendVariableMask:
+            {
+                unsigned simdSize = node->GetSimdSize();
+
+                if (simdSize == 64)
+                {
+                    // Nothing to handle for TYP_SIMD64 as they require masks
+                    break;
+                }
+
+                GenTree* op3 = node->Op(3);
+
+                if (!op3->OperIsConvertVectorToMask())
+                {
+                    // We can only special case when op3 is ConvertVectorToMask
+                    break;
+                }
+
+                // We have BlendVariableMask(op1, op2, ConvertVectorToMask(op3)) and
+                // so we'll rewrite it to BlendVariable(op1, op2, op3) allowing us
+                // to avoid the additional conversion altogether
+
+                var_types simdBaseType = node->GetSimdBaseType();
+
+                if (simdSize == 32)
+                {
+                    intrinsicId = varTypeIsFloating(simdBaseType) ? NI_AVX_BlendVariable : NI_AVX2_BlendVariable;
+                }
+                else
+                {
+                    intrinsicId = NI_SSE41_BlendVariable;
+                }
+
+                node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), op3->AsHWIntrinsic()->Op(1));
+                BlockRange().Remove(op3);
+
+                return LowerNode(node);
+            }
+
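The BlendVariableMask rewrite is safe because both blend forms select whole elements and differ only in where the selector lives. A scalar model of the two (a hedged sketch, not JIT code):

    #include <cstdint>
    #include <cstdio>

    // VEX-era blendv (e.g. vblendvps) keys off the sign bit of each mask
    // element, while EVEX vpblendm* keys off one bit per element in a k-reg.
    static uint32_t BlendVariable(uint32_t a, uint32_t b, uint32_t maskElem)
    {
        return ((int32_t)maskElem < 0) ? b : a; // sign bit selects
    }

    static uint32_t BlendVariableMask(uint32_t a, uint32_t b, bool k)
    {
        return k ? b : a; // one mask bit per element selects
    }

    int main()
    {
        // A compare produces all-ones (0xFFFFFFFF) or all-zeros lanes, so both
        // forms agree on such inputs; that is what lets the lowering consume
        // the pre-conversion vector directly and drop ConvertVectorToMask.
        printf("%u %u\n", BlendVariable(1, 2, 0xFFFFFFFF), BlendVariableMask(1, 2, true)); // 2 2
        return 0;
    }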
+            case NI_EVEX_ConvertMaskToVector:
+            {
+                NamedIntrinsic id  = NI_Illegal;
+                GenTree*       op1 = node->Op(1);
+
+                unsigned  simdSize     = node->GetSimdSize();
+                var_types simdBaseType = node->GetSimdBaseType();
+
+                LIR::Use use;
+                bool     foundUse = BlockRange().TryGetUse(node, &use);
+
+                if (foundUse)
+                {
+                    if (use.User()->OperIsVectorConditionalSelect())
+                    {
+                        // We have a ConvertMaskToVector but its user is ConditionalSelect
+                        // which means we can actually consume the mask directly for this
+                        // special scenario.
+                        break;
+                    }
+
+                    if (use.User()->OperIsConvertVectorToMask())
+                    {
+                        // We have ConvertVectorToMask(ConvertMaskToVector(op1))
+                        // so we can optimize it to just be op1 if they are compatible
+
+                        GenTreeHWIntrinsic* parentNode       = use.User()->AsHWIntrinsic();
+                        unsigned            simdBaseTypeSize = genTypeSize(simdBaseType);
+
+                        if ((genTypeSize(parentNode->GetSimdBaseType()) == simdBaseTypeSize))
+                        {
+                            LIR::Use parentUse;
+
+                            if (BlockRange().TryGetUse(parentNode, &parentUse))
+                            {
+                                parentUse.ReplaceWith(op1);
+                            }
+                            else
+                            {
+                                op1->SetUnusedValue();
+                            }
+
+                            BlockRange().Remove(parentNode);
+
+                            GenTree* nextNode = node->gtNext;
+                            BlockRange().Remove(node);
+
+                            return nextNode;
+                        }
+                    }
+                }
+
+                if (simdSize == 64)
+                {
+                    // Nothing to handle for TYP_SIMD64 as they require masks
+                    break;
+                }
+
+                if (!op1->OperIsHWIntrinsic())
+                {
+                    // We can only special case certain HWINTRINSIC nodes
+                    break;
+                }
+
+                GenTreeHWIntrinsic* op1Intrin   = op1->AsHWIntrinsic();
+                NamedIntrinsic      op1IntrinId = op1Intrin->GetHWIntrinsicId();
+
+                switch (op1IntrinId)
+                {
+                    case NI_EVEX_CompareEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::OrderedEqualNonSignaling, simdBaseType, simdSize);
+                        }
+                        else if (simdSize == 32)
+                        {
+                            id = NI_AVX2_CompareEqual;
+                        }
+                        else if (varTypeIsLong(simdBaseType))
+                        {
+                            id = NI_SSE41_CompareEqual;
+                        }
+                        else
+                        {
+                            id = NI_SSE2_CompareEqual;
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareGreaterThanMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::OrderedGreaterThanSignaling, simdBaseType, simdSize);
+                        }
+                        else if (varTypeIsUnsigned(simdBaseType))
+                        {
+                            // Unsigned integer comparisons must use the EVEX instruction
+                            break;
+                        }
+                        else if (simdSize == 32)
+                        {
+                            id = NI_AVX2_CompareGreaterThan;
+                        }
+                        else if (varTypeIsLong(simdBaseType))
+                        {
+                            id = NI_SSE42_CompareGreaterThan;
+                        }
+                        else
+                        {
+                            id = NI_SSE2_CompareGreaterThan;
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareGreaterThanOrEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::OrderedGreaterThanOrEqualSignaling, simdBaseType,
+                                simdSize);
+                        }
+                        else
+                        {
+                            // Integer comparisons must use the EVEX instruction
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareLessThanMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::OrderedLessThanSignaling, simdBaseType, simdSize);
+                        }
+                        else if (varTypeIsUnsigned(simdBaseType))
+                        {
+                            // Unsigned integer comparisons must use the EVEX instruction
+                            break;
+                        }
+                        else if (simdSize == 32)
+                        {
+                            id = NI_AVX2_CompareLessThan;
+                        }
+                        else if (varTypeIsLong(simdBaseType))
+                        {
+                            id = NI_SSE42_CompareLessThan;
+                        }
+                        else
+                        {
+                            id = NI_SSE2_CompareLessThan;
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareLessThanOrEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::OrderedLessThanOrEqualSignaling, simdBaseType,
+                                simdSize);
+                        }
+                        else
+                        {
+                            // Integer comparisons must use the EVEX instruction
+                        }
+                        break;
+                    }
+
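The "unsigned must stay EVEX" early-outs above exist because the VEX-era pcmpgt* instructions compare as signed; a signed compare gives the wrong answer for unsigned lanes once the high bit is set. A minimal scalar demonstration:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        uint32_t x = 0x80000000, y = 1;
        assert(x > y);                      // unsigned GreaterThan: true
        assert(!((int32_t)x > (int32_t)y)); // signed view of the same bits: false
        return 0;
    }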
+                    case NI_EVEX_CompareNotEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::UnorderedNotEqualNonSignaling, simdBaseType, simdSize);
+                        }
+                        else
+                        {
+                            // Integer comparisons must use the EVEX instruction
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareNotGreaterThanMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::UnorderedNotGreaterThanSignaling, simdBaseType,
+                                simdSize);
+                        }
+                        else
+                        {
+                            // Integer comparisons must use the EVEX instruction
+                            // as this is the same as: LessThanOrEqual
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareNotGreaterThanOrEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling, simdBaseType,
+                                simdSize);
+                        }
+                        else if (varTypeIsUnsigned(simdBaseType))
+                        {
+                            // Unsigned integer comparisons must use the EVEX instruction
+                            // as this is the same as: LessThan
+                            break;
+                        }
+                        else if (simdSize == 32)
+                        {
+                            id = NI_AVX2_CompareLessThan;
+                        }
+                        else if (varTypeIsLong(simdBaseType))
+                        {
+                            id = NI_SSE42_CompareLessThan;
+                        }
+                        else
+                        {
+                            id = NI_SSE2_CompareLessThan;
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareNotLessThanMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::UnorderedNotLessThanSignaling, simdBaseType, simdSize);
+                        }
+                        else
+                        {
+                            // Integer comparisons must use the EVEX instruction
+                            // as this is the same as: GreaterThanOrEqual
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareNotLessThanOrEqualMask:
+                    {
+                        if (varTypeIsFloating(simdBaseType))
+                        {
+                            id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(
+                                NI_AVX_Compare, FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling, simdBaseType,
+                                simdSize);
+                        }
+                        else if (varTypeIsUnsigned(simdBaseType))
+                        {
+                            // Unsigned integer comparisons must use the EVEX instruction
+                            // as this is the same as: GreaterThan
+                            break;
+                        }
+                        else if (simdSize == 32)
+                        {
+                            id = NI_AVX2_CompareGreaterThan;
+                        }
+                        else if (varTypeIsLong(simdBaseType))
+                        {
+                            id = NI_SSE42_CompareGreaterThan;
+                        }
+                        else
+                        {
+                            id = NI_SSE2_CompareGreaterThan;
+                        }
+                        break;
+                    }
+
+                    case NI_EVEX_CompareOrderedMask:
+                    {
+                        assert(varTypeIsFloating(simdBaseType));
+                        id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(NI_AVX_Compare,
+                                                                             FloatComparisonMode::OrderedNonSignaling,
+                                                                             simdBaseType, simdSize);
+                        break;
+                    }
+
+                    case NI_EVEX_CompareUnorderedMask:
+                    {
+                        assert(varTypeIsFloating(simdBaseType));
+                        id = HWIntrinsicInfo::lookupIdForFloatComparisonMode(NI_AVX_Compare,
+                                                                             FloatComparisonMode::UnorderedNonSignaling,
+                                                                             simdBaseType, simdSize);
+                        break;
+                    }
+
+                    default:
+                    {
+                        // Other cases get no special handling
+                        break;
+                    }
+                }
+
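The "same as: LessThan / GreaterThan" remaps in the Not* arms rely on the predicates collapsing to their ordered duals under a total order, which holds for integers (and is exactly why they do not hold for floats with NaN, as noted earlier). A quick exhaustive check over a small signed range:

    #include <cassert>

    int main()
    {
        for (int x = -2; x <= 2; x++)
        {
            for (int y = -2; y <= 2; y++)
            {
                assert((!(x >= y)) == (x < y)); // NotGreaterThanOrEqual == LessThan
                assert((!(x <= y)) == (x > y)); // NotLessThanOrEqual == GreaterThan
            }
        }
        return 0;
    }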
+                if (id != NI_Illegal)
+                {
+                    GenTree* op2 = op1Intrin->Op(2);
+
+                    if (op2->isContained() && op2->OperIsHWIntrinsic() && op2->AsHWIntrinsic()->OperIsBroadcastScalar())
+                    {
+                        // Don't rewrite cases that are taking advantage of embedded broadcast
+                        // as they typically reduce cache impact and help reduce code size.
+                        break;
+                    }
+
+                    // We've remapped ConvertMaskToVector(Compare*Mask) to be simply
+                    // Compare*, allowing us to avoid the additional conversion expense
+
+                    op1Intrin->gtType = node->TypeGet();
+                    op1Intrin->ChangeHWIntrinsicId(id);
+
+                    GenTree* nextNode = node->gtNext;
+
+                    if (foundUse)
+                    {
+                        use.ReplaceWith(op1Intrin);
+                    }
+                    else
+                    {
+                        op1Intrin->SetUnusedValue();
+                    }
+
+                    // Some intrinsics need operand swapping, so ensure
+                    // we clear containment and relower the node
+
+                    op2->ClearContained();
+                    LowerNode(op1Intrin);
+
+                    BlockRange().Remove(node);
+                    return nextNode;
+                }
+                break;
+            }
+
+            case NI_EVEX_ConvertVectorToMask:
+            {
+                GenTree* op1 = node->Op(1);
+
+                unsigned  simdSize     = node->GetSimdSize();
+                var_types simdBaseType = node->GetSimdBaseType();
+
+                LIR::Use use;
+
+                if (BlockRange().TryGetUse(node, &use))
+                {
+                    if (use.User()->OperIsConvertMaskToVector())
+                    {
+                        // We have ConvertMaskToVector(ConvertVectorToMask(op1))
+                        // so we can optimize it to just be op1 if they are compatible
+
+                        GenTreeHWIntrinsic* parentNode       = use.User()->AsHWIntrinsic();
+                        unsigned            simdBaseTypeSize = genTypeSize(simdBaseType);
+
+                        if ((genTypeSize(parentNode->GetSimdBaseType()) == simdBaseTypeSize))
+                        {
+                            LIR::Use parentUse;
+
+                            if (BlockRange().TryGetUse(parentNode, &parentUse))
+                            {
+                                parentUse.ReplaceWith(op1);
+                            }
+                            else
+                            {
+                                op1->SetUnusedValue();
+                            }
+
+                            BlockRange().Remove(parentNode);
+
+                            GenTree* nextNode = node->gtNext;
+                            BlockRange().Remove(node);
+
+                            return nextNode;
+                        }
+                    }
+                }
+                break;
+            }
+
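Both roundtrip folds above gate on genTypeSize equality because the same mask bits describe different vectors at different lane widths. A small model of why (sketch; lane widths and mask value are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Expand a k-register to the byte image of the vector it denotes,
    // one mask bit per lane of laneBytes bytes.
    static void Expand(uint8_t kreg, int laneBytes, uint8_t out[16])
    {
        for (int i = 0; i < 16; i++)
        {
            out[i] = ((kreg >> (i / laneBytes)) & 1) ? 0xFF : 0x00;
        }
    }

    int main()
    {
        uint8_t asDwords[16], asWords[16];
        Expand(0b0101, 4, asDwords); // 4 x 32-bit lanes: bytes 0-3 and 8-11 set
        Expand(0b0101, 2, asWords);  // 8 x 16-bit lanes: bytes 0-1 and 4-5 set
        // Same bits, different byte images: hence the genTypeSize check before
        // folding ConvertVectorToMask(ConvertMaskToVector(x)) down to x.
        for (int i = 0; i < 16; i++)
        {
            printf("%2d: %02X %02X\n", i, asDwords[i], asWords[i]);
        }
        return 0;
    }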
+            case NI_EVEX_MoveMask:
+            {
+                unsigned simdSize = node->GetSimdSize();
+
+                if (simdSize == 64)
+                {
+                    // Nothing to handle for TYP_SIMD64 as they require masks
+                    break;
+                }
+
+                GenTree* op1 = node->Op(1);
+
+                if (!op1->OperIsConvertVectorToMask())
+                {
+                    // We can only special case when op1 is ConvertVectorToMask
+                    break;
+                }
+
+                // We have Evex.MoveMask(ConvertVectorToMask(op1)) and
+                // so we'll rewrite it to Avx.MoveMask(op1) allowing us
+                // to avoid the additional conversion altogether
+
+                var_types simdBaseType = node->GetSimdBaseType();
+
+                if (varTypeIsShort(simdBaseType))
+                {
+                    // We don't want to special case short as the existing sequence is
+                    // better than the fallback we'd otherwise have to generate, which is:
+                    //   ctrl = Vector128.Create(1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1)
+                    //   op1  = Ssse3.Shuffle(op1, ctrl)
+                    //   Sse2.MoveMask(op1)
+                    // In the case of TYP_SIMD32 we need an additional Avx2.Permute4x64 as well
+                    break;
+                }
+
+                switch (simdBaseType)
+                {
+                    case TYP_BYTE:
+                    case TYP_UBYTE:
+                    {
+                        intrinsicId = (simdSize == 32) ? NI_AVX2_MoveMask : NI_SSE2_MoveMask;
+                        break;
+                    }
+
+                    case TYP_INT:
+                    case TYP_UINT:
+                    case TYP_FLOAT:
+                    {
+                        intrinsicId = (simdSize == 32) ? NI_AVX_MoveMask : NI_SSE_MoveMask;
+                        node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT);
+                        break;
+                    }
+
+                    case TYP_LONG:
+                    case TYP_ULONG:
+                    case TYP_DOUBLE:
+                    {
+                        intrinsicId = (simdSize == 32) ? NI_AVX_MoveMask : NI_SSE2_MoveMask;
+                        node->SetSimdBaseJitType(CORINFO_TYPE_DOUBLE);
+                        break;
+                    }
+
+                    default:
+                    {
+                        unreached();
+                    }
+                }
+
+                node->ResetHWIntrinsicId(intrinsicId, comp, op1->AsHWIntrinsic()->Op(1));
+                BlockRange().Remove(op1);
+
+                return LowerNode(node);
+            }
+
             case NI_EVEX_NotMask:
             {
                 // We want to recognize ~(op1 ^ op2) and transform it
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5a1bd13cddd09d..50f7c756cb31f3 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -3090,7 +3090,7 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/)
 
     if (sizeOfSIMDVector >= 32)
     {
-        assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncoding()));
+        assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncodingDebugOnly()));
         compiler->GetEmitter()->SetContains256bitOrMoreAVX(true);
     }
 }
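The INT/UINT-to-FLOAT and LONG/ULONG-to-DOUBLE retyping in the MoveMask rewrite works because MoveMask only samples the sign bit of each lane, so any same-width form is interchangeable. A scalar model of the 32-bit case (sketch, not JIT code):

    #include <cstdint>
    #include <cstdio>

    // MoveMask packs the sign bit of each element into an integer; for 32-bit
    // lanes the float form (movmskps) sees the same bits an integer form would.
    static int MoveMask32(const uint32_t* v, int count)
    {
        int mask = 0;
        for (int i = 0; i < count; i++)
        {
            mask |= (int)(v[i] >> 31) << i;
        }
        return mask;
    }

    int main()
    {
        uint32_t cmp[4] = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}; // typical compare result
        printf("0x%X\n", MoveMask32(cmp, 4));             // prints 0x5
        return 0;
    }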