Skip to content

Commit 5fdb133

Browse files
Enable AVX512 embedded masking for most other intrinsics (#101886)
* Remove HW_Flag_MultiIns in favor of using HW_Flag_SpecialCodeGen
* Add a new flag HW_Flag_InvalidNodeId
* Change HW_Flag_EmbMaskingIncompatible to be HW_Flag_EmbMaskingCompatible
* Mark various compare intrinsics with HW_Flag_NoEvexSemantics
* Marking various intrinsics as EmbBroadcastCompatible, EmbMaskingCompatible, or Commutative
* Applying formatting patch
* Ensure WithLower/WithUpper are not marked as InvalidNodeId
* Ensure that instOptions are being passed down all relevant hwintrinsic code paths
* Ensure the insOpts are plumbed through for EVEX instructions
* Ensure EVEX instructions are properly annotated with EmbeddedBroadcastSupported
* Ensure that embedded broadcast/masking is displayed in the disassembly
* Applying formatting patch
* Updating the hwintrinsic tests to cover embedded broadcast/masking
* Fix some handling in the JIT related to embedded broadcast/masking
* Fix up some tests where validating embedded masking is non-trivial
* Cleanup some cases found by SPMI
* Ensure that CompareLessThan has its operands swapped back if it's being converted to the AVX512 form
* Don't regress a scenario around op_Equality and TYP_MASK
* Adjusting hardware intrinsic tests to test non-zero masks
* Avoid some messiness around operand swapping
* Ensure embedded masks mark TYP_SIMD16 and TYP_SIMD32 instructions as needing EVEX
* Mark Sse2_r/Sse2_ro as AotIncompatible due to runtime/102037
1 parent 38467bb commit 5fdb133

37 files changed

+4723
-1943
lines changed

src/coreclr/jit/codegen.h

Lines changed: 26 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -959,38 +959,35 @@ class CodeGen final : public CodeGenInterface
959959
#ifdef FEATURE_HW_INTRINSICS
960960
void genHWIntrinsic(GenTreeHWIntrinsic* node);
961961
#if defined(TARGET_XARCH)
962-
void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node,
963-
instruction ins,
964-
emitAttr attr,
965-
regNumber reg,
966-
GenTree* rmOp,
967-
insOpts instOptions = INS_OPTS_NONE);
968-
void genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
962+
void genHWIntrinsic_R_RM(
963+
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber reg, GenTree* rmOp, insOpts instOptions);
964+
void genHWIntrinsic_R_RM_I(
965+
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);
969966
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions);
970-
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
971-
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr);
967+
void genHWIntrinsic_R_R_RM_I(
968+
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);
969+
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions);
972970
void genHWIntrinsic_R_R_R_RM(instruction ins,
973971
emitAttr attr,
974972
regNumber targetReg,
975973
regNumber op1Reg,
976974
regNumber op2Reg,
977975
GenTree* op3,
978-
insOpts instOptions = INS_OPTS_NONE);
979-
void genHWIntrinsic_R_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
976+
insOpts instOptions);
977+
void genHWIntrinsic_R_R_R_RM_I(
978+
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);
980979

981-
void genBaseIntrinsic(GenTreeHWIntrinsic* node);
982-
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node);
980+
void genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
981+
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
983982
void genSSEIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
984983
void genSSE2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
985-
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
986-
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
984+
void genSSE41Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
985+
void genSSE42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
987986
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
988-
void genAESIntrinsic(GenTreeHWIntrinsic* node);
989987
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
990988
void genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
991-
void genPermuteVar2x(GenTreeHWIntrinsic* node);
989+
void genPermuteVar2x(GenTreeHWIntrinsic* node, insOpts instOptions);
992990
void genLZCNTIntrinsic(GenTreeHWIntrinsic* node);
993-
void genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node);
994991
void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node);
995992
void genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins);
996993
void genX86SerializeIntrinsic(GenTreeHWIntrinsic* node);
@@ -1003,6 +1000,8 @@ class CodeGen final : public CodeGenInterface
10031000
HWIntrinsicSwitchCaseBody emitSwCase);
10041001

10051002
void genNonTableDrivenHWIntrinsicsJumpTableFallback(GenTreeHWIntrinsic* node, GenTree* lastOp);
1003+
1004+
static insOpts AddEmbBroadcastMode(insOpts instOptions);
10061005
#endif // defined(TARGET_XARCH)
10071006

10081007
#ifdef TARGET_ARM64
@@ -1576,16 +1575,22 @@ class CodeGen final : public CodeGenInterface
15761575
void inst_TT(instruction ins, emitAttr size, GenTree* op1);
15771576
void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2);
15781577
void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
1579-
void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
1578+
void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival, insOpts instOptions);
15801579
void inst_RV_RV_TT(instruction ins,
15811580
emitAttr size,
15821581
regNumber targetReg,
15831582
regNumber op1Reg,
15841583
GenTree* op2,
15851584
bool isRMW,
15861585
insOpts instOptions);
1587-
void inst_RV_RV_TT_IV(
1588-
instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW);
1586+
void inst_RV_RV_TT_IV(instruction ins,
1587+
emitAttr size,
1588+
regNumber targetReg,
1589+
regNumber op1Reg,
1590+
GenTree* op2,
1591+
int8_t ival,
1592+
bool isRMW,
1593+
insOpts instOptions);
15891594
#endif
15901595

15911596
void inst_set_SV_var(GenTree* tree);

src/coreclr/jit/codegencommon.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3856,7 +3856,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
38563856
}
38573857
#elif defined(TARGET_XARCH)
38583858
// XORPS is the fastest and smallest way to initialize a XMM register to zero.
3859-
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
3859+
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg, INS_OPTS_NONE);
38603860
dblInitReg = reg;
38613861
#elif defined(TARGET_ARM64)
38623862
// We will just zero out the entire vector register. This sets it to a double/float zero value
@@ -3896,7 +3896,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
38963896
}
38973897
#elif defined(TARGET_XARCH)
38983898
// XORPS is the fastest and smallest way to initialize a XMM register to zero.
3899-
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
3899+
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg, INS_OPTS_NONE);
39003900
fltInitReg = reg;
39013901
#elif defined(TARGET_ARM64)
39023902
// We will just zero out the entire vector register. This sets it to a double/float zero value

0 commit comments

Comments
 (0)