Skip to content

Commit ad52afd

Browse files
anthonycaninoBruceForstallkunalspathak
authored
Enable AVX512 Additional 16 SIMD Registers (#79544)
* Change regMask_enum and regMaskTP to unsigned __int64_t on AMD64. This allows for more registers to be encoded in the register allocator. * Add upper 16 SIMD registers to allocator. Commit includes refactoring code to use `const instrDesc *` instead of `instruction` so information about when EVEX is needed (due to high SIMD registers) is available to the emitter. * Limit high SIMD reg to compatible intrinsics lsra build. * Limit high SIMD reg to compatible intrinsics lsra build. * Limit high SIMD reg to compatible intrinsics and gentree nodes. Commit constrains certain hw intrinsics and gentree nodes to use lower SIMD registers even if upper SIMD registers are available due to limitations of EVEX encoding for certain instructions. For example, SSE `Reciprocal` lowers to `rcpps` which does not have an EVEX encoding form, hence, we cannot allow that hw intrinsic node to use a high SIMD register. These intrinsics are marked with `HW_Flag_NoEvexSemantics`. Other such instructions related to masking (typically marked with `HW_Flag_ReturnsPerElementMask`) also have similar issues (though they can be replaced with the EVEX k registers and associated masking when implemented). In addition, the callee/caller save registers have also been adjusted to properly handle the presence and absence of AVX512 upper simd registers at runtime. * Fix for X86 throughput. * Add upper simd stress test to the AVX512 testing pipeline. * Formatting. * Fix wrong-sized attr for simd mov instruction. * Fix non-AMD64 LSRA stress mask. 
* Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/compiler.cpp Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/gentree.cpp Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/hwintrinsic.h Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/target.h Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/emitxarch.cpp Co-authored-by: Bruce Forstall <[email protected]> * Remove unneeded vars * Address PR comments. * Allow `emitinl.h` access to the `rbm` variables. * Replace RBM_LOWSIMD with `BuildEvexIncompatibleMask`. * Move AVX512 dependent `targetamd.h` vars into compiler object. * Fixing some edge cases for `targetamd.h` variables. * Fix a merge/rebase bug. * Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/lsra.cpp Co-authored-by: Bruce Forstall <[email protected]> * Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall <[email protected]> * Fix nits. * Trying VM changes. * VM hack. * VM hack. * Revert "VM hack." This reverts commit 91cf3db. * Adjust ACTUAL_REG_COUNT based on availability of AVX512. * Use inline accessor functions instead of macros. Convert from macros to accessor functions for RBM_ALLFLOAT, RBM_FLT_CALLEE_TRASH, CNT_CALLEE_TRASH_FLOAT. Convert LSRA use of ACTUAL_REG_COUNT to AVAILABLE_REG_COUNT, and create an accessor for that value for AMD64 as well. * Clarifying comments. --------- Co-authored-by: Bruce Forstall <[email protected]> Co-authored-by: Kunal Pathak <[email protected]>
1 parent 1166bba commit ad52afd

26 files changed

+1098
-525
lines changed

eng/pipelines/common/templates/runtimes/run-test-job.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ jobs:
532532
${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}:
533533
scenarios:
534534
- jitstress_isas_avx512_forceevex
535+
- jitstress_isas_avx512_forceevex_stresshighregs
535536
${{ if in(parameters.testGroup, 'jitstressregs-x86') }}:
536537
scenarios:
537538
- jitstressregs1_x86_noavx

src/coreclr/jit/codegen.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,17 @@ class CodeGen final : public CodeGenInterface
3535
GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr);
3636

3737
private:
38+
#if defined(TARGET_AMD64)
39+
regMaskTP get_RBM_ALLFLOAT() const
40+
{
41+
return compiler->rbmAllFloat;
42+
}
43+
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
44+
{
45+
return compiler->rbmFltCalleeTrash;
46+
}
47+
#endif // TARGET_AMD64
48+
3849
#if defined(TARGET_XARCH)
3950
// Bit masks used in negating a float or double number.
4051
// This is to avoid creating more than one data constant for these bitmasks when a

src/coreclr/jit/codegenxarch.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3535,7 +3535,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
35353535
// this probably needs to be changed.
35363536

35373537
// Load
3538-
genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src, offset);
3538+
genCodeForLoadOffset(INS_movdqu, EA_16BYTE, xmmTmpReg, src, offset);
35393539
// Store
35403540
genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
35413541

@@ -8358,7 +8358,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset
83588358
{
83598359
ins = INS_movdqu;
83608360
// This should be changed!
8361-
attr = EA_8BYTE;
8361+
attr = EA_16BYTE;
83628362
size = 16;
83638363
}
83648364
else

src/coreclr/jit/compiler.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3329,6 +3329,24 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
33293329
opts.compJitSaveFpLrWithCalleeSavedRegisters = JitConfig.JitSaveFpLrWithCalleeSavedRegisters();
33303330
}
33313331
#endif // defined(DEBUG) && defined(TARGET_ARM64)
3332+
3333+
#if defined(TARGET_AMD64)
3334+
rbmAllFloat = RBM_ALLFLOAT_INIT;
3335+
rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT;
3336+
cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT;
3337+
availableRegCount = ACTUAL_REG_COUNT;
3338+
3339+
if (DoJitStressEvexEncoding())
3340+
{
3341+
rbmAllFloat |= RBM_HIGHFLOAT;
3342+
rbmFltCalleeTrash |= RBM_HIGHFLOAT;
3343+
cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT;
3344+
}
3345+
else
3346+
{
3347+
availableRegCount -= CNT_HIGHFLOAT;
3348+
}
3349+
#endif // TARGET_AMD64
33323350
}
33333351

33343352
#ifdef DEBUG
@@ -3532,6 +3550,37 @@ bool Compiler::compPromoteFewerStructs(unsigned lclNum)
35323550
return rejectThisPromo;
35333551
}
35343552

3553+
//------------------------------------------------------------------------
3554+
// dumpRegMask: display a register mask. For well-known sets of registers, display a well-known token instead of
3555+
// a potentially large number of registers.
3556+
//
3557+
// Arguments:
3558+
// regs - The set of registers to display
3559+
//
3560+
void Compiler::dumpRegMask(regMaskTP regs) const
3561+
{
3562+
if (regs == RBM_ALLINT)
3563+
{
3564+
printf("[allInt]");
3565+
}
3566+
else if (regs == (RBM_ALLINT & ~RBM_FPBASE))
3567+
{
3568+
printf("[allIntButFP]");
3569+
}
3570+
else if (regs == RBM_ALLFLOAT)
3571+
{
3572+
printf("[allFloat]");
3573+
}
3574+
else if (regs == RBM_ALLDOUBLE)
3575+
{
3576+
printf("[allDouble]");
3577+
}
3578+
else
3579+
{
3580+
dspRegMask(regs);
3581+
}
3582+
}
3583+
35353584
#endif // DEBUG
35363585

35373586
void Compiler::compInitDebuggingInfo()

src/coreclr/jit/compiler.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10453,6 +10453,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1045310453

1045410454
bool compJitHaltMethod();
1045510455

10456+
void dumpRegMask(regMaskTP regs) const;
10457+
1045610458
#endif
1045710459

1045810460
/*
@@ -10727,6 +10729,48 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1072710729
GenTree* fgMorphMultiregStructArg(CallArg* arg);
1072810730

1072910731
bool killGCRefs(GenTree* tree);
10732+
10733+
#if defined(TARGET_AMD64)
10734+
private:
10735+
// The following are for initializing register allocator "constants" defined in targetamd64.h
10736+
// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases
10737+
// the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32.
10738+
// As only 64-bit xarch has the capability to have the additional registers, we limit the changes
10739+
// to TARGET_AMD64 only.
10740+
//
10741+
// Users of these values need to define four accessor functions:
10742+
//
10743+
// regMaskTP get_RBM_ALLFLOAT();
10744+
// regMaskTP get_RBM_FLT_CALLEE_TRASH();
10745+
// unsigned get_CNT_CALLEE_TRASH_FLOAT();
10746+
// unsigned get_AVAILABLE_REG_COUNT();
10747+
//
10748+
// which return the values of these variables.
10749+
//
10750+
// This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only
10751+
// TARGET_AMD64 requires one.
10752+
//
10753+
regMaskTP rbmAllFloat;
10754+
regMaskTP rbmFltCalleeTrash;
10755+
unsigned cntCalleeTrashFloat;
10756+
unsigned availableRegCount;
10757+
10758+
public:
10759+
regMaskTP get_RBM_ALLFLOAT() const
10760+
{
10761+
return rbmAllFloat;
10762+
}
10763+
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
10764+
{
10765+
return rbmFltCalleeTrash;
10766+
}
10767+
unsigned get_CNT_CALLEE_TRASH_FLOAT() const
10768+
{
10769+
return cntCalleeTrashFloat;
10770+
}
10771+
10772+
#endif // TARGET_AMD64
10773+
1073010774
}; // end of class Compiler
1073110775

1073210776
//---------------------------------------------------------------------------------------------------------------------

src/coreclr/jit/emit.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,17 @@ void emitLocation::Print(LONG compMethodID) const
120120
}
121121
#endif // DEBUG
122122

123+
#if defined(TARGET_AMD64)
124+
inline regMaskTP emitter::get_RBM_FLT_CALLEE_TRASH() const
125+
{
126+
return emitComp->rbmFltCalleeTrash;
127+
}
128+
inline unsigned emitter::get_AVAILABLE_REG_COUNT() const
129+
{
130+
return emitComp->availableRegCount;
131+
}
132+
#endif // TARGET_AMD64
133+
123134
/*****************************************************************************
124135
*
125136
* Return the name of an instruction format.
@@ -3226,11 +3237,19 @@ void emitter::emitDispRegSet(regMaskTP regs)
32263237

32273238
for (reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg))
32283239
{
3229-
if ((regs & genRegMask(reg)) == 0)
3240+
if (regs == RBM_NONE)
3241+
{
3242+
break;
3243+
}
3244+
3245+
regMaskTP curReg = genRegMask(reg);
3246+
if ((regs & curReg) == 0)
32303247
{
32313248
continue;
32323249
}
32333250

3251+
regs -= curReg;
3252+
32343253
if (sp)
32353254
{
32363255
printf(" ");
@@ -3400,6 +3419,7 @@ emitter::instrDesc* emitter::emitNewInstrCallInd(int argCnt,
34003419
#endif // TARGET_XARCH
34013420

34023421
/* Save the live GC registers in the unused register fields */
3422+
assert((gcrefRegs & RBM_CALLEE_TRASH) == 0);
34033423
emitEncodeCallGCregs(gcrefRegs, id);
34043424

34053425
return id;
@@ -3472,6 +3492,7 @@ emitter::instrDesc* emitter::emitNewInstrCallDir(int argCnt,
34723492
assert(!id->idIsLargeCns());
34733493

34743494
/* Save the live GC registers in the unused register fields */
3495+
assert((gcrefRegs & RBM_CALLEE_TRASH) == 0);
34753496
emitEncodeCallGCregs(gcrefRegs, id);
34763497

34773498
return id;

src/coreclr/jit/emit.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,28 @@ class emitter
11381138
idAddr()->_idReg4 = reg;
11391139
assert(reg == idAddr()->_idReg4);
11401140
}
1141+
bool idHasReg3() const
1142+
{
1143+
switch (idInsFmt())
1144+
{
1145+
case IF_RWR_RRD_RRD:
1146+
case IF_RWR_RRD_RRD_CNS:
1147+
case IF_RWR_RRD_RRD_RRD:
1148+
return true;
1149+
default:
1150+
return false;
1151+
}
1152+
}
1153+
bool idHasReg4() const
1154+
{
1155+
switch (idInsFmt())
1156+
{
1157+
case IF_RWR_RRD_RRD_RRD:
1158+
return true;
1159+
default:
1160+
return false;
1161+
}
1162+
}
11411163
#endif // defined(TARGET_XARCH)
11421164
#ifdef TARGET_ARMARCH
11431165
insOpts idInsOpt() const
@@ -1968,6 +1990,11 @@ class emitter
19681990
CORINFO_FIELD_HANDLE emitBlkConst(const void* cnsAddr, unsigned cnsSize, unsigned cnsAlign, var_types elemType);
19691991

19701992
private:
1993+
#if defined(TARGET_AMD64)
1994+
regMaskTP get_RBM_FLT_CALLEE_TRASH() const;
1995+
unsigned get_AVAILABLE_REG_COUNT() const;
1996+
#endif // TARGET_AMD64
1997+
19711998
CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr);
19721999
CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
19732000
CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue);

src/coreclr/jit/emitinl.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,11 +211,8 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id)
211211
*
212212
* Convert between a register mask and a smaller version for storage.
213213
*/
214-
215214
/*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id)
216215
{
217-
assert((regmask & RBM_CALLEE_TRASH) == 0);
218-
219216
unsigned encodeMask;
220217

221218
#ifdef TARGET_X86

0 commit comments

Comments
 (0)