From 9da5b6d2ca3289c080f6af8f13eb73de6035af28 Mon Sep 17 00:00:00 2001
From: "Canino, Anthony"
Date: Thu, 27 Oct 2022 14:05:28 -0700
Subject: [PATCH 01/34] Change regMask_enum and regMaskTP to unsigned __int64
 on AMD64.

This allows more registers to be encoded in the register allocator.
---
 src/coreclr/jit/target.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h
index 392a5417141398..a2ac6b68075660 100644
--- a/src/coreclr/jit/target.h
+++ b/src/coreclr/jit/target.h
@@ -61,7 +61,11 @@ inline bool compUnixX86Abi()
 /*****************************************************************************/
 // The following are intended to capture only those #defines that cannot be replaced
 // with static const members of Target
-#if defined(TARGET_XARCH)
+#if defined(TARGET_AMD64)
+#define REGMASK_BITS 64
+#define CSE_CONST_SHARED_LOW_BITS 16
+
+#elif defined(TARGET_XARCH)
 #define REGMASK_BITS 32
 #define CSE_CONST_SHARED_LOW_BITS 16
 
@@ -135,7 +139,7 @@ enum _regMask_enum : unsigned __int64
 
 #elif defined(TARGET_AMD64)
 
-enum _regNumber_enum : unsigned
+enum _regNumber_enum : unsigned
 {
 #define REGDEF(name, rnum, mask, sname) REG_##name = rnum,
 #define REGALIAS(alias, realname) REG_##alias = REG_##realname,
@@ -146,7 +150,7 @@ enum _regNumber_enum : unsigned
 
     ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs)
 };
 
-enum _regMask_enum : unsigned
+enum _regMask_enum : unsigned __int64
 {
     RBM_NONE = 0,
 
@@ -192,7 +196,7 @@ enum _regMask_enum : unsigned
 // In any case, we believe that is OK to freely cast between these types; no information will
 // be lost.
 
-#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64)
+#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64)
 typedef unsigned __int64 regMaskTP;
 #else
 typedef unsigned regMaskTP;
@@ -528,7 +532,7 @@ inline regMaskTP genRegMask(regNumber reg)
     // (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] )
     // the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK
     // and the result needs to be zero.
-    regMaskTP result = 1 << reg;
+    regMaskTP result = 1ULL << reg;
     assert(result == regMasks[reg]);
     return result;
 #else

From 6c6c884ff0906fbad6ad0dce2a9a2da94c92e0fd Mon Sep 17 00:00:00 2001
From: "Canino, Anthony"
Date: Wed, 14 Dec 2022 13:00:29 -0800
Subject: [PATCH 02/34] Add upper 16 SIMD registers to allocator.

This commit also refactors code to use `const instrDesc *` instead of
`instruction`, so that information about when EVEX is needed (due to high
SIMD registers) is available to the emitter.
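For reviewers, the two invariants this series leans on can be illustrated
outside the JIT. The following is a minimal standalone C++ sketch, not the
JIT's actual code; kFirstHighSimd and kRegCount are made-up constants for
the example:

    #include <cassert>
    #include <cstdint>

    using regMaskTP = uint64_t;             // REGMASK_BITS == 64 on AMD64 after patch 01

    constexpr unsigned kFirstHighSimd = 32; // hypothetical index of XMM16
    constexpr unsigned kRegCount      = 48; // hypothetical: 16 GPRs + XMM0..XMM31

    // Build a single-register mask. The ULL suffix matters: with a plain
    // `1 << reg`, any reg index >= 32 shifts past the width of the promoted
    // 32-bit int, which is undefined behavior -- the same reason patch 01
    // switches genRegMask to `1ULL << reg`.
    inline regMaskTP regMask(unsigned reg)
    {
        assert(reg < kRegCount);
        return 1ULL << reg;
    }

    // XMM16..XMM31 can only be encoded with an EVEX prefix, which is why the
    // emitter must see all of an instruction's registers (hence instrDesc*
    // rather than a bare instruction) before choosing an encoding.
    inline bool isHighSimdReg(unsigned reg)
    {
        return (reg >= kFirstHighSimd) && (reg < kRegCount);
    }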
--- src/coreclr/jit/emit.h | 24 ++ src/coreclr/jit/emitxarch.cpp | 508 +++++++++++++++++++++------------- src/coreclr/jit/emitxarch.h | 64 +++-- src/coreclr/jit/lsra.cpp | 9 + src/coreclr/jit/lsra.h | 4 +- src/coreclr/jit/register.h | 21 +- src/coreclr/jit/targetamd64.h | 5 +- 7 files changed, 408 insertions(+), 227 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4d4b75ad351073..28d30508f334db 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1121,6 +1121,30 @@ class emitter idAddr()->_idReg4 = reg; assert(reg == idAddr()->_idReg4); } + bool idHasReg3() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD: + case IF_RWR_RRD_RRD_CNS: + case IF_RWR_RRD_RRD_RRD: + case IF_RWR_RRD_SRD_RRD: + case IF_RWR_RRD_ARD_RRD: + return true; + default: + return false; + } + } + bool idHasReg4() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } #endif // defined(TARGET_XARCH) #ifdef TARGET_ARMARCH insOpts idInsOpt() const diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2541ed4473a722..bf34bffdd4ed35 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -761,9 +761,11 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) const // Return Value: // true if this instruction requires a VEX or EVEX prefix. // -bool emitter::TakesSimdPrefix(instruction ins) const +bool emitter::TakesSimdPrefix(const instrDesc *id) const { - return TakesEvexPrefix(ins) || TakesVexPrefix(ins); + instruction ins = id->idIns(); + + return TakesEvexPrefix(id) || TakesVexPrefix(ins); } //------------------------------------------------------------------------ @@ -785,13 +787,20 @@ bool emitter::TakesSimdPrefix(instruction ins) const // Return Value: // true if this instruction requires a EVEX prefix. // -bool emitter::TakesEvexPrefix(instruction ins) const +bool emitter::TakesEvexPrefix(const instrDesc *id) const { if (!emitComp->DoJitStressEvexEncoding()) { return false; } + if (HasHighSIMDReg(id)) + { + return true; + } + + instruction ins = id->idIns(); + // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added. return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins); } @@ -1059,6 +1068,35 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) #endif //! TARGET_AMD64 } +// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit. +bool emitter::HasHighSIMDReg(const instrDesc *id) const +{ +#if defined(TARGET_AMD64) + if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2())) + return true; + + if (id->idIsSmallDsc()) + return false; + + if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || + (id->idHasReg4() && IsHighSIMDReg(id->idReg4()))) + return true; +#endif + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +} + +// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit. +bool emitter::IsHighSIMDReg(regNumber reg) const +{ +#ifdef TARGET_AMD64 + return ((reg >= REG_XMM16) && (reg <= REG_XMM31)); +#else + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +#endif +} + // Returns true if using this register will require a REX.* prefix. 
// Since XMM registers overlap with YMM registers, this routine // can also be used to know whether a YMM register if the @@ -1066,7 +1104,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) bool IsExtendedReg(regNumber reg) { #ifdef TARGET_AMD64 - return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15)); + return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM31)); #else // X86 JIT operates in 32-bit mode and hence extended reg are not available. return false; @@ -1078,7 +1116,7 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) { #ifdef TARGET_AMD64 // Not a register, so doesn't need a prefix - if (reg > REG_XMM15) + if (reg > REG_XMM31) { return false; } @@ -1119,12 +1157,19 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) bool IsXMMReg(regNumber reg) { #ifdef TARGET_AMD64 - return (reg >= REG_XMM0) && (reg <= REG_XMM15); + return (reg >= REG_XMM0) && (reg <= REG_XMM31); #else // !TARGET_AMD64 return (reg >= REG_XMM0) && (reg <= REG_XMM7); #endif // !TARGET_AMD64 } +// Returns bits to be encoded in instruction for the given register +unsigned HighAwareRegEncoding(regNumber reg) +{ + static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE"); + return (unsigned)(reg & 0xF); +} + // Returns bits to be encoded in instruction for the given register. unsigned RegEncoding(regNumber reg) { @@ -1135,11 +1180,13 @@ unsigned RegEncoding(regNumber reg) // Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes // SSE2: separate 1-byte prefix gets added before opcode. // AVX: specific bits within VEX prefix need to be set in bit-inverted form. -emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexWPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // W-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1169,11 +1216,13 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) #ifdef TARGET_AMD64 -emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexRPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // R-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1197,11 +1246,13 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) return code | 0x4400000000ULL; } -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexXPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { // X-bit is available in 4-byte EVEX prefix that starts with byte 62. 
assert(hasEvexPrefix(code)); @@ -1224,11 +1275,13 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) return code | 0x4200000000ULL; } -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexBPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // B-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1260,6 +1313,19 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) return code | 0x4000000000ULL; } +emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) +{ + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL); +} + +emitter::code_t emitter::AddEvexRPrimePrefix(code_t code) +{ + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL); +} + + #endif // TARGET_AMD64 bool isPrefix(BYTE b) @@ -1800,7 +1866,7 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also which needs to be VEX encoded unless // explicitly // asked for EVEX. - if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins)) + if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(id)) { // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always @@ -2574,10 +2640,12 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const * part of an opcode. */ -inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2586,7 +2654,14 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexBPrefix(ins, *code); // REX.B + if (IsHighSIMDReg(reg)) + { + *code = AddRexXPrefix(id, *code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexBPrefix(id, *code); // REX.B + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2608,10 +2683,12 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt * part of an opcode. */ -inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. 
     // If reg is an extended reg, instruction needs to be prefixed with 'REX'
@@ -2620,7 +2697,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt
 
     if (IsExtendedReg(reg))
     {
-        *code = AddRexRPrefix(ins, *code); // REX.R
+        if (IsHighSIMDReg(reg))
+        {
+            *code = AddEvexRPrimePrefix(*code); // EVEX.R'
+        }
+        if (reg & 0x8)
+        {
+            *code = AddRexRPrefix(id, *code); // REX.R
+        }
     }
     else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr))
     {
@@ -2641,8 +2725,10 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt
  *  Returns modified SIMD opcode with the specified register encoded in bits 3-6 of
  *  byte 2 of VEX and EVEX prefix.
  */
-inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code)
+inline emitter::code_t emitter::insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code)
 {
+    instruction ins = id->idIns();
+
     assert(reg < REG_STK);
     assert(IsVexOrEvexEncodedInstruction(ins));
     assert(hasVexOrEvexPrefix(code));
@@ -2660,10 +2746,20 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
     assert(regBits <= 0xF);
     if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
-        if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code))
+        if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code))
         {
-            assert(hasEvexPrefix(code) && TakesEvexPrefix(ins));
+            assert(hasEvexPrefix(code) && TakesEvexPrefix(id));
 
+            // TODO-XARCH-AVX512: I don't like that we redefine regBits in the EVEX case;
+            // would rather see these paths cleaned up.
+            regBits = HighAwareRegEncoding(reg);
+#if defined(TARGET_AMD64)
+            if (IsHighSIMDReg(reg))
+            {
+                // Have to set the EVEX V' bit
+                code = AddEvexVPrimePrefix(code);
+            }
+#endif
             // Shift count = 5-bytes of opcode + 0-2 bits for EVEX
             regBits <<= 43;
             return code ^ regBits;
@@ -2671,6 +2767,11 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
     }
     if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
+        // Both prefixes encode the register operand in 1's complement form.
+        assert(regBits <= 0xF);
+
         if (TakesVexPrefix(ins))
         {
             assert(hasVexPrefix(code));
@@ -2690,8 +2791,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
  *  Used exclusively to generate the REX.X bit and truncate the register.
  */
 
-inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code)
+inline unsigned emitter::insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code)
 {
+    instruction ins = id->idIns();
+
     assert(reg < REG_STK);
 
 #ifdef TARGET_AMD64
@@ -2702,7 +2805,14 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t*
 
     if (IsExtendedReg(reg))
     {
-        *code = AddRexXPrefix(ins, *code); // REX.X
+        if (IsHighSIMDReg(reg))
+        {
+            *code = AddEvexVPrimePrefix(*code); // EVEX.V'
+        }
+        if (reg & 0x8)
+        {
+            *code = AddRexXPrefix(id, *code); // REX.X
+        }
     }
     unsigned regBits = RegEncoding(reg);
 #else // !TARGET_AMD64
@@ -2718,7 +2828,7 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t*
 *  Returns the "[r/m]" opcode with the mod/RM field set to register.
 */
 
-inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code)
+inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, code_t code)
 {
     // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes.
     // Otherwise, it will be placed after the 4 byte encoding.
@@ -2736,7 +2846,7 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) * Returns the given "[r/m]" opcode with the mod/RM field set to register. */ -inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) +inline emitter::code_t emitter::insEncodeRMreg(const instrDesc *id, code_t code) { // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding. @@ -2754,11 +2864,11 @@ inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) * the given register. */ -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2769,11 +2879,11 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, e * the given register. */ -inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2794,13 +2904,13 @@ inline bool insNeedsRRIb(instruction ins) * Returns the "reg,reg,imm8" opcode with both the reg's set to the * the given register. */ -inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size) { assert(size == EA_4BYTE); // All we handle for now. - assert(insNeedsRRIb(ins)); + assert(insNeedsRRIb(id->idIns())); // If this list gets longer, use a switch, or a table lookup. code_t code = 0x69c0; - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // We use the same register as source and destination. (Could have another version that does both regs...) code |= regcode; code |= (regcode << 3); @@ -2813,10 +2923,10 @@ inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, em * nibble of the opcode */ -inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeOpreg(const instrDesc *id, regNumber reg, emitAttr size) { - code_t code = insCodeRR(ins); - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + code_t code = insCodeRR(id->idIns()); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; return code; } @@ -3090,7 +3200,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) } else { - sz += emitInsSize(id, insEncodeRMreg(ins, code), includeRexPrefixSize); + sz += emitInsSize(id, insEncodeRMreg(id, code), includeRexPrefixSize); } return sz; @@ -3219,7 +3329,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. 
- if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -3263,7 +3373,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -3416,7 +3526,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5026,7 +5136,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) /* We expect this to always be a 'big' opcode */ - assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); + assert(insEncodeMRreg(id, reg, attr, insCodeMR(ins)) & 0x00FF0000); size = attr; @@ -5046,7 +5156,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idReg1(reg); // Vex bytes - sz += emitGetAdjustedSize(id, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); + sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); // REX byte if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) @@ -8903,7 +9013,7 @@ void emitter::emitIns_Call(EmitCallType callType, { // Tailcall with addressing mode/register needs to be rex.w // prefixed to be recognized as part of epilog by unwinder. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } sz = emitInsSizeAM(id, code); @@ -11262,13 +11372,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { // tail call with addressing mode (or through register) needs rex.w // prefix to be recognized by unwinder as part of epilog. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case: call via a register if (id->idIsCallRegPtr()) { - code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code); + code = insEncodeMRreg(id, reg, EA_PTRSIZE, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); goto DONE; @@ -11282,14 +11392,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute the REX prefix if it exists if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. 
        rgx = (regNumber)RegEncoding(rgx);
    }
@@ -11335,7 +11445,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         {
             case IF_RWR_ARD:
 
-                assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8)));
+                assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8)));
 
                 code &= ~((code_t)0xFFFFFFFF);
                 code |= 0xA0;
@@ -11344,7 +11454,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
 
             case IF_AWR_RRD:
 
-                assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8)));
+                assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8)));
 
                 code &= ~((code_t)0xFFFFFFFF);
                 code |= 0xA2;
@@ -11361,10 +11471,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     // Emit SIMD prefix if required
     // There are some callers who already add SIMD prefix and call this routine.
     // Therefore, add SIMD prefix if one is not already present.
-    code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size);
+    code = AddSimdPrefixIfNeededAndNotPresent(id, code, size);
 
     // For this format, moves do not support a third operand, so we only need to handle the binary ops.
-    if (TakesSimdPrefix(ins))
+    if (TakesSimdPrefix(id))
     {
         if (IsDstDstSrcAVXInstruction(ins))
         {
@@ -11389,33 +11499,33 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             }
 
             // encode source operand reg in 'vvvv' bits in 1's complement form
-            code = insEncodeReg3456(ins, src1, size, code);
+            code = insEncodeReg3456(id, src1, size, code);
         }
         else if (IsDstSrcSrcAVXInstruction(ins))
         {
-            code = insEncodeReg3456(ins, id->idReg2(), size, code);
+            code = insEncodeReg3456(id, id->idReg2(), size, code);
         }
     }
 
     // Emit the REX prefix if required
     // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support.
-    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently
-    // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support.
-    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
+    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently
+    // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support.
+    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))
     {
-        code = AddRexWPrefix(ins, code);
+        code = AddRexWPrefix(id, code);
     }
 
     if (IsExtendedReg(reg, EA_PTRSIZE))
     {
-        insEncodeReg012(ins, reg, EA_PTRSIZE, &code);
+        insEncodeReg012(id, reg, EA_PTRSIZE, &code);
         // TODO-Cleanup: stop casting RegEncoding() back to a regNumber.
         reg = (regNumber)RegEncoding(reg);
     }
 
     if (IsExtendedReg(rgx, EA_PTRSIZE))
     {
-        insEncodeRegSIB(ins, rgx, &code);
+        insEncodeRegSIB(id, rgx, &code);
         // TODO-Cleanup: stop casting RegEncoding() back to a regNumber.
rgx = (regNumber)RegEncoding(rgx); } @@ -11459,7 +11569,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -11589,7 +11699,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -11820,7 +11930,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr); + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); // Is there a displacement? if (dspIsZero) @@ -11850,7 +11960,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) << 8; // Is there a displacement? if (dspIsZero) @@ -11896,8 +12006,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (reg != REG_NA) { // The address is "[reg + {2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -11963,8 +12073,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[{2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, REG_EBP, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -11993,7 +12103,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[reg+rgx+dsp]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12231,16 +12341,16 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Add VEX or EVEX prefix if required. // There are some callers who already add prefix and call this routine. // Therefore, add VEX or EVEX prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). // Not doing so currently since we cannot differentiate EVEX vs VEX without // 'code' until all paths have EVEX support. 
- if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case emitting AVX instructions @@ -12267,9 +12377,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -12400,7 +12510,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12454,7 +12564,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12680,12 +12790,12 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute VEX/EVEX prefix // Some of its callers already add EVEX/VEX prefix and then call this routine. // Therefore add EVEX/VEX prefix is not already present. 
- code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // `addc` is used for two kinds if instructions @@ -12720,7 +12830,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_MRD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -12729,7 +12839,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_MWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -12767,9 +12877,9 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13146,13 +13256,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13166,7 +13276,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } - dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); + dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(id, reg, size, nullptr)); } break; @@ -13176,9 +13286,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) case INS_push_hide: assert(size == EA_PTRSIZE); - code = insEncodeOpreg(ins, reg, size); + code = insEncodeOpreg(id, reg, size); - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); assert(!TakesRexWPrefix(ins, size)); // Output the REX prefix @@ -13198,13 +13308,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... 
- unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13233,7 +13343,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); - code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13259,7 +13369,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); - code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); if (size != EA_1BYTE) { @@ -13273,11 +13383,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } } - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output the REX prefix @@ -13364,36 +13474,36 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if ((ins == INS_movsx) || (ins == INS_movzx) || (insIsCMOV(ins))) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 assert((size < EA_4BYTE) || (insIsCMOV(ins))); if ((size == EA_8BYTE) || (ins == INS_movsx)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if (ins == INS_movsxd) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); #endif // TARGET_AMD64 } @@ -13403,8 +13513,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); if ((ins == INS_crc32) && (size > EA_1BYTE)) { code |= 0x0100; @@ -13417,15 +13527,15 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } else if (size == EA_8BYTE) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - 
code = insEncodeMRreg(ins, code); + code = insEncodeMRreg(id, code); if (ins != INS_test) { @@ -13455,7 +13565,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Don't need to zero out the high bits explicitly if ((ins != INS_xor) || (reg1 != reg2)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } else { @@ -13492,10 +13602,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code); - regCode |= insEncodeReg012(ins, regFor012Bits, size, &code); + unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. @@ -13507,12 +13617,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) if (IsDstDstSrcAVXInstruction(ins)) { // encode source/dest operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg1, size, code); + code = insEncodeReg3456(id, reg1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg2, size, code); + code = insEncodeReg3456(id, reg2, size, code); } } @@ -13754,21 +13864,21 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - code = insEncodeRMreg(ins, code); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - unsigned regCode = insEncodeReg345(ins, targetReg, size, &code); - regCode |= insEncodeReg012(ins, src2, size, &code); + unsigned regCode = insEncodeReg345(id, targetReg, size, &code); + regCode |= insEncodeReg012(id, src2, size, &code); // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); // Output the REX/VEX/EVEX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13862,17 +13972,17 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // Get the 'base' opcode. code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); assert(code & 0x00FF0000); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // The 'vvvv' bits encode the destination register, which for this case (RI) // is the same as the source. 
- code = insEncodeReg3456(ins, reg, size, code); + code = insEncodeReg3456(id, reg, size, code); } - unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8; + unsigned regcode = (insEncodeReg345(id, regOpcode, size, &code) | insEncodeReg012(id, reg, size, &code)) << 8; // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13900,15 +14010,15 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) assert(code < 0x100); code |= 0x08; // Set the 'w' bit - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -14000,13 +14110,13 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // r/m, immed form, but do have a dstReg,srcReg,imm8 form. if (valInByte && useSigned && insNeedsRRIb(ins)) { - code = insEncodeRRIb(ins, reg, size); + code = insEncodeRRIb(id, reg, size); } else { code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); } } @@ -14030,7 +14140,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) /* Set the 'w' bit to get the large version */ /* and the REX.W bit to get the really large version */ - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); code |= 0x1; break; #endif @@ -14239,9 +14349,9 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) } else { - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); } @@ -14549,7 +14659,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) idAmd->idCodeSize(sz); code = insCodeRM(ins); - code |= (insEncodeReg345(ins, id->idReg1(), EA_PTRSIZE, &code) << 8); + code |= (insEncodeReg345(id, id->idReg1(), EA_PTRSIZE, &code) << 8); dst = emitOutputAM(dst, idAmd, code, nullptr); @@ -14665,7 +14775,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id->idIns())); + assert(TakesEvexPrefix(id)); insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -14876,12 +14986,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #ifdef TARGET_AMD64 // Support only scalar AVX instructions and hence size is hard coded to 4-byte. 
- code = AddSimdPrefixIfNeeded(ins, code, EA_4BYTE); + code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); if (((ins == INS_cdq) || (ins == INS_cwde)) && - (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))) + (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); #endif @@ -15155,8 +15265,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_SHF: code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, id->idReg1(), size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg1(), size, code); // set the W bit if (size != EA_1BYTE) @@ -15165,9 +15275,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Emit the REX prefix if it exists - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output a size prefix for a 16-bit operand @@ -15223,8 +15333,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, code); mReg = id->idReg1(); rReg = id->idReg2(); } @@ -15233,7 +15343,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMI(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); assert((code & 0xC000) == 0); code |= 0xC000; @@ -15247,19 +15357,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); mReg = id->idReg2(); rReg = id->idReg1(); } assert(code & 0x00FF0000); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -15269,17 +15379,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // (Though we will need to handle the few ops that can have the 'vvvv' bits as destination, // e.g. pslldq, when/if we support those instructions with 2 registers.) // (see x64 manual Table 2-9. Instructions with a VEX.vvvv destination) - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // This is a "merge" move instruction. 
// Encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } - regcode = (insEncodeReg345(ins, rReg, size, &code) | insEncodeReg012(ins, mReg, size, &code)); + regcode = (insEncodeReg345(id, rReg, size, &code) | insEncodeReg012(id, mReg, size, &code)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -15394,8 +15504,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15422,8 +15532,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } sz = emitSizeOfInsDsc(id); @@ -15451,8 +15561,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } sz = emitSizeOfInsDsc(id); @@ -15463,8 +15573,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD: case IF_ARW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15472,7 +15582,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc(id); break; @@ -15561,7 +15671,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -15572,10 +15682,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15596,15 +15706,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } @@ -15617,8 +15727,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15629,7 +15739,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } break; @@ -15643,8 +15753,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15655,7 +15765,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15667,7 +15777,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD: case IF_SRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -15678,10 +15788,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); break; @@ -15715,7 +15825,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15726,10 +15836,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } @@ -15760,15 +15870,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } @@ -15782,8 +15892,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15793,7 +15903,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } sz = emitSizeOfInsDsc(id); @@ -15808,8 +15918,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15819,7 +15929,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } sz = 
emitSizeOfInsDsc(id); @@ -15828,7 +15938,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_MRD_OFF: code = insCode(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15839,10 +15949,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = insEncodeReg012(id->idIns(), id->idReg1(), size, &code); + regcode = insEncodeReg012(id, id->idReg1(), size, &code); dst = emitOutputCV(dst, id, code | 0x30 | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15851,7 +15961,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_MWR_RRD: case IF_MRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15862,10 +15972,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); sz = emitSizeOfInsDsc(id); break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index dd4eec46dadb92..476902a6858cf1 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -75,16 +75,16 @@ unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const; code_t emitExtractVexPrefix(instruction ins, code_t& code) const; code_t emitExtractEvexPrefix(instruction ins, code_t& code) const; -unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); -unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); -code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); -unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); +unsigned insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code); +unsigned insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code); +code_t insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code); +unsigned insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code); -code_t insEncodeMRreg(instruction ins, code_t code); -code_t insEncodeRMreg(instruction ins, code_t code); -code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); -code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); -code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); +code_t insEncodeMRreg(const instrDesc *id, code_t code); +code_t insEncodeRMreg(const instrDesc *id, code_t code); +code_t insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code); +code_t insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size); 
+code_t insEncodeOpreg(const instrDesc *id, regNumber reg, emitAttr size);

 unsigned insSSval(unsigned scale);

@@ -103,16 +103,19 @@ bool IsVexEncodedInstruction(instruction ins) const;
 bool IsEvexEncodedInstruction(instruction ins) const;
 bool IsVexOrEvexEncodedInstruction(instruction ins) const;

-code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code);
+code_t insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code);

-code_t AddRexWPrefix(instruction ins, code_t code);
-code_t AddRexRPrefix(instruction ins, code_t code);
-code_t AddRexXPrefix(instruction ins, code_t code);
-code_t AddRexBPrefix(instruction ins, code_t code);
+code_t AddRexWPrefix(const instrDesc *id, code_t code);
+code_t AddRexRPrefix(const instrDesc *id, code_t code);
+code_t AddRexXPrefix(const instrDesc *id, code_t code);
+code_t AddRexBPrefix(const instrDesc *id, code_t code);
 code_t AddRexPrefix(instruction ins, code_t code);

 bool EncodedBySSE38orSSE3A(instruction ins) const;
 bool Is4ByteSSEInstruction(instruction ins) const;
+code_t AddEvexVPrimePrefix(code_t code);
+code_t AddEvexRPrimePrefix(code_t code);
+
 static bool IsMovInstruction(instruction ins);
 bool HasSideEffect(instruction ins, emitAttr size);
 bool IsRedundantMov(
@@ -181,13 +184,15 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
 // Returns:
 //    `true` if W bit needs to be set to 1.
 //
-bool IsWEvexOpcodeExtension(instruction ins)
+bool IsWEvexOpcodeExtension(const instrDesc *id)
 {
-    if (!TakesEvexPrefix(ins))
+    if (!TakesEvexPrefix(id))
     {
         return false;
     }

+    instruction ins = id->idIns();
+
     switch (ins)
     {
         case INS_movq:
@@ -486,7 +491,7 @@ bool UseSimdEncoding() const
 #define EVEX_PREFIX_MASK 0xFF00000000000000ULL
 #define EVEX_PREFIX_CODE 0x6200000000000000ULL

-bool TakesEvexPrefix(instruction ins) const;
+bool TakesEvexPrefix(const instrDesc *id) const;

 //------------------------------------------------------------------------
 // hasEvexPrefix: Returns true if the instruction encoding already
@@ -514,9 +519,13 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr);
 //
 // Returns:
 //    code with prefix added.
-code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size)
+// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need
+// to pass emitAttr size)
+code_t AddSimdPrefixIfNeeded(const instrDesc *id, code_t code, emitAttr size)
 {
-    if (TakesEvexPrefix(ins))
+    instruction ins = id->idIns();
+
+    if (TakesEvexPrefix(id))
     {
         code = AddEvexPrefix(ins, code, size);
     }
@@ -537,11 +546,14 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size)
 //    size - operand size
 //
 // Returns:
-//    `true` if code has an Evex prefix.
-//
-code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size)
+//    code with prefix added.
+// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need
+// to pass emitAttr size)
+code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc *id, code_t code, emitAttr size)
 {
-    if (TakesEvexPrefix(ins))
+    instruction ins = id->idIns();
+
+    if (TakesEvexPrefix(id))
     {
         code = !hasEvexPrefix(code) ? AddEvexPrefix(ins, code, size) : code;
     }
@@ -552,7 +564,7 @@ code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
     return code;
 }

-bool TakesSimdPrefix(instruction ins) const;
+bool TakesSimdPrefix(const instrDesc *id) const;

 //------------------------------------------------------------------------
 // hasVexOrEvexPrefix: Returns true if the instruction encoding already
@@ -1024,4 +1036,8 @@ inline bool HasEmbeddedBroadcast(instrDesc* id)
     return false;
 }
+
+bool HasHighSIMDReg(const instrDesc *id) const;
+bool IsHighSIMDReg(regNumber reg) const;
+
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp
index 659d630f8d326b..f32c54da1b4431 100644
--- a/src/coreclr/jit/lsra.cpp
+++ b/src/coreclr/jit/lsra.cpp
@@ -460,6 +460,15 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask)
             }
             break;

+#if defined(TARGET_AMD64)
+        case LSRA_LIMIT_UPPER_SIMD_SET:
+            if ((mask & LsraLimitUpperSimdSet) != RBM_NONE)
+            {
+                mask = getConstrainedRegMask(mask, LsraLimitUpperSimdSet, minRegCount);
+            }
+            break;
+#endif
+
         default:
             unreached();
     }
diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index cfbd74487f4947..ddf4f448db24ed 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -737,7 +737,7 @@ class LinearScan : public LinearScanInterface
     // This controls the registers available for allocation
     enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2,
-                             LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_MASK = 0x3};
+                             LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_UPPER_SIMD_SET = 0x4, LSRA_LIMIT_MASK = 0x7};

     // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save
     // registers, so as to get different coverage than limiting to callee or caller.
@@ -757,6 +757,9 @@ class LinearScan : public LinearScanInterface
         (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI);
 #endif // !UNIX_AMD64_ABI
     static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7);
+    static const regMaskTP LsraLimitUpperSimdSet =
+        (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 |
+         RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31);
 #elif defined(TARGET_ARM)
     // On ARM, we may need two registers to set up the target register for a virtual call, so we need
     // to have at least the maximum number of arg registers, plus 2.
diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 6f63bc51211d63..239e80c856528f 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -94,7 +94,26 @@ REGDEF(XMM12, 12+XMMBASE, XMMMASK(12), "mm12" ) REGDEF(XMM13, 13+XMMBASE, XMMMASK(13), "mm13" ) REGDEF(XMM14, 14+XMMBASE, XMMMASK(14), "mm14" ) REGDEF(XMM15, 15+XMMBASE, XMMMASK(15), "mm15" ) -REGDEF(STK, 16+XMMBASE, 0x0000, "STK" ) + +REGDEF(XMM16, 16+XMMBASE, XMMMASK(16), "mm16" ) +REGDEF(XMM17, 17+XMMBASE, XMMMASK(17), "mm17" ) +REGDEF(XMM18, 18+XMMBASE, XMMMASK(18), "mm18" ) +REGDEF(XMM19, 19+XMMBASE, XMMMASK(19), "mm19" ) +REGDEF(XMM20, 20+XMMBASE, XMMMASK(20), "mm20" ) +REGDEF(XMM21, 21+XMMBASE, XMMMASK(21), "mm21" ) +REGDEF(XMM22, 22+XMMBASE, XMMMASK(22), "mm22" ) +REGDEF(XMM23, 23+XMMBASE, XMMMASK(23), "mm23" ) + +REGDEF(XMM24, 24+XMMBASE, XMMMASK(24), "mm24" ) +REGDEF(XMM25, 25+XMMBASE, XMMMASK(25), "mm25" ) +REGDEF(XMM26, 26+XMMBASE, XMMMASK(26), "mm26" ) +REGDEF(XMM27, 27+XMMBASE, XMMMASK(27), "mm27" ) +REGDEF(XMM28, 28+XMMBASE, XMMMASK(28), "mm28" ) +REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) +REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) +REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) + +REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) #endif // !TARGET_X86 #elif defined(TARGET_ARM) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 4ec128a6345d21..3c1c2b83960cd6 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -78,10 +78,11 @@ #endif // !UNIX_AMD64_ABI #define CSE_CONSTS 1 // Enable if we want to CSE constants - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15) + #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 | RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 - #define REG_FP_LAST REG_XMM15 + #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 #ifdef UNIX_AMD64_ABI From bd6d2a5be2c8a15652bf3b19ab593f5dff81c65b Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 17 Jan 2023 10:19:29 -0800 Subject: [PATCH 03/34] Limit high SIMD reg to compatible intrinsics lsra build. 
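Reviewer note (illustrative only, not part of the change): XMM16-XMM31 cannot be
named by the legacy or VEX encodings, which carry only four register-id bits per
operand; the extra EVEX payload bits (EVEX.R', EVEX.V', EVEX.X) supply the fifth
bit. The standalone C++ sketch below mirrors the shape of the IsHighSIMDReg /
HasHighSIMDReg checks this series threads through the emitter; the enum values
are simplified stand-ins, not the JIT's actual regNumber definitions.

    // Simplified stand-ins for the JIT's regNumber values (assumes the
    // contiguous XMM0..XMM31 numbering that register.h sets up in this series).
    #include <cassert>

    enum RegNumber
    {
        REG_XMM0  = 0,
        REG_XMM15 = 15,
        REG_XMM16 = 16,
        REG_XMM31 = 31,
    };

    // An operand in XMM16..XMM31 needs a fifth register-id bit, which only the
    // EVEX prefix can encode, so its presence forces the EVEX encoding path.
    static bool IsHighSimdReg(RegNumber reg)
    {
        return (reg >= REG_XMM16) && (reg <= REG_XMM31);
    }

    int main()
    {
        assert(!IsHighSimdReg(REG_XMM15)); // reachable with REX/VEX bits
        assert(IsHighSimdReg(REG_XMM16));  // EVEX-only
        assert(IsHighSimdReg(REG_XMM31));  // EVEX-only
        return 0;
    }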
---
 src/coreclr/jit/emit.h | 2 -
 src/coreclr/jit/emitxarch.cpp | 45 ++--
 src/coreclr/jit/gentree.cpp | 16 ++
 src/coreclr/jit/gentree.h | 6 +
 src/coreclr/jit/hwintrinsic.h | 20 +-
 src/coreclr/jit/hwintrinsiclistxarch.h | 304 +++++++++++++------------
 src/coreclr/jit/instrsxarch.h | 20 +-
 src/coreclr/jit/lsra.cpp | 20 ++
 src/coreclr/jit/lsra.h | 1 +
 src/coreclr/jit/lsrabuild.cpp | 15 ++
 src/coreclr/jit/lsraxarch.cpp | 119 ++++++++--
 src/coreclr/jit/register.h | 1 +
 src/coreclr/jit/target.h | 3 +
 src/coreclr/jit/targetamd64.h | 2 +
 14 files changed, 371 insertions(+), 203 deletions(-)

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 28d30508f334db..e99183629c7a52 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1128,8 +1128,6 @@ class emitter
             case IF_RWR_RRD_RRD:
             case IF_RWR_RRD_RRD_CNS:
             case IF_RWR_RRD_RRD_RRD:
-            case IF_RWR_RRD_SRD_RRD:
-            case IF_RWR_RRD_ARD_RRD:
                 return true;
             default:
                 return false;
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index bf34bffdd4ed35..c572073ca3f2aa 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -217,10 +217,10 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const
         case INS_phminposuw:
         case INS_mpsadbw:
         case INS_pclmulqdq:
-        case INS_aesdec:
-        case INS_aesdeclast:
         case INS_aesenc:
         case INS_aesenclast:
+        case INS_aesdec:
+        case INS_aesdeclast:
         case INS_aesimc:
         case INS_aeskeygenassist:
         case INS_vzeroupper:
@@ -260,18 +260,24 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const
         case INS_prefetcht2:
         case INS_sfence:
         // Might need new INS_*suffix* instructions for these.
-        case INS_por:            // INS_pord, INS_porq.
-        case INS_pxor:           // INS_pxord, INS_pxorq
-        case INS_movdqa:         // INS_movdqa32, INS_movdqa64.
-        case INS_movdqu:         // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
-        case INS_pand:           // INS_pandd, INS_pandq.
-        case INS_pandn:          // INS_pandnd, INS_pandnq.
-        case INS_vextractf128:   // INS_vextractf32x4, INS_vextractf64x2.
-        case INS_vextracti128:   // INS_vextracti32x4, INS_vextracti64x2.
-        case INS_vinsertf128:    // INS_vinsertf32x4, INS_vinsertf64x2.
-        case INS_vinserti128:    // INS_vinserti32x4, INS_vinserti64x2.
         case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2.
         case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2.
+
+        // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, movdqu16, etc.)
+        // For implementation speed, the existing instruction defaults to the 32-bit operand type for now,
+        // i.e., movdqu => movdqu32, etc.
+        // Since we are not using k registers yet, this has no impact on correctness, but it will matter once
+        // k registers are used (breaking out the operand types of these instructions is the point).
+        //case INS_movdqa: // INS_movdqa32, INS_movdqa64.
+        //case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
+        //case INS_pand: // INS_pandd, INS_pandq.
+        //case INS_pandn: // INS_pandnd, INS_pandnq.
+        //case INS_por: // INS_pord, INS_porq.
+        //case INS_pxor: // INS_pxord, INS_pxorq
+        //case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2.
+        //case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2.
+        //case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2.
+        //case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2.
        {
            return false;
        }
@@ -794,13 +800,16 @@ bool emitter::TakesEvexPrefix(const instrDesc *id) const
         return false;
     }

+    instruction ins = id->idIns();
+
     if (HasHighSIMDReg(id))
     {
+        assert(IsEvexEncodedInstruction(ins));
+        // TODO-XARCH-AVX512 remove this check once k registers have been implemented
+        assert(!HasKMaskRegisterDest(ins));
         return true;
     }

-    instruction ins = id->idIns();
-
     // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added.
     return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins);
 }
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index dd1dc0e70d371e..764e1c5e23e9a0 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19034,6 +19034,22 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
 #endif
 }

+bool GenTree::isEvexCompatibleHWIntrinsic(Compiler* comp)
+{
+    assert(gtOper == GT_HWINTRINSIC);
+    assert(comp != nullptr);
+
+// TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly
+// implemented in the register allocator
+#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
+    return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) &&
+           !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId());
+#else
+    return false;
+#endif
+}
+
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type,
                                                        NamedIntrinsic hwIntrinsicID,
                                                        CorInfoType simdBaseJitType,
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 790d14f2841779..f6c576bf84591a 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1508,6 +1508,7 @@ struct GenTree
     bool isCommutativeHWIntrinsic() const;
     bool isContainableHWIntrinsic() const;
     bool isRMWHWIntrinsic(Compiler* comp);
+    bool isEvexCompatibleHWIntrinsic(Compiler* comp);
 #else
     bool isCommutativeHWIntrinsic() const
     {
@@ -1523,6 +1524,11 @@ struct GenTree
     {
         return false;
     }
+
+    bool isEvexCompatibleHWIntrinsic(Compiler* comp)
+    {
+        return false;
+    }
 #endif // FEATURE_HW_INTRINSICS

     static bool OperIsCommutative(genTreeOps gtOper)
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index b1299df1c1f1cf..fc3f6e9425e667 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -158,6 +158,9 @@ enum HWIntrinsicFlag : unsigned int
     // contained
     HW_Flag_MaybeCommutative = 0x80000,

+    // The intrinsic does not have an EVEX compatible form
+    HW_Flag_NoEvexSemantics = 0x100000
+
 #elif defined(TARGET_ARM64)
     // The intrinsic has an immediate operand
     // - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant
@@ -172,7 +175,10 @@ enum HWIntrinsicFlag : unsigned int
     HW_Flag_SIMDScalar = 0x1000,

     // The intrinsic supports some sort of containment analysis
-    HW_Flag_SupportsContainment = 0x2000
+    HW_Flag_SupportsContainment = 0x2000,
+
+    // The intrinsic does not have an EVEX compatible form
+    HW_Flag_NoEvexSemantics = 0x4000

 #else
 #error Unsupported platform
@@ -761,6 +767,18 @@ struct HWIntrinsicInfo
 #endif
     }

+    static bool HasEvexSemantics(NamedIntrinsic id)
+    {
+        HWIntrinsicFlag flags = lookupFlags(id);
+#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
+        return (flags & HW_Flag_NoEvexSemantics) == 0;
+#else
+#error Unsupported platform
+#endif
+    }
+
     static bool HasSpecialImport(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 8d5c2d16a35cbd..55a92725760718 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -68,7 +68,8 @@ HARDWARE_INTRINSIC(Vector128, EqualsAll,
 HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper,
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -171,7 +172,8 @@ HARDWARE_INTRINSIC(Vector256, EqualsAll, HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) @@ -256,42 +258,42 @@ HARDWARE_INTRINSIC(SSE, Add, HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, 
INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, 
CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -308,7 +310,7 @@ HARDWARE_INTRINSIC(SSE, Min, HARDWARE_INTRINSIC(SSE, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, MoveScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -317,10 +319,10 @@ HARDWARE_INTRINSIC(SSE, Prefetch0, HARDWARE_INTRINSIC(SSE, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, PrefetchNonTemporal, 0, 1, {INS_invalid, INS_prefetchnta, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, Shuffle, 16, 3, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -357,42 +359,42 @@ HARDWARE_INTRINSIC(SSE2, AddScalar, HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, 
HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) 
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
@@ -420,7 +422,7 @@ HARDWARE_INTRINSIC(SSE2, MemoryFence,
 HARDWARE_INTRINSIC(SSE2, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(SSE2, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_invalid, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -473,11 +475,11 @@ HARDWARE_INTRINSIC(SSE2_X64, StoreNonTemporal,
 //                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // SSE3 Intrinsics
-HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -489,54 +491,54 @@ HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate,
 // SSSE3 Intrinsics
 HARDWARE_INTRINSIC(SSSE3, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(SSSE3, AlignRight, 16, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSSE3, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  ISA          Function name                              SIMD size  NumArg  Instructions                                                                                                            Category  Flags
 //                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE41, LoadAlignedVector128NonTemporal, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE41, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  ISA          Function name                              SIMD size  NumArg  Instructions                                                                                                            Category  Flags
@@ -568,15 +570,15 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // AVX Intrinsics
 HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Compare, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
@@ -590,7 +592,7 @@ HARDWARE_INTRINSIC(AVX, CompareNotLessThan,
 HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -599,43 +601,43 @@ HARDWARE_INTRINSIC(AVX, ConvertToVector256Double,
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateEvenIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateOddIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ExtractVector128, 32, 2, {INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, InsertVector128, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, LoadAlignedVector256, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, LoadVector256, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, Max, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(AVX, Min, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
-HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
-HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Or, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Permute, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, PermuteVar, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpsvar, INS_vpermilpdvar}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Shuffle, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Store, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, StoreAligned, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, StoreAlignedNonTemporal, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -648,47 +650,47 @@ HARDWARE_INTRINSIC(AVX, Xor,
 HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, ExtractVector128, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int32, 32, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int64, 32, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, InsertVector128, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, LoadAlignedVector256NonTemporal, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Max, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, Min, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, MultiplyAddAdjacent, 32, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, MultiplyHigh, 32, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, MultiplyHighRoundScale, 32, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, MultiplyLow, 32, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Permute4x64, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, PermuteVar8x32, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX2, PackSignedSaturate, 32, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -704,7 +706,7 @@ HARDWARE_INTRINSIC(AVX2, ShiftRightLogicalVariable,
 HARDWARE_INTRINSIC(AVX2, Shuffle, 32, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_MaybeIMM)
 HARDWARE_INTRINSIC(AVX2, ShuffleHigh, 32, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, ShuffleLow, 32, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, Subtract, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, SubtractSaturate, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -723,56 +725,56 @@ HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAddSaturate,
 //                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // AES Intrinsics
-HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec,
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -809,7 +811,7 @@ HARDWARE_INTRINSIC(LZCNT_X64, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // PCLMULQDQ Intrinsics -HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -842,8 +844,8 @@ HARDWARE_INTRINSIC(SSE, COMISS, HARDWARE_INTRINSIC(SSE, UCOMISS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a6968c123c7381..13ed02d75c6ead 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -202,8 +202,8 @@ INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit) INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, 
BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_NONE, INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqu32 +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqa32 INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) @@ -341,10 +341,10 @@ INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result // TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pand32 +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pandn32 +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is por32 +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation @@ -493,10 +493,10 @@ INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values -INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4 +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is extracti32x4 +INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is insertf32x4 +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is inserti32x4 INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index f32c54da1b4431..9ab2405c3815d2 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -256,6 +256,15 @@ regMaskTP LinearScan::allSIMDRegs() return availableFloatRegs; } +regMaskTP LinearScan::lowSIMDRegs() +{ +#if defined(TARGET_AMD64) + return (availableFloatRegs & RBM_LOWFLOAT); +#else + return availableFloatRegs; +#endif +} + void LinearScan::updateNextFixedRef(RegRecord* regRecord, RefPosition* nextRefPosition) { LsraLocation nextLocation; @@ -680,6 +689,17 @@ LinearScan::LinearScan(Compiler* theCompiler) } #endif // TARGET_AMD64 || TARGET_ARM64 +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we 
independently + // allow EVEX use from the stress flag (currently, if EVEX stress is turned off, + // we cannot use EVEX at all) + if (!compiler->DoJitStressEvexEncoding()) + { + availableFloatRegs &= ~RBM_HIGHFLOAT; + availableDoubleRegs &= ~RBM_HIGHFLOAT; + } +#endif + for (unsigned int i = 0; i < TYP_COUNT; i++) { var_types thisType = (var_types)genActualTypes[i]; diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index ddf4f448db24ed..90a5bb900d2ce9 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1064,6 +1064,7 @@ class LinearScan : public LinearScanInterface regMaskTP allRegs(RegisterType rt); regMaskTP allByteRegs(); regMaskTP allSIMDRegs(); + regMaskTP lowSIMDRegs(); regMaskTP internalFloatRegCandidates(); void makeRegisterInactive(RegRecord* physRegRecord); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3908f1998792a9..b170e9baf9e8ad 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -557,6 +557,21 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval, { if (theInterval != nullptr) { +#if defined(TARGET_AMD64) + if (mask == RBM_LOWSIMD) + { + // Constrain if we have to for float/simd types + if (varTypeIsFloating(theInterval->registerType) || varTypeIsSIMD(theInterval->registerType)) + { + mask = lowSIMDRegs(); + } + else + { + mask = RBM_NONE; + } + + } +#endif if (mask == RBM_NONE) { mask = allRegs(theInterval->registerType); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index c0fd6030c28804..2943452e169b14 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -156,7 +156,12 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = 0; assert(dstCount == 1); assert(!tree->IsReuseRegVal()); - RefPosition* def = BuildDef(tree); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + RefPosition* def = BuildDef(tree, opRegMask); def->getInterval()->isConstant = true; } break; @@ -1885,21 +1890,30 @@ int LinearScan::BuildIntrinsic(GenTree* tree) break; } assert(tree->gtGetOp2IfPresent() == nullptr); + + // TODO-XARCH-AVX512 this is overly constraining the registers available, as NI_System_Math_Abs + // can be lowered to an EVEX-compatible instruction (the rest cannot) +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + int srcCount; if (op1->isContained()) { - srcCount = BuildOperandUses(op1); + srcCount = BuildOperandUses(op1, opRegMask); } else { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, opRegMask); srcCount = 1; } if (internalFloatDef != nullptr) { buildInternalRegisterUses(); } - BuildDef(tree); + BuildDef(tree, opRegMask); return srcCount; } @@ -2006,6 +2020,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(compiler); // Create internal temps, and handle any other special requirements. 
// Note that the default case for building uses will handle the RMW flag, but if the uses @@ -2089,9 +2104,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1); - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildOperandUses(op2, opRegMask); srcCount += BuildOperandUses(op3, RBM_EDI); buildUses = false; @@ -2106,11 +2126,17 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, opRegMask); srcCount += 1; - srcCount += op2->isContained() ? BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); + srcCount += op2->isContained() ? BuildOperandUses(op2, opRegMask) : BuildDelayFreeUses(op2, op1, opRegMask); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2304,15 +2330,22 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + + // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); // op3 should always be contained assert(op3->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2327,17 +2360,24 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op4 = intrinsicTree->Op(4); GenTree* op5 = intrinsicTree->Op(5); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + + // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); + srcCount += BuildDelayFreeUses(op3, nullptr, opRegMask); + srcCount += BuildDelayFreeUses(op4, nullptr, opRegMask); // op5 should always be contained assert(op5->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2355,25 +2395,40 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert((numArgs > 0) && (numArgs < 4)); + regMaskTP op1RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op1RegCandidates = RBM_LOWSIMD; + } +#endif + if (intrinsicTree->OperIsMemoryLoadOrStore()) 
{ - srcCount += BuildAddrUses(op1); + srcCount += BuildAddrUses(op1, op1RegCandidates); } else if (isRMW && !op1->isContained()) { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, op1RegCandidates); srcCount += 1; } else { - srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op1, op1RegCandidates); } if (op2 != nullptr) { + regMaskTP op2RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op2RegCandidates = RBM_LOWSIMD; + } +#endif if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) { - srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1)); + srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates); } else if (isRMW) { @@ -2382,7 +2437,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained and we are commutative, we can set op2 // to also be a tgtPrefUse. Codegen will then swap the operands. - tgtPrefUse2 = BuildUse(op2); + tgtPrefUse2 = BuildUse(op2, op2RegCandidates); srcCount += 1; } else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) @@ -2390,7 +2445,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained or if we are producing a scalar value // we need to mark it as delay free because the operand and target // exist in the same register set. - srcCount += BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates); } else { @@ -2398,17 +2453,24 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // have no concerns of overwriting op2 because they exist in different // register sets. - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } } else { - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } if (op3 != nullptr) { - srcCount += isRMW ? BuildDelayFreeUses(op3, op1) : BuildOperandUses(op3); + regMaskTP op3RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op3RegCandidates = RBM_LOWSIMD; + } +#endif + srcCount += isRMW ? 
BuildDelayFreeUses(op3, op1, op3RegCandidates) : BuildOperandUses(op3, op3RegCandidates); } } } @@ -2418,6 +2480,13 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { +#if defined(TARGET_AMD64) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic(compiler)) + { + dstCandidates = RBM_LOWSIMD; + } +#endif + BuildDef(intrinsicTree, dstCandidates); } else diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 239e80c856528f..ca90673e85adfe 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -114,6 +114,7 @@ REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) + #endif // !TARGET_X86 #elif defined(TARGET_ARM) diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index a2ac6b68075660..b9b2f82a85a781 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -153,10 +153,13 @@ enum _regNumber_enum : unsigned enum _regMask_enum : unsigned __int64 { RBM_NONE = 0, + RBM_LOWSIMD = 1LL << 63, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, #include "register.h" + + }; #elif defined(TARGET_X86) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 3c1c2b83960cd6..a17432f9d8e3f5 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -79,6 +79,8 @@ #define CSE_CONSTS 1 // Enable if we want to CSE constants #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 | RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) + #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 From 47ff9ca94a31055f97b79a5a02678c2d12130a31 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 14 Nov 2022 13:49:05 -0800 Subject: [PATCH 04/34] Limit high SIMD reg to compatible intrinsics lsra build. --- src/coreclr/jit/emitxarch.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index c572073ca3f2aa..1966848a9e5d0d 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -4022,14 +4022,10 @@ void emitter::emitIns(instruction ins, emitAttr attr) insFormat fmt = IF_NONE; -<<<<<<< HEAD id->idIns(ins); id->idInsFmt(fmt); sz += emitGetAdjustedSize(id, code); -======= - sz += emitGetAdjustedSizeEvexAware(id, attr, code); ->>>>>>> 12a8cc387e0 (Limit high SIMD reg to compatible intrinsics lsra build.) 
if (TakesRexWPrefix(ins, attr)) { sz += emitGetRexPrefixSize(ins); @@ -5313,7 +5309,6 @@ void emitter::emitIns_R_I(instruction ins, id->idInsFmt(fmt); id->idReg1(reg); -<<<<<<< HEAD #ifdef DEBUG id->idDebugOnlyInfo()->idFlags = gtFlags; id->idDebugOnlyInfo()->idMemCookie = targetHandle; @@ -5334,9 +5329,6 @@ void emitter::emitIns_R_I(instruction ins, } sz += emitGetAdjustedSize(id, insCodeMI(ins)); -======= - sz += emitGetAdjustedSizeEvexAware(id, attr, insCodeMI(ins)); ->>>>>>> 12a8cc387e0 (Limit high SIMD reg to compatible intrinsics lsra build.) // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target From 9cabef6203523be0836c86814c549391316b636b Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 14 Dec 2022 13:02:59 -0800 Subject: [PATCH 05/34] Limit high SIMD reg to compatible intrinsics and gentree nodes. Commit constrains certain hw intrinsics and gentree nodes to use lower SIMD registers even if upper SIMD registers are available, due to limitations of EVEX encoding for certain instructions. For example, SSE `Reciprocal` lowers to `rcpps`, which does not have an EVEX encoding form; hence, we cannot allow that hw intrinsic node to use a high SIMD register. These intrinsics are marked with `HW_Flag_NoEvexSemantics`. Other such instructions related to masking (typically marked with `HW_Flag_ReturnsPerElementMask`) have similar issues (though they can be replaced with the EVEX k registers and associated masking when implemented). In addition, the caller/callee save registers have also been adjusted to properly handle the presence and absence of AVX512 upper SIMD registers at runtime.
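In outline, the constraint flows through LSRA in two steps, as sketched below. This is a condensed, standalone illustration distilled from the diff that follows, not code from the patch itself; in particular, the RBM_LOWFLOAT bit positions here are simplified stand-ins for the real xmm register masks:

    // Sketch: resolve a build-time candidate mask the way newRefPosition does.
    // RBM_LOWSIMD is a sentinel bit requested by BuildDef/BuildUse for nodes
    // where isEvexCompatibleHWIntrinsic() is false; it collapses to the low
    // SIMD bank (xmm0-xmm15) only for float/SIMD intervals.
    typedef unsigned long long regMaskTP;                 // 64-bit mask, per target.h
    static const regMaskTP RBM_NONE     = 0;
    static const regMaskTP RBM_LOWSIMD  = 1ULL << 63;     // sentinel bit from target.h
    static const regMaskTP RBM_LOWFLOAT = 0x0000FFFFULL;  // stand-in for the xmm0-xmm15 bits

    static regMaskTP resolveCandidates(regMaskTP mask, bool intervalIsFloatOrSimd,
                                       regMaskTP availableFloatRegs, regMaskTP allRegsForType)
    {
        if (mask == RBM_LOWSIMD)
        {
            // lowSIMDRegs() == (availableFloatRegs & RBM_LOWFLOAT) on AMD64
            mask = intervalIsFloatOrSimd ? (availableFloatRegs & RBM_LOWFLOAT) : RBM_NONE;
        }
        if (mask == RBM_NONE)
        {
            mask = allRegsForType; // unconstrained, exactly as before this change
        }
        return mask;
    }

Because the sentinel occupies bit 63 of the mask rather than a real register bit, non-SIMD intervals that receive it simply fall through to the unconstrained allRegs path.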
--- src/coreclr/jit/compiler.cpp | 46 ++++++++++++ src/coreclr/jit/compiler.h | 26 +++++++ src/coreclr/jit/emitinl.h | 3 +- src/coreclr/jit/emitxarch.cpp | 135 +++++++++++++++++++++++----------- src/coreclr/jit/emitxarch.h | 41 +++++------ src/coreclr/jit/gentree.cpp | 15 ++-- src/coreclr/jit/gentree.h | 4 +- src/coreclr/jit/hwintrinsic.h | 12 ++- src/coreclr/jit/lsra.cpp | 16 +++- src/coreclr/jit/lsra.h | 13 +++- src/coreclr/jit/lsrabuild.cpp | 6 +- src/coreclr/jit/lsraxarch.cpp | 32 ++++---- src/coreclr/jit/target.h | 7 +- src/coreclr/jit/targetamd64.h | 80 +++++++++++++++----- src/coreclr/jit/utils.h | 10 +++ 15 files changed, 319 insertions(+), 127 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 6c8445b6fee744..cab8ec8ebfa592 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3325,6 +3325,30 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitSaveFpLrWithCalleeSavedRegisters = JitConfig.JitSaveFpLrWithCalleeSavedRegisters(); } #endif // defined(DEBUG) && defined(TARGET_ARM64) + +#if defined(TARGET_AMD64) + rbmAllFloat = RBM_ALLFLOAT_INIT; + rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; + rbmCntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + + if (DoJitStressEvexEncoding()) + { + rbmAllFloat |= RBM_HIGHFLOAT; + rbmFltCalleeTrash |= RBM_HIGHFLOAT; + rbmCntCalleeTrashFloat += 16; + } + rbmCalleeTrash = RBM_CALLEE_TRASH_INIT; + rbmCalleeTrashNoGC = RBM_CALLEE_TRASH_NOGC_INIT; + rbmCalleeTrashWriteBarrier = RBM_CALLEE_TRASH_WRITEBARRIER_INIT; + rbmCalleeGCTrashWriteBarrier = RBM_CALLEE_GCTRASH_WRITEBARRIER_INIT; + rbmCalleeTrashWriteBarrierByref = RBM_CALLEE_TRASH_WRITEBARRIER_BYREF_INIT; + rbmCalleeGCTrashWriteBarrierByref = RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF_INIT; + rbmStopForGCTrash = RBM_STOP_FOR_GC_TRASH_INIT; + rbmInitPInvokeFrameTrash = RBM_INIT_PINVOKE_FRAME_TRASH_INIT; + rbmProfilerEnterTrash = RBM_PROFILER_ENTER_TRASH_INIT; + rbmProfilerLeaveTrash = RBM_PROFILER_LEAVE_TRASH_INIT; + rbmProfilerTailcallTrash = RBM_PROFILER_TAILCALL_TRASH_INIT; +#endif // TARGET_AMD64 } #ifdef DEBUG @@ -10276,3 +10300,25 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const PRINT_STATS(m_dispatchRetBuf, m_addrExposed); } #endif // TRACK_ENREG_STATS + +#if defined(TARGET_AMD64) +// The following are for initializing register allocator "constants" defined in targetamd64.h +// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases +// the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32. +// As only 64-bit xarch has the capability to have the additional registers, we limit the changes +// to TARGET_AMD64 only. 
+regMaskTP rbmAllFloat; +regMaskTP rbmFltCalleeTrash; +regMaskTP rbmCalleeTrash; +regMaskTP rbmCalleeTrashNoGC; +regMaskTP rbmCalleeTrashWriteBarrier; +regMaskTP rbmCalleeGCTrashWriteBarrier; +regMaskTP rbmCalleeTrashWriteBarrierByref; +regMaskTP rbmCalleeGCTrashWriteBarrierByref; +regMaskTP rbmStopForGCTrash; +regMaskTP rbmProfilerTailcallTrash; +regMaskTP rbmInitPInvokeFrameTrash; +regMaskTP rbmProfilerEnterTrash; +regMaskTP rbmProfilerLeaveTrash; +unsigned rbmCntCalleeTrashFloat; +#endif // TARGET_AMD64 diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 11fb3c3986d7a7..98940a29be910c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -11551,6 +11551,32 @@ extern const BYTE genActualTypes[]; /*****************************************************************************/ +/*****************************************************************************/ + +#if defined(TARGET_AMD64) +// The following are for initializing register allocator "constants" defined in targetamd64.h +// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases +// the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32. +// As only 64-bit xarch has the capability to have the additional registers, we limit the changes +// to TARGET_AMD64 only. +extern regMaskTP rbmAllFloat; +extern regMaskTP rbmFltCalleeTrash; +extern regMaskTP rbmCalleeTrash; +extern regMaskTP rbmCalleeTrashNoGC; +extern regMaskTP rbmCalleeTrashWriteBarrier; +extern regMaskTP rbmCalleeGCTrashWriteBarrier; +extern regMaskTP rbmCalleeTrashWriteBarrierByref; +extern regMaskTP rbmCalleeGCTrashWriteBarrierByref; +extern regMaskTP rbmStopForGCTrash; +extern regMaskTP rbmProfilerTailcallTrash; +extern regMaskTP rbmInitPInvokeFrameTrash; +extern regMaskTP rbmProfilerEnterTrash; +extern regMaskTP rbmProfilerLeaveTrash; +extern unsigned rbmCntCalleeTrashFloat; +#endif // TARGET_AMD64 + +/*****************************************************************************/ + #ifdef DEBUG void dumpConvertedVarSet(Compiler* comp, VARSET_VALARG_TP vars); #endif // DEBUG diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 82c78299efebd3..354fc23363a2f7 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -214,7 +214,8 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - assert((regmask & RBM_CALLEE_TRASH) == 0); + // TODO-XARCH-AVX512 global defined in compiler.h, not in scope here + // assert((regmask & RBM_CALLEE_TRASH) == 0); unsigned encodeMask; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 1966848a9e5d0d..436f1e46619ded 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -263,24 +263,27 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. 
- // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, movdqu16 etc) - // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand type - // i.e., movdqu => movdqu32 etc - // Since we are not using k registers yet, this will have no impact on correctness but will affect things once - // k registers are used (as that is the point of the "break out operand type" of these instructions) - //case INS_movdqa: // INS_movdqa32, INS_movdqa64. - //case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. - //case INS_pand: // INS_pandd, INS_pandq. - //case INS_pandn: // INS_pandnd, INS_pandnq. - //case INS_por: // INS_pord, INS_porq. - //case INS_pxor: // INS_pxord, INS_pxorq - //case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. - //case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. - //case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. - //case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. - { - return false; - } + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, + // movdqu16 etc) + // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand + // type + // i.e., movdqu => movdqu32 etc + // Since we are not using k registers yet, this will have no impact on correctness but will affect things + // once + // k registers are used (as that is the point of the "break out operand type" of these instructions) + // case INS_movdqa: // INS_movdqa32, INS_movdqa64. + // case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. + // case INS_pand: // INS_pandd, INS_pandq. + // case INS_pandn: // INS_pandnd, INS_pandnq. + // case INS_por: // INS_pord, INS_porq. + // case INS_pxor: // INS_pxord, INS_pxorq + // case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. + // case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. + // case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. + // case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. + { + return false; + } default: { break; @@ -767,7 +770,7 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) const // Return Value: // true if this instruction requires a VEX or EVEX prefix. // -bool emitter::TakesSimdPrefix(const instrDesc *id) const +bool emitter::TakesSimdPrefix(const instrDesc* id) const { instruction ins = id->idIns(); @@ -793,7 +796,7 @@ bool emitter::TakesSimdPrefix(const instrDesc *id) const // Return Value: // true if this instruction requires a EVEX prefix. // -bool emitter::TakesEvexPrefix(const instrDesc *id) const +bool emitter::TakesEvexPrefix(const instrDesc* id) const { if (!emitComp->DoJitStressEvexEncoding()) { @@ -1077,8 +1080,16 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) #endif //! TARGET_AMD64 } -// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit. -bool emitter::HasHighSIMDReg(const instrDesc *id) const +//------------------------------------------------------------------------ +// HasHighSIMDReg: Checks if an instruction uses a high SIMD register (mm16-mm31) +// and will require one of the EVEX high SIMD bits (EVEX.R', EVEX.V', EVEX.X) +// +// Arguments: +// id -- instruction descriptor for encoding +// +// Return Value: +// true if the instruction will require EVEX encoding for its register operands. 
+bool emitter::HasHighSIMDReg(const instrDesc* id) const
 {
 #if defined(TARGET_AMD64)
 if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2()))
 return true;
@@ -1087,15 +1098,22 @@ bool emitter::HasHighSIMDReg(const instrDesc *id) const
 if (id->idIsSmallDsc())
 return false;
 
- if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) ||
- (id->idHasReg4() && IsHighSIMDReg(id->idReg4())))
+ if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || (id->idHasReg4() && IsHighSIMDReg(id->idReg4())))
 return true;
 #endif
 // X86 JIT operates in 32-bit mode and hence extended reg are not available.
 return false;
 }
 
-// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit.
+//------------------------------------------------------------------------
+// IsHighSIMDReg: Checks if a register is strictly an EVEX encoded high SIMD
+// registers (mm16-mm31).
+//
+// Arguments:
+// reg -- register to check
+//
+// Return Value:
+// true if the register is strictly an EVEX encoded high SIMD register
 bool emitter::IsHighSIMDReg(regNumber reg) const
 {
 #ifdef TARGET_AMD64
@@ -1172,7 +1190,17 @@ bool IsXMMReg(regNumber reg)
 #endif // !TARGET_AMD64
 }
 
-// Returns bits to be encoded in instruction for the given register
+//------------------------------------------------------------------------
+// HighAwareRegEncoding: For EVEX encoded high SIMD registers (mm16-mm31),
+// get a register encoding for bits 0-4, where the 5th bit is encoded via
+// EVEX.R', EVEX.R, or EVEX.X.
+//
+// Arguments:
+// reg -- register to encode
+//
+// Return Value:
+// bits 0-4 of register encoding
+//
 unsigned HighAwareRegEncoding(regNumber reg)
 {
 static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE");
@@ -1189,7 +1217,7 @@ unsigned RegEncoding(regNumber reg)
 // Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes
 // SSE2: separate 1-byte prefix gets added before opcode.
 // AVX: specific bits within VEX prefix need to be set in bit-inverted form.
-emitter::code_t emitter::AddRexWPrefix(const instrDesc *id, code_t code)
+emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code)
 {
 instruction ins = id->idIns();
 
@@ -1225,7 +1253,7 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc *id, code_t code)
 
 #ifdef TARGET_AMD64
 
-emitter::code_t emitter::AddRexRPrefix(const instrDesc *id, code_t code)
+emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code)
 {
 instruction ins = id->idIns();
 
@@ -1255,7 +1283,7 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc *id, code_t code)
 return code | 0x4400000000ULL;
 }
 
-emitter::code_t emitter::AddRexXPrefix(const instrDesc *id, code_t code)
+emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code)
 {
 instruction ins = id->idIns();
 
@@ -1284,7 +1312,7 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc *id, code_t code)
 return code | 0x4200000000ULL;
 }
 
-emitter::code_t emitter::AddRexBPrefix(const instrDesc *id, code_t code)
+emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code)
 {
 instruction ins = id->idIns();
 
@@ -1322,19 +1350,39 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
 return code | 0x4000000000ULL;
 }
 
+
+//------------------------------------------------------------------------
+// AddEvexVPrimePrefix: Add the EVEX.V' bit to the EVEX prefix. EVEX.V'
+// is encoded in inverted form.
+//
+// Arguments:
+// code -- the instruction encoding to modify
+//
+// Return Value:
+// code with EVEX.V' set in inverted form.
+//
 emitter::code_t emitter::AddEvexVPrimePrefix(code_t code)
 {
 assert(UseEvexEncoding() && hasEvexPrefix(code));
 return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL);
 }
 
+//------------------------------------------------------------------------
+// AddEvexRPrimePrefix: Add the EVEX.R' bit to the EVEX prefix. EVEX.R'
+// is encoded in inverted form.
+//
+// Arguments:
+// code -- the instruction encoding to modify
+//
+// Return Value:
+// code with EVEX.R' set in inverted form.
+//
 emitter::code_t emitter::AddEvexRPrimePrefix(code_t code)
 {
 assert(UseEvexEncoding() && hasEvexPrefix(code));
 return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL);
 }
-
 #endif // TARGET_AMD64
 
 bool isPrefix(BYTE b)
@@ -2649,7 +2697,7 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
 * part of an opcode.
 */
 
-inline unsigned emitter::insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code)
+inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code)
 {
 assert(reg < REG_STK);
 
@@ -2665,7 +2713,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc *id, regNumber reg, emi
 {
 if (IsHighSIMDReg(reg))
 {
- *code = AddRexXPrefix(id, *code); // EVEX.X
+ *code = AddRexXPrefix(id, *code);    // EVEX.X
 }
 if (reg & 0x8)
 {
@@ -2692,7 +2740,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc *id, regNumber reg, emi
 * part of an opcode.
 */
 
-inline unsigned emitter::insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code)
+inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code)
 {
 assert(reg < REG_STK);
 
@@ -2734,7 +2782,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc *id, regNumber reg, emi
 * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of
 * byte 2 of VEX and EVEX prefix.
 */
-inline emitter::code_t emitter::insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code)
+inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code)
 {
 instruction ins = id->idIns();
 
@@ -2776,7 +2824,6 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc *id, regNumber
 }
 if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
 {
-
 // Both prefix encodes register operand in 1's complement form
 assert(regBits <= 0xF);
 
@@ -2800,7 +2847,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc *id, regNumber
 * Used exclusively to generate the REX.X bit and truncate the register.
 */
 
-inline unsigned emitter::insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code)
+inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code)
 {
 instruction ins = id->idIns();
 
@@ -2816,7 +2863,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc *id, regNumber reg, cod
 {
 if (IsHighSIMDReg(reg))
 {
- *code = AddEvexVPrimePrefix(*code); // EVEX.V'
+ *code = AddEvexVPrimePrefix(*code);    // EVEX.V'
 }
 if (reg & 0x8)
 {
@@ -2837,7 +2884,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc *id, regNumber reg, cod
 * Returns the "[r/m]" opcode with the mod/RM field set to register.
 */
 
-inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, code_t code)
+inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, code_t code)
 {
 // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes.
 // Otherwise, it will be placed after the 4 byte encoding.
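
// Illustrative aside -- not part of the patch: the prime masks used by
// AddEvexVPrimePrefix and AddEvexRPrimePrefix above each clear a single bit of
// the emitter's 64-bit code value. Because EVEX stores V' and R' in inverted
// form, a cleared bit selects the upper half of the register file
// (xmm16-xmm31). A minimal sketch, assuming the bit positions implied by the
// two masks:
//
//     code_t code = ...;             // EVEX-prefixed encoding; prime bits start out set (low registers)
//     code &= 0xFFFFFFF7FFFFFFFFULL; // clear bit 35: EVEX.V' = 0 -> the vvvv operand names xmm16..xmm31
//     code &= 0xFFEFFFFFFFFFFFFFULL; // clear bit 52: EVEX.R' = 0 -> the modrm.reg operand names xmm16..xmm31
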
@@ -2855,7 +2902,7 @@ inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, code_t code)
 * Returns the given "[r/m]" opcode with the mod/RM field set to register.
 */
 
-inline emitter::code_t emitter::insEncodeRMreg(const instrDesc *id, code_t code)
+inline emitter::code_t emitter::insEncodeRMreg(const instrDesc* id, code_t code)
 {
 // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes.
 // Otherwise, it will be placed after the 4 byte encoding.
@@ -2873,7 +2920,7 @@ inline emitter::code_t emitter::insEncodeRMreg(const instrDesc *id, code_t code)
 * the given register.
 */
 
-inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code)
+inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code)
 {
 assert((code & 0xC000) == 0);
 code |= 0xC000;
@@ -2888,7 +2935,7 @@ inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, regNumber re
 * the given register.
 */
 
-inline emitter::code_t emitter::insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code)
+inline emitter::code_t emitter::insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code)
 {
 assert((code & 0xC000) == 0);
 code |= 0xC000;
@@ -2913,7 +2960,7 @@ inline bool insNeedsRRIb(instruction ins)
 * Returns the "reg,reg,imm8" opcode with both regs set to
 * the given register.
 */
-inline emitter::code_t emitter::insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size)
+inline emitter::code_t emitter::insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size)
 {
 assert(size == EA_4BYTE); // All we handle for now.
 assert(insNeedsRRIb(id->idIns()));
@@ -2932,7 +2979,7 @@ inline emitter::code_t emitter::insEncodeRRIb(const instrDesc *id, regNumber reg
 * nibble of the opcode
 */
 
-inline emitter::code_t emitter::insEncodeOpreg(const instrDesc *id, regNumber reg, emitAttr size)
+inline emitter::code_t emitter::insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size)
 {
 code_t code = insCodeRR(id->idIns());
 unsigned regcode = insEncodeReg012(id, reg, size, &code);
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 476902a6858cf1..6741676dfce43f 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -75,16 +75,16 @@ unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const;
 code_t emitExtractVexPrefix(instruction ins, code_t& code) const;
 code_t emitExtractEvexPrefix(instruction ins, code_t& code) const;
 
-unsigned insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code);
-unsigned insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code);
-code_t insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code);
-unsigned insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code);
+unsigned insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code);
+unsigned insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code);
+code_t insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code);
+unsigned insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code);
 
-code_t insEncodeMRreg(const instrDesc *id, code_t code);
-code_t insEncodeRMreg(const instrDesc *id, code_t code);
-code_t insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code);
-code_t insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size);
-code_t insEncodeOpreg(const 
instrDesc *id, regNumber reg, emitAttr size);
+code_t insEncodeMRreg(const instrDesc* id, code_t code);
+code_t insEncodeRMreg(const instrDesc* id, code_t code);
+code_t insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code);
+code_t insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size);
+code_t insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size);
 
 unsigned insSSval(unsigned scale);
 
@@ -103,12 +103,12 @@ bool IsVexEncodedInstruction(instruction ins) const;
 bool IsEvexEncodedInstruction(instruction ins) const;
 bool IsVexOrEvexEncodedInstruction(instruction ins) const;
 
-code_t insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code);
+code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code);
 
-code_t AddRexWPrefix(const instrDesc *id, code_t code);
-code_t AddRexRPrefix(const instrDesc *id, code_t code);
-code_t AddRexXPrefix(const instrDesc *id, code_t code);
-code_t AddRexBPrefix(const instrDesc *id, code_t code);
+code_t AddRexWPrefix(const instrDesc* id, code_t code);
+code_t AddRexRPrefix(const instrDesc* id, code_t code);
+code_t AddRexXPrefix(const instrDesc* id, code_t code);
+code_t AddRexBPrefix(const instrDesc* id, code_t code);
 code_t AddRexPrefix(instruction ins, code_t code);
 
 bool EncodedBySSE38orSSE3A(instruction ins) const;
@@ -184,7 +184,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
 // Returns:
 // `true` if W bit needs to be set to 1.
 //
-bool IsWEvexOpcodeExtension(const instrDesc *id)
+bool IsWEvexOpcodeExtension(const instrDesc* id)
 {
 if (!TakesEvexPrefix(id))
 {
@@ -491,7 +491,7 @@ bool UseSimdEncoding() const
 #define EVEX_PREFIX_MASK 0xFF00000000000000ULL
 #define EVEX_PREFIX_CODE 0x6200000000000000ULL
 
-bool TakesEvexPrefix(const instrDesc *id) const;
+bool TakesEvexPrefix(const instrDesc* id) const;
 
 //------------------------------------------------------------------------
 // hasEvexPrefix: Returns true if the instruction encoding already
@@ -521,7 +521,7 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr);
 // code with prefix added.
 // TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need
 // to pass emitAttr size)
-code_t AddSimdPrefixIfNeeded(const instrDesc *id, code_t code, emitAttr size)
+code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size)
 {
 instruction ins = id->idIns();
 
@@ -549,7 +549,7 @@ code_t AddSimdPrefixIfNeeded(const instrDesc *id, code_t code, emitAttr size)
 // TRUE if code has an Evex prefix.
// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need
 // to pass emitAttr size)
-code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc *id, code_t code, emitAttr size)
+code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitAttr size)
 {
 instruction ins = id->idIns();
 
@@ -564,7 +564,7 @@ code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc *id, code_t code, emit
 return code;
 }
 
-bool TakesSimdPrefix(const instrDesc *id) const;
+bool TakesSimdPrefix(const instrDesc* id) const;
 
 //------------------------------------------------------------------------
 // hasVexOrEvexPrefix: Returns true if the instruction encoding already
@@ -1036,8 +1036,7 @@ inline bool HasEmbeddedBroadcast(instrDesc* id)
 return false;
 }
 
-
-inline bool HasHighSIMDReg(const instrDesc *id) const;
+inline bool HasHighSIMDReg(const instrDesc* id) const;
 inline bool IsHighSIMDReg(regNumber) const;
 
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 764e1c5e23e9a0..2e5e4e0befd867 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19034,17 +19034,22 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
 #endif
 }
 
-bool GenTree::isEvexCompatibleHWIntrinsic(Compiler* comp)
+//------------------------------------------------------------------------
+// isEvexCompatibleHWIntrinsics: Checks if the intrinsic has a compatible
+// EVEX form for its intended lowering instruction. 
+//
+// Return Value:
+// true if the intrinsic node lowering instruction has an EVEX form
+//
+bool GenTree::isEvexCompatibleHWIntrinsic()
 {
 assert(gtOper == GT_HWINTRINSIC);
- assert(comp != nullptr);
 
 // TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly
 // implemented in the register allocator
 #if defined(TARGET_XARCH)
- return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId());
-#elif defined(TARGET_ARM64)
- return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId());
+ return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) &&
+ !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId());
 #else
 return false;
 #endif
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index f6c576bf84591a..3dea877572f76d 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1508,7 +1508,7 @@ struct GenTree
 bool isCommutativeHWIntrinsic() const;
 bool isContainableHWIntrinsic() const;
 bool isRMWHWIntrinsic(Compiler* comp);
- bool isEvexCompatibleHWIntrinsic(Compiler* comp);
+ bool isEvexCompatibleHWIntrinsic();
 #else
 bool isCommutativeHWIntrinsic() const
 {
@@ -1525,7 +1525,7 @@ struct GenTree
 return false;
 }
 
- bool isEvexCompatibleHWIntrinsic(Compiler* comp)
+ bool isEvexCompatibleHWIntrinsic()
 {
 return false;
 }
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index fc3f6e9425e667..2f867721e551d6 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -766,16 +766,20 @@ struct HWIntrinsicInfo
 #error Unsupported platform
 #endif
 }
-
+ //------------------------------------------------------------------------
+ // HasEvexSemantics: Checks if the NamedIntrinsic has a lowering to
+ // an instruction with an EVEX form. 
+ //
+ // Return Value:
+ // true if the NamedIntrinsic lowering has an EVEX form.
+ //
 static bool HasEvexSemantics(NamedIntrinsic id)
 {
 HWIntrinsicFlag flags = lookupFlags(id);
 #if defined(TARGET_XARCH)
 return (flags & HW_Flag_NoEvexSemantics) == 0;
-#elif defined(TARGET_ARM64)
- return (flags & HW_Flag_NoEvexSemantics) == 0;
 #else
-#error Unsupported platform
+ return false;
 #endif
 }
 
diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp
index 9ab2405c3815d2..644c7ebea1beb6 100644
--- a/src/coreclr/jit/lsra.cpp
+++ b/src/coreclr/jit/lsra.cpp
@@ -256,6 +256,14 @@ regMaskTP LinearScan::allSIMDRegs()
 return availableFloatRegs;
 }
 
+//------------------------------------------------------------------------
+// lowSIMDRegs(): Return the set of SIMD registers associated with VEX
+// encoding only, i.e., remove the high EVEX SIMD registers from the available
+// set.
+//
+// Return Value:
+// Register mask of the SSE/VEX-only SIMD registers
+//
 regMaskTP LinearScan::lowSIMDRegs()
 {
 #if defined(TARGET_AMD64)
@@ -479,7 +487,9 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask)
 #endif
 
 default:
+ {
 unreached();
+ }
 }
 
 if (refPosition != nullptr && refPosition->isFixedRegRef)
@@ -693,10 +703,10 @@ LinearScan::LinearScan(Compiler* theCompiler)
 // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we independently
 // allow EVEX use from the stress flag (currently, if EVEX stress is turned off,
 // we cannot use EVEX at all)
- if (!compiler->DoJitStressEvexEncoding())
+ if (compiler->DoJitStressEvexEncoding())
 {
- availableFloatRegs &= ~RBM_HIGHFLOAT;
- availableDoubleRegs &= ~RBM_HIGHFLOAT;
+ availableFloatRegs |= RBM_HIGHFLOAT;
+ availableDoubleRegs |= RBM_HIGHFLOAT;
 }
 #endif
 
diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 90a5bb900d2ce9..18d5815cb234c5 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -736,8 +736,12 @@ class LinearScan : public LinearScanInterface
 unsigned lsraStressMask;
 
 // This controls the registers available for allocation
- enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2,
- LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_UPPER_SIMD_SET = 0x4, LSRA_LIMIT_MASK = 0x5};
+ enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0,
+ LSRA_LIMIT_CALLEE = 0x1,
+ LSRA_LIMIT_CALLER = 0x2,
+ LSRA_LIMIT_SMALL_SET = 0x3,
+ LSRA_LIMIT_UPPER_SIMD_SET = 0x2000,
+ LSRA_LIMIT_MASK = 0x2003};
 
 // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save
 // registers, so as to get different coverage than limiting to callee or caller.
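
// Illustrative aside -- not part of the patch: LSRA_LIMIT_UPPER_SIMD_SET moves
// from 0x4 to a dedicated high bit (0x2000) so it cannot collide with the
// existing 0x1-0x3 limit values, which is why LSRA_LIMIT_MASK widens to 0x2003.
// A sketch of how a stress mask decomposes under the new definitions:
//
//     unsigned limit     = lsraStressMask & LSRA_LIMIT_MASK;         // keep only the 0x2003 bits
//     bool     upperSimd = (limit & LSRA_LIMIT_UPPER_SIMD_SET) != 0; // 0x2000: stress the xmm16-xmm31 set
//     unsigned lowLimit  = (limit & 0x3);                            // 0x1/0x2/0x3: callee/caller/small-set limits
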
@@ -757,8 +761,9 @@ class LinearScan : public LinearScanInterface
 (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI);
 #endif // !UNIX_AMD64_ABI
 static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7);
- static const regMaskTP LsraLimitUpperSimdSet = (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24
- | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31);
+ static const regMaskTP LsraLimitUpperSimdSet =
+ (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 |
+ RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31);
 #elif defined(TARGET_ARM)
 // On ARM, we may need two registers to set up the target register for a virtual call, so we need
 // to have at least the maximum number of arg registers, plus 2.
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index b170e9baf9e8ad..f56960d492f8e9 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -569,7 +569,6 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval,
 {
 mask = RBM_NONE;
 }
-
 }
 #endif
 if (mask == RBM_NONE)
@@ -1875,8 +1874,9 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
 JITDUMP("\n");
 }
 
-static const regNumber lsraRegOrder[] = {REG_VAR_ORDER};
-const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder);
+static const regNumber lsraRegOrder[]   = {REG_VAR_ORDER};
+const unsigned lsraRegOrderSize         = ArrLen(lsraRegOrder);
+// TODO-XARCH-AVX512 we might want to move this to be configured with the rbm variables too
 static const regNumber lsraRegOrderFlt[] = {REG_VAR_ORDER_FLT};
 const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt);
 
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 2943452e169b14..589b3c21a4f51a 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -156,7 +156,7 @@ int LinearScan::BuildNode(GenTree* tree)
 srcCount = 0;
 assert(dstCount == 1);
 assert(!tree->IsReuseRegVal());
-#if defined(TARGET_AMD64) 
+#if defined(TARGET_AMD64)
 regMaskTP opRegMask = RBM_LOWSIMD;
 #else
 regMaskTP opRegMask = RBM_NONE;
@@ -1891,8 +1891,8 @@ int LinearScan::BuildIntrinsic(GenTree* tree)
 }
 assert(tree->gtGetOp2IfPresent() == nullptr);
 
- // TODO-XARCH-AVX512 this is overly constraining the registers available, as NI_System_Math_Abs
- // can be lowered to an EVEX compatible instruction (the rest cannot)
+// TODO-XARCH-AVX512 this is overly constraining the registers available, as NI_System_Math_Abs
+// can be lowered to an EVEX compatible instruction (the rest cannot)
 #if defined(TARGET_AMD64)
 regMaskTP opRegMask = RBM_LOWSIMD;
 #else
@@ -2019,8 +2019,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
 
 // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
 // is not allocated the same register as the target.
- bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
- bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(compiler);
+ bool isRMW             = intrinsicTree->isRMWHWIntrinsic(compiler);
+ bool isEvexCompatible  = intrinsicTree->isEvexCompatibleHWIntrinsic();
 
 // Create internal temps, and handle any other special requirements.
// Note that the default case for building uses will handle the RMW flag, but if the uses @@ -2107,7 +2107,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) regMaskTP opRegMask = RBM_LOWSIMD; #else - regMaskTP opRegMask = RBM_NONE; + regMaskTP opRegMask = RBM_NONE; #endif // MaskMove hardcodes the destination (op3) in DI/EDI/RDI srcCount += BuildOperandUses(op1, opRegMask); @@ -2136,7 +2136,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou tgtPrefUse = BuildUse(op1, opRegMask); srcCount += 1; - srcCount += op2->isContained() ? BuildOperandUses(op2, opRegMask) : BuildDelayFreeUses(op2, op1, opRegMask); + srcCount += + op2->isContained() ? BuildOperandUses(op2, opRegMask) : BuildDelayFreeUses(op2, op1, opRegMask); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2333,10 +2334,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) regMaskTP opRegMask = RBM_LOWSIMD; #else - regMaskTP opRegMask = RBM_NONE; + regMaskTP opRegMask = RBM_NONE; #endif - // Any pair of the index, mask, or destination registers should be different srcCount += BuildOperandUses(op1, opRegMask); srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); @@ -2363,10 +2363,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) regMaskTP opRegMask = RBM_LOWSIMD; #else - regMaskTP opRegMask = RBM_NONE; + regMaskTP opRegMask = RBM_NONE; #endif - // Any pair of the index, mask, or destination registers should be different srcCount += BuildOperandUses(op1, opRegMask); srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); @@ -2395,7 +2394,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert((numArgs > 0) && (numArgs < 4)); - regMaskTP op1RegCandidates = RBM_NONE; + regMaskTP op1RegCandidates = RBM_NONE; #if defined(TARGET_AMD64) if (!isEvexCompatible) { @@ -2419,7 +2418,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (op2 != nullptr) { - regMaskTP op2RegCandidates = RBM_NONE; + regMaskTP op2RegCandidates = RBM_NONE; #if defined(TARGET_AMD64) if (!isEvexCompatible) { @@ -2463,14 +2462,15 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (op3 != nullptr) { - regMaskTP op3RegCandidates = RBM_NONE; + regMaskTP op3RegCandidates = RBM_NONE; #if defined(TARGET_AMD64) if (!isEvexCompatible) { op3RegCandidates = RBM_LOWSIMD; } #endif - srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates) : BuildOperandUses(op3, op3RegCandidates); + srcCount += isRMW ? 
BuildDelayFreeUses(op3, op1, op3RegCandidates) + : BuildOperandUses(op3, op3RegCandidates); } } } @@ -2481,7 +2481,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { #if defined(TARGET_AMD64) - if (!intrinsicTree->isEvexCompatibleHWIntrinsic(compiler)) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic()) { dstCandidates = RBM_LOWSIMD; } diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index b9b2f82a85a781..954371d68012e7 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -139,7 +139,7 @@ enum _regMask_enum : unsigned __int64 #elif defined(TARGET_AMD64) -enum _regNumber_enum : unsigned +enum _regNumber_enum : unsigned { #define REGDEF(name, rnum, mask, sname) REG_##name = rnum, #define REGALIAS(alias, realname) REG_##alias = REG_##realname, @@ -152,14 +152,13 @@ enum _regNumber_enum : unsigned enum _regMask_enum : unsigned __int64 { - RBM_NONE = 0, - RBM_LOWSIMD = 1LL << 63, + RBM_NONE = 0, + RBM_LOWSIMD = 1ULL << 63, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, #include "register.h" - }; #elif defined(TARGET_X86) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index a17432f9d8e3f5..5b46515340ea81 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -78,9 +78,12 @@ #endif // !UNIX_AMD64_ABI #define CSE_CONSTS 1 // Enable if we want to CSE constants - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 | RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + + #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_ALLFLOAT rbmAllFloat #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 @@ -120,8 +123,12 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_RDI|RBM_RSI|RBM_EDX|RBM_ECX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (0) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ + + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) + #define RBM_FLT_CALLEE_TRASH rbmFltCalleeTrash + #define REG_PROFILER_ENTER_ARG_0 REG_R14 #define RBM_PROFILER_ENTER_ARG_0 RBM_R14 #define REG_PROFILER_ENTER_ARG_1 REG_R15 @@ -135,7 +142,9 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED 
(RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) + #define RBM_FLT_CALLEE_TRASH rbmFltCalleeTrash #endif // !UNIX_AMD64_ABI #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) @@ -143,7 +152,9 @@ #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH_INIT (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH rbmCalleeTrash + #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -172,20 +183,29 @@ #define REG_WRITE_BARRIER_SRC REG_ARG_1 #define RBM_WRITE_BARRIER_SRC RBM_ARG_1 - #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_CALLEE_TRASH_NOGC_INIT RBM_CALLEE_TRASH + #define RBM_CALLEE_TRASH_NOGC rbmCalleeTrashNoGC // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_CALLEE_TRASH_WRITEBARRIER_INIT RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_TRASH_WRITEBARRIER rbmCalleeTrashWriteBarrier // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_INIT RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_GCTRASH_WRITEBARRIER rbmCalleeGCTrashWriteBarrier // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF_INIT (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF rbmCalleeTrashWriteBarrierByref // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. 
- #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI))
+ #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF_INIT (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI))
+ #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF rbmCalleeGCTrashWriteBarrierByref
 
 #if 0
 #define REG_VAR_ORDER REG_EAX,REG_EDX,REG_ECX,REG_ESI,REG_EDI,REG_EBX,REG_ETW_FRAMED_EBP_LIST \
@@ -206,7 +226,8 @@
 #endif // !UNIX_AMD64_ABI
 #endif
 
- #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15
+ //#define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15
+ #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15,REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM21,REG_XMM22,REG_XMM23,REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31
 
 #ifdef UNIX_AMD64_ABI
 #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT)
@@ -214,7 +235,9 @@
 #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED)
 
 #define CNT_CALLEE_SAVED_FLOAT (0)
- #define CNT_CALLEE_TRASH_FLOAT (16)
+ #define CNT_CALLEE_TRASH_FLOAT_INIT (16)
+ /* NOTE: Sync with variable name defined in compiler.h */
+ #define CNT_CALLEE_TRASH_FLOAT rbmCntCalleeTrashFloat
 
 #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15
 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15
@@ -223,8 +246,10 @@
 #define CNT_CALLEE_TRASH (7)
 #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED)
 
- #define CNT_CALLEE_SAVED_FLOAT (10)
- #define CNT_CALLEE_TRASH_FLOAT (6)
+ #define CNT_CALLEE_SAVED_FLOAT (10)
+ #define CNT_CALLEE_TRASH_FLOAT_INIT (6)
+ /* NOTE: Sync with variable name defined in compiler.h */
+ #define CNT_CALLEE_TRASH_FLOAT rbmCntCalleeTrashFloat
 
 #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15
 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15
@@ -416,8 +441,13 @@
 
 // The registers trashed by profiler enter/leave/tailcall hook
 // See vm\amd64\asmhelpers.asm for more details.
- #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH
- #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH
+ /* NOTE: Sync with variable name defined in compiler.h */
+ #define RBM_PROFILER_ENTER_TRASH_INIT RBM_CALLEE_TRASH
+ #define RBM_PROFILER_ENTER_TRASH rbmProfilerEnterTrash
+
+ /* NOTE: Sync with variable name defined in compiler.h */
+ #define RBM_PROFILER_TAILCALL_TRASH_INIT RBM_PROFILER_LEAVE_TRASH
+ #define RBM_PROFILER_TAILCALL_TRASH rbmProfilerTailcallTrash
 
 // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper.
 #ifdef UNIX_AMD64_ABI
@@ -426,16 +456,26 @@
 // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers.
 // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }.
 // STOP_FOR_GC helper preserves all the 4 possible return registers.
- #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) - #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_STOP_FOR_GC_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_STOP_FOR_GC_TRASH rbmStopForGCTrash + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_PROFILER_LEAVE_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_PROFILER_LEAVE_TRASH rbmProfilerLeaveTrash #else // See vm\amd64\asmhelpers.asm for more details. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) - #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_STOP_FOR_GC_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_STOP_FOR_GC_TRASH rbmStopForGCTrash + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_PROFILER_LEAVE_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_PROFILER_LEAVE_TRASH rbmProfilerLeaveTrash #endif // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. - #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_INIT_PINVOKE_FRAME_TRASH_INIT RBM_CALLEE_TRASH + #define RBM_INIT_PINVOKE_FRAME_TRASH rbmInitPInvokeFrameTrash #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_RCX diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h index 0e129f1ed0340c..47558b161e3334 100644 --- a/src/coreclr/jit/utils.h +++ b/src/coreclr/jit/utils.h @@ -419,6 +419,16 @@ class PhasedVar return *this; } + PhasedVar& operator|=(const T& value) + { +#ifdef DEBUG + assert(m_writePhase); + m_initialized = true; +#endif // DEBUG + m_value |= value; + return *this; + } + // Note: if you need more = functions, you can define them here, like operator&= // Assign a value, but don't assert if we're not in the write phase, and From 9f711bab27f30c7e60f1667fee2a98d0cbfad983 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 14 Dec 2022 07:43:22 -0800 Subject: [PATCH 06/34] Fix for X86 throughput. 
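[Illustrative aside -- not part of this commit] The PhasedVar::operator|= added
to utils.h just above exists so register-availability masks wrapped in
PhasedVar can be widened in place once the ISA is known; the lsra.cpp hunk
earlier relies on it via `availableFloatRegs |= RBM_HIGHFLOAT;`. A minimal
usage sketch (assuming a PhasedVar<regMaskTP> still in its write phase):

    PhasedVar<regMaskTP> availableFloatRegs;
    availableFloatRegs = RBM_LOWFLOAT;    // ordinary write-phase assignment
    availableFloatRegs |= RBM_HIGHFLOAT;  // new operator: OR in the EVEX-only registers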
--- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/lsraxarch.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2e5e4e0befd867..e721d813859dff 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19047,7 +19047,7 @@ bool GenTree::isEvexCompatibleHWIntrinsic() // TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly // implemented in the register allocator -#if defined(TARGET_XARCH) +#if defined(TARGET_AMD64) return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId()); #else diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 589b3c21a4f51a..f7437e2b29e133 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2020,7 +2020,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); +#if defined(TARGET_AMD64) bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(); +#endif // Create internal temps, and handle any other special requirements. // Note that the default case for building uses will handle the RMW flag, but if the uses From f29c146911f5747075a38d27a520afd217d1abf0 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 14 Dec 2022 11:44:48 -0800 Subject: [PATCH 07/34] Add upper simd stress test to the AVX512 testing pipeline. --- eng/pipelines/common/templates/runtimes/run-test-job.yml | 1 + src/tests/Common/testenvironment.proj | 1 + 2 files changed, 2 insertions(+) diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index 6b238012679736..a2ca44678e6b1e 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -536,6 +536,7 @@ jobs: ${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}: scenarios: - jitstress_isas_avx512_forceevex + - jitstress_isas_avx512_forceevex_stresshighregs ${{ if in(parameters.testGroup, 'jitstressregs-x86') }}: scenarios: - jitstressregs1_x86_noavx diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index 11b358a7f39f2e..b3de6ea96268d6 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -153,6 +153,7 @@ + From b95d2963c0adcd7881dcde8b7111efe068175535 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 14 Dec 2022 13:39:27 -0800 Subject: [PATCH 08/34] Formatting. --- src/coreclr/jit/emitxarch.cpp | 3 +-- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/lsraxarch.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 436f1e46619ded..cd412bb1fa4b35 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1106,7 +1106,7 @@ bool emitter::HasHighSIMDReg(const instrDesc* id) const } //------------------------------------------------------------------------ -// IsHighSIMDReg: Checks if a register is strictly an EVEX encoded high SIMD +// IsHighSIMDReg: Checks if a register is strictly an EVEX encoded high SIMD // registers (mm16-mm31). 
//
 // Arguments:
 // reg -- register to check
 //
 // Return Value:
 // true if the register is strictly an EVEX encoded high SIMD register
 bool emitter::IsHighSIMDReg(regNumber reg) const
 {
 #ifdef TARGET_AMD64
@@ -1350,7 +1350,6 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
 return code | 0x4000000000ULL;
 }
 
-
 //------------------------------------------------------------------------
 // AddEvexVPrimePrefix: Add the EVEX.V' bit to the EVEX prefix. EVEX.V'
 // is encoded in inverted form.
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index e721d813859dff..928896b6807f82 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19036,7 +19036,7 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
 
 //------------------------------------------------------------------------
 // isEvexCompatibleHWIntrinsics: Checks if the intrinsic has a compatible
-// EVEX form for its intended lowering instruction. 
+// EVEX form for its intended lowering instruction.
 //
 // Return Value:
 // true if the intrinsic node lowering instruction has an EVEX form
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index f7437e2b29e133..0b5c58c4c942a4 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2019,7 +2019,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
 
 // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
 // is not allocated the same register as the target.
- bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
+ bool isRMW             = intrinsicTree->isRMWHWIntrinsic(compiler);
 #if defined(TARGET_AMD64)
 bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic();
 #endif
 
From 6a4dcd1b044c67e215be86ea3533a9690a5c8b1d Mon Sep 17 00:00:00 2001
From: "Canino, Anthony" 
Date: Fri, 16 Dec 2022 08:11:50 -0800
Subject: [PATCH 09/34] Fix wrong-sized attr for simd mov instruction.

---
 src/coreclr/jit/codegenxarch.cpp |  4 ++--
 src/coreclr/jit/emitxarch.cpp    | 11 ++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 5c1c26150eae19..f933f685ad6488 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3524,7 +3524,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 // this probably needs to be changed.
 
 // Load
- genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src, offset);
+ genCodeForLoadOffset(INS_movdqu, EA_16BYTE, xmmTmpReg, src, offset);
 // Store
 genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
 
@@ -8347,7 +8347,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset
 {
 ins = INS_movdqu; // This should be changed!
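 // Illustrative commentary -- not part of the patch: the attr fix below pairs
 // with `size = 16;` a few lines later. The emitAttr handed to the emitter
 // must describe the full 16-byte movdqu copy rather than the 8-byte value it
 // previously claimed, so the store is sized correctly.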
- attr = EA_8BYTE; + attr = EA_16BYTE; size = 16; } else diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index cd412bb1fa4b35..9b0be54c925c33 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1362,8 +1362,12 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) // emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) { +#if defined(TARGET_AMD64) assert(UseEvexEncoding() && hasEvexPrefix(code)); return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL); +#else + unreached(); +#endif } //------------------------------------------------------------------------ @@ -1378,8 +1382,12 @@ emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) // emitter::code_t emitter::AddEvexRPrimePrefix(code_t code) { +#if defined(TARGET_AMD64) assert(UseEvexEncoding() && hasEvexPrefix(code)); return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL); +#else + unreached(); +#endif } #endif // TARGET_AMD64 @@ -2808,8 +2816,9 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. // Rather see these paths cleaned up. - regBits = HighAwareRegEncoding(reg); #if defined(TARGET_AMD64) + regBits = HighAwareRegEncoding(reg); + if (IsHighSIMDReg(reg)) { // Have to set the EVEX V' bit From d1f2fdbf9e996d0092b4a40894466d66c62c5282 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 16 Dec 2022 14:28:55 -0800 Subject: [PATCH 10/34] Fix non-AMD64 LSRA stress mask. --- src/coreclr/jit/lsra.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 18d5815cb234c5..0c01f924d140c5 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -740,8 +740,13 @@ class LinearScan : public LinearScanInterface LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2, LSRA_LIMIT_SMALL_SET = 0x3, +#if defined(TARGET_AMD64) LSRA_LIMIT_UPPER_SIMD_SET = 0x2000, - LSRA_LIMIT_MASK = 0x2003}; + LSRA_LIMIT_MASK = 0x2003 +#else + LSRA_LIMIT_MASK = 0x3 +#endif + }; // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save // registers, so as to get different coverage than limiting to callee or caller. From c7807c7f732f6a5807f1a67363056e0217f58c30 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:20:36 -0500 Subject: [PATCH 11/34] Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall --- src/coreclr/jit/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 98940a29be910c..535af8f538239b 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -11554,7 +11554,7 @@ extern const BYTE genActualTypes[]; /*****************************************************************************/ #if defined(TARGET_AMD64) -// The following are for initializing register allocator "constants" defined in targetamd.h +// The following are for initializing register allocator "constants" defined in targetamd64.h // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases // the number of simd (xmm,ymm, and zmm) registers from 16 to 32. 
// As only 64-bit xarch has the capability to have the additional registers, we limit the changes From 0b6cb558e2574ade15ea8a3ba92f6909ca7ed669 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:21:01 -0500 Subject: [PATCH 12/34] Update src/coreclr/jit/compiler.cpp Co-authored-by: Bruce Forstall --- src/coreclr/jit/compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index cab8ec8ebfa592..da163eac36f8b6 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -10302,7 +10302,7 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const #endif // TRACK_ENREG_STATS #if defined(TARGET_AMD64) -// The following are for initializing register allocator "constants" defined in targetamd.h +// The following are for initializing register allocator "constants" defined in targetamd64.h // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases // the number of simd (xmm,ymm, and zmm) registers from 16 to 32. // As only 64-bit xarch has the capability to have the additional registers, we limit the changes From a45163cf6746b426ee047e31cab437635ddb5c48 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:22:10 -0500 Subject: [PATCH 13/34] Update src/coreclr/jit/gentree.cpp Co-authored-by: Bruce Forstall --- src/coreclr/jit/gentree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 928896b6807f82..55e40fa6fad5a7 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19035,7 +19035,7 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) } //------------------------------------------------------------------------ -// isEvexCompatibleHWIntrinsics: Checks if the intrinsic has a compatible +// isEvexCompatibleHWIntrinsic: Checks if the intrinsic has a compatible // EVEX form for its intended lowering instruction. 
// // Return Value: From 730b4ebd39b2f302d7fafdbcbcf3661f9b6b9500 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:22:25 -0500 Subject: [PATCH 14/34] Update src/coreclr/jit/hwintrinsic.h Co-authored-by: Bruce Forstall --- src/coreclr/jit/hwintrinsic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 2f867721e551d6..a4bedfca08f2b6 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -158,7 +158,7 @@ enum HWIntrinsicFlag : unsigned int // contained HW_Flag_MaybeCommutative = 0x80000, - // The intrinsic has EVEX compatible form + // The intrinsic has no EVEX compatible form HW_Flag_NoEvexSemantics = 0x100000 #elif defined(TARGET_ARM64) From 34d61992b9018a2dfe3c250a495d1e0f60303049 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:22:48 -0500 Subject: [PATCH 15/34] Update src/coreclr/jit/target.h Co-authored-by: Bruce Forstall --- src/coreclr/jit/target.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 954371d68012e7..c57404f4cde9b1 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -65,7 +65,7 @@ inline bool compUnixX86Abi() #define REGMASK_BITS 64 #define CSE_CONST_SHARED_LOW_BITS 16 -#elif defined(TARGET_XARCH) +#elif defined(TARGET_X86) #define REGMASK_BITS 32 #define CSE_CONST_SHARED_LOW_BITS 16 From 395f5e4f37ceb1e1251a98b90f16d118a027892e Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 19 Dec 2022 10:23:07 -0500 Subject: [PATCH 16/34] Update src/coreclr/jit/emitxarch.cpp Co-authored-by: Bruce Forstall --- src/coreclr/jit/emitxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 9b0be54c925c33..4dd327988003b6 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -11573,8 +11573,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit the REX prefix if required // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doind so currently - // since we cannot differentiate EVEX vs VEX without 'code' untill all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently + // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. 
if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { code = AddRexWPrefix(id, code); From 852e0510198475e5fe622f844e5156d8de48893d Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 14 Dec 2022 18:05:47 -0800 Subject: [PATCH 17/34] Remove unneeded vars --- src/coreclr/jit/compiler.cpp | 22 --------------- src/coreclr/jit/compiler.h | 11 -------- src/coreclr/jit/targetamd64.h | 51 +++++++++-------------------------- 3 files changed, 13 insertions(+), 71 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index da163eac36f8b6..57490fa1ff7472 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3337,17 +3337,6 @@ void Compiler::compInitOptions(JitFlags* jitFlags) rbmFltCalleeTrash |= RBM_HIGHFLOAT; rbmCntCalleeTrashFloat += 16; } - rbmCalleeTrash = RBM_CALLEE_TRASH_INIT; - rbmCalleeTrashNoGC = RBM_CALLEE_TRASH_NOGC_INIT; - rbmCalleeTrashWriteBarrier = RBM_CALLEE_TRASH_WRITEBARRIER_INIT; - rbmCalleeGCTrashWriteBarrier = RBM_CALLEE_GCTRASH_WRITEBARRIER_INIT; - rbmCalleeTrashWriteBarrierByref = RBM_CALLEE_TRASH_WRITEBARRIER_BYREF_INIT; - rbmCalleeGCTrashWriteBarrierByref = RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF_INIT; - rbmStopForGCTrash = RBM_STOP_FOR_GC_TRASH_INIT; - rbmInitPInvokeFrameTrash = RBM_INIT_PINVOKE_FRAME_TRASH_INIT; - rbmProfilerEnterTrash = RBM_PROFILER_ENTER_TRASH_INIT; - rbmProfilerLeaveTrash = RBM_PROFILER_LEAVE_TRASH_INIT; - rbmProfilerTailcallTrash = RBM_PROFILER_TAILCALL_TRASH_INIT; #endif // TARGET_AMD64 } @@ -10309,16 +10298,5 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const // to TARGET_AMD64 only. regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; -regMaskTP rbmCalleeTrash; -regMaskTP rbmCalleeTrashNoGC; -regMaskTP rbmCalleeTrashWriteBarrier; -regMaskTP rbmCalleeGCTrashWriteBarrier; -regMaskTP rbmCalleeTrashWriteBarrierByref; -regMaskTP rbmCalleeGCTrashWriteBarrierByref; -regMaskTP rbmStopForGCTrash; -regMaskTP rbmProfilerTailcallTrash; -regMaskTP rbmInitPInvokeFrameTrash; -regMaskTP rbmProfilerEnterTrash; -regMaskTP rbmProfilerLeaveTrash; unsigned rbmCntCalleeTrashFloat; #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 535af8f538239b..15c19ca7a1e2ae 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -11561,17 +11561,6 @@ extern const BYTE genActualTypes[]; // to TARGET_AMD64 only. 
extern regMaskTP rbmAllFloat; extern regMaskTP rbmFltCalleeTrash; -extern regMaskTP rbmCalleeTrash; -extern regMaskTP rbmCalleeTrashNoGC; -extern regMaskTP rbmCalleeTrashWriteBarrier; -extern regMaskTP rbmCalleeGCTrashWriteBarrier; -extern regMaskTP rbmCalleeTrashWriteBarrierByref; -extern regMaskTP rbmCalleeGCTrashWriteBarrierByref; -extern regMaskTP rbmStopForGCTrash; -extern regMaskTP rbmProfilerTailcallTrash; -extern regMaskTP rbmInitPInvokeFrameTrash; -extern regMaskTP rbmProfilerEnterTrash; -extern regMaskTP rbmProfilerLeaveTrash; extern unsigned rbmCntCalleeTrashFloat; #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 5b46515340ea81..d8535dcec2d578 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -152,8 +152,7 @@ #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 - #define RBM_CALLEE_TRASH_INIT (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) - #define RBM_CALLEE_TRASH rbmCalleeTrash + #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | rbmFltCalleeTrash) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) @@ -183,29 +182,19 @@ #define REG_WRITE_BARRIER_SRC REG_ARG_1 #define RBM_WRITE_BARRIER_SRC RBM_ARG_1 - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_CALLEE_TRASH_NOGC_INIT RBM_CALLEE_TRASH - #define RBM_CALLEE_TRASH_NOGC rbmCalleeTrashNoGC + #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_CALLEE_TRASH_WRITEBARRIER_INIT RBM_CALLEE_TRASH_NOGC - #define RBM_CALLEE_TRASH_WRITEBARRIER rbmCalleeTrashWriteBarrier + #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_INIT RBM_CALLEE_TRASH_NOGC - #define RBM_CALLEE_GCTRASH_WRITEBARRIER rbmCalleeGCTrashWriteBarrier + #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF_INIT (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF rbmCalleeTrashWriteBarrierByref + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. - // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF_INIT (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI)) - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF rbmCalleeGCTrashWriteBarrierByref + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI)) #if 0 #define REG_VAR_ORDER REG_EAX,REG_EDX,REG_ECX,REG_ESI,REG_EDI,REG_EBX,REG_ETW_FRAMED_EBP_LIST \ @@ -441,13 +430,9 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\amd64\asmhelpers.asm for more details. 
- /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_PROFILER_ENTER_TRASH_INIT RBM_CALLEE_TRASH - #define RBM_PROFILER_ENTER_TRASH rbmProfilerEnterTrash + #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_PROFILER_TAILCALL_TRASH_INIT RBM_PROFILER_LEAVE_TRASH - #define RBM_PROFILER_TAILCALL_TRASH rbmProfilerTailcallTrash + #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper. #ifdef UNIX_AMD64_ABI @@ -456,26 +441,16 @@ // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers. // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }. // STOP_FOR_GC helper preserves all the 4 possible return registers. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_STOP_FOR_GC_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) - #define RBM_STOP_FOR_GC_TRASH rbmStopForGCTrash - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_PROFILER_LEAVE_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) - #define RBM_PROFILER_LEAVE_TRASH rbmProfilerLeaveTrash + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #else // See vm\amd64\asmhelpers.asm for more details. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_STOP_FOR_GC_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) - #define RBM_STOP_FOR_GC_TRASH rbmStopForGCTrash - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_PROFILER_LEAVE_TRASH_INIT (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) - #define RBM_PROFILER_LEAVE_TRASH rbmProfilerLeaveTrash + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) #endif // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_INIT_PINVOKE_FRAME_TRASH_INIT RBM_CALLEE_TRASH - #define RBM_INIT_PINVOKE_FRAME_TRASH rbmInitPInvokeFrameTrash + #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_RCX From 7238b499f399007cfed9ca3fdd6231c235bd3400 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 19 Dec 2022 08:39:53 -0800 Subject: [PATCH 18/34] Address PR comments. 
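[Illustrative aside -- not part of this commit] Among the review fixes below,
`RBM_LOWSIMD = 1ULL << 63` becomes `UI64(1) << 63` and the mask enum is retyped
as uint64_t. A standalone sketch of why the width of the shifted operand
matters when forming bit 63 (UI64 is the 64-bit literal macro already used in
the diff):

    uint64_t bad  = 1 << 63;        // '1' is a 32-bit int; shifting past its width is undefined behavior
    uint64_t good = UI64(1) << 63;  // unsigned 64-bit one: yields 0x8000000000000000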
--- src/coreclr/jit/compiler.cpp | 6 +++--- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/hwintrinsic.h | 6 +----- src/coreclr/jit/target.h | 4 ++-- src/coreclr/jit/targetamd64.h | 6 ++++-- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 57490fa1ff7472..806f1ac38f3cf6 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3329,13 +3329,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags) #if defined(TARGET_AMD64) rbmAllFloat = RBM_ALLFLOAT_INIT; rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; - rbmCntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; if (DoJitStressEvexEncoding()) { rbmAllFloat |= RBM_HIGHFLOAT; rbmFltCalleeTrash |= RBM_HIGHFLOAT; - rbmCntCalleeTrashFloat += 16; + cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT; } #endif // TARGET_AMD64 } @@ -10298,5 +10298,5 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const // to TARGET_AMD64 only. regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; -unsigned rbmCntCalleeTrashFloat; +unsigned cntCalleeTrashFloat; #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 15c19ca7a1e2ae..512f685a0ace81 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -11561,7 +11561,7 @@ extern const BYTE genActualTypes[]; // to TARGET_AMD64 only. extern regMaskTP rbmAllFloat; extern regMaskTP rbmFltCalleeTrash; -extern unsigned rbmCntCalleeTrashFloat; +extern unsigned cntCalleeTrashFloat; #endif // TARGET_AMD64 /*****************************************************************************/ diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index a4bedfca08f2b6..bacb22173cedfa 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -176,10 +176,6 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic supports some sort of containment analysis HW_Flag_SupportsContainment = 0x2000, - - // The intrinsic does not have an EVEX compatible form - HW_Flag_NoEvexSemantics = 0x4000 - #else #error Unsupported platform #endif @@ -775,8 +771,8 @@ struct HWIntrinsicInfo // static bool HasEvexSemantics(NamedIntrinsic id) { - HWIntrinsicFlag flags = lookupFlags(id); #if defined(TARGET_XARCH) + HWIntrinsicFlag flags = lookupFlags(id); return (flags & HW_Flag_NoEvexSemantics) == 0; #else return false; diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index c57404f4cde9b1..3186cdde4e1250 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -150,10 +150,10 @@ enum _regNumber_enum : unsigned ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs) }; -enum _regMask_enum : unsigned __int64 +enum _regMask_enum : uint64_t { RBM_NONE = 0, - RBM_LOWSIMD = 1ULL << 63, + RBM_LOWSIMD = UI64(1) << 63, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index d8535dcec2d578..1576486e6c1268 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -225,8 +225,9 @@ #define CNT_CALLEE_SAVED_FLOAT (0) #define CNT_CALLEE_TRASH_FLOAT_INIT (16) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) /* NOTE: Sync with variable name defined in compiler.h */ - #define CNT_CALLEE_TRASH_FLOAT rbmCntCalleeTrashFloat + #define CNT_CALLEE_TRASH_FLOAT cntCalleeTrashFloat #define REG_CALLEE_SAVED_ORDER 
REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 @@ -237,8 +238,9 @@ #define CNT_CALLEE_SAVED_FLOAT (10) #define CNT_CALLEE_TRASH_FLOAT_INIT (6) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) /* NOTE: Sync with variable name defined in compiler.h */ - #define CNT_CALLEE_TRASH_FLOAT rbmCntCalleeTrashFloat + #define CNT_CALLEE_TRASH_FLOAT cntCalleeTrashFloat #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 From 2b56df7a64337c54b8bcedd2567ce0ef9fab397e Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 19 Dec 2022 08:50:38 -0800 Subject: [PATCH 19/34] Allow `emitinl.h` access to the `rbm` variables. --- src/coreclr/jit/emitinl.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 354fc23363a2f7..42a178d4f4759a 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -207,15 +207,22 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) #endif // TARGET_XARCH +// TODO-XARCH-AVX512 the following are defined via compiler.h but re-defining via +// extern here to avoid having to introduce a dependency of compiler.h on to +// emitinl.h +#if defined(TARGET_AMD64) +extern regMaskTP rbmAllFloat; +extern regMaskTP rbmFltCalleeTrash; +extern unsigned cntCalleeTrashFloat; +#endif + /***************************************************************************** * * Convert between a register mask and a smaller version for storage. */ - /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - // TODO-XARCH-AVX512 global defined in compiler.h, not in scope here - // assert((regmask & RBM_CALLEE_TRASH) == 0); + assert((regmask & RBM_CALLEE_TRASH) == 0); unsigned encodeMask; From fa1a550519d5be47186b0833ad37e73a261d3927 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 9 Jan 2023 15:30:04 -0800 Subject: [PATCH 20/34] Replace RBM_LOWSIMD with `BuildEvexIncompatibleMask`. 
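`RBM_LOWSIMD` was a sentinel bit (bit 63 of the mask) that `newRefPosition` had to intercept and translate into either `lowSIMDRegs()` or `RBM_NONE` depending on the interval's type. Computing the candidate set directly where uses and defs are built removes that special case and frees the mask bit for a real register. A condensed sketch of the new helper, with the reasoning spelled out in comments (the full implementation is in the lsraxarch.cpp hunk below):

    // Only float/SIMD values live in XMM registers, and only those consumed
    // by a non-EVEX-encodable instruction must stay in xmm0-xmm15.
    regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree)
    {
        if (!(varTypeIsFloating(tree->gtType) || varTypeIsSIMD(tree->gtType)))
        {
            return RBM_NONE; // no constraint needed; RBM_NONE means "all regs" here
        }
        // Contained memory operands (indirections, contained memory-load
        // intrinsics, LEAs) are encoded via an address, not a register, so
        // they need no constraint either.
        if (tree->isContained() &&
            (tree->OperIsIndir() ||
             (tree->OperIs(GT_HWINTRINSIC) && tree->AsHWIntrinsic()->OperIsMemoryLoad()) ||
             tree->OperIs(GT_LEA)))
        {
            return RBM_NONE;
        }
        return lowSIMDRegs(); // constrain to xmm0-xmm15
    }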
--- src/coreclr/jit/compiler.cpp | 6 +- src/coreclr/jit/emitxarch.cpp | 4 +- src/coreclr/jit/lsra.h | 19 ++++--- src/coreclr/jit/lsrabuild.cpp | 14 ----- src/coreclr/jit/lsraxarch.cpp | 100 +++++++++++++++------------------- src/coreclr/jit/target.h | 3 +- 6 files changed, 61 insertions(+), 85 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 806f1ac38f3cf6..491d014675dd87 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3327,9 +3327,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags) #endif // defined(DEBUG) && defined(TARGET_ARM64) #if defined(TARGET_AMD64) - rbmAllFloat = RBM_ALLFLOAT_INIT; - rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; - cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + rbmAllFloat = RBM_ALLFLOAT_INIT; + rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; + cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; if (DoJitStressEvexEncoding()) { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 4dd327988003b6..9fb35e9f94375a 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -2814,8 +2814,8 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber { assert(hasEvexPrefix(code) && TakesEvexPrefix(id)); - // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. - // Rather see these paths cleaned up. +// TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. +// Rather see these paths cleaned up. #if defined(TARGET_AMD64) regBits = HighAwareRegEncoding(reg); diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 0c01f924d140c5..519c29a14d8157 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -736,17 +736,19 @@ class LinearScan : public LinearScanInterface unsigned lsraStressMask; // This controls the registers available for allocation - enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, - LSRA_LIMIT_CALLEE = 0x1, - LSRA_LIMIT_CALLER = 0x2, - LSRA_LIMIT_SMALL_SET = 0x3, + enum LsraStressLimitRegs + { + LSRA_LIMIT_NONE = 0, + LSRA_LIMIT_CALLEE = 0x1, + LSRA_LIMIT_CALLER = 0x2, + LSRA_LIMIT_SMALL_SET = 0x3, #if defined(TARGET_AMD64) - LSRA_LIMIT_UPPER_SIMD_SET = 0x2000, - LSRA_LIMIT_MASK = 0x2003 + LSRA_LIMIT_UPPER_SIMD_SET = 0x2000, + LSRA_LIMIT_MASK = 0x2003 #else - LSRA_LIMIT_MASK = 0x3 + LSRA_LIMIT_MASK = 0x3 #endif - }; + }; // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save // registers, so as to get different coverage than limiting to callee or caller. @@ -1859,6 +1861,7 @@ class LinearScan : public LinearScanInterface int BuildCastUses(GenTreeCast* cast, regMaskTP candidates); #ifdef TARGET_XARCH int BuildRMWUses(GenTree* node, GenTree* op1, GenTree* op2, regMaskTP candidates = RBM_NONE); + inline regMaskTP BuildEvexIncompatibleMask(GenTree* tree); #endif // !TARGET_XARCH int BuildSelect(GenTreeOp* select); // This is the main entry point for building the RefPositions for a node. 
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index f56960d492f8e9..553fc192f1e546 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -557,20 +557,6 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval, { if (theInterval != nullptr) { -#if defined(TARGET_AMD64) - if (mask == RBM_LOWSIMD) - { - // Constrain if we have to for float/simd types - if (varTypeIsFloating(theInterval->registerType) || varTypeIsSIMD(theInterval->registerType)) - { - mask = lowSIMDRegs(); - } - else - { - mask = RBM_NONE; - } - } -#endif if (mask == RBM_NONE) { mask = allRegs(theInterval->registerType); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 0b5c58c4c942a4..ef499520822196 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -156,12 +156,7 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = 0; assert(dstCount == 1); assert(!tree->IsReuseRegVal()); -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif - RefPosition* def = BuildDef(tree, opRegMask); + RefPosition* def = BuildDef(tree, BuildEvexIncompatibleMask(tree)); def->getInterval()->isConstant = true; } break; @@ -1891,29 +1886,23 @@ int LinearScan::BuildIntrinsic(GenTree* tree) } assert(tree->gtGetOp2IfPresent() == nullptr); -// TODO-XARCH-AVX512 this is overly constraining register available as NI_System_Math_Abs -// can be lowered to EVEX compatible instruction (the rest cannot) -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif - + // TODO-XARCH-AVX512 this is overly constraining register available as NI_System_Math_Abs + // can be lowered to EVEX compatible instruction (the rest cannot) int srcCount; if (op1->isContained()) { - srcCount = BuildOperandUses(op1, opRegMask); + srcCount = BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); } else { - tgtPrefUse = BuildUse(op1, opRegMask); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount = 1; } if (internalFloatDef != nullptr) { buildInternalRegisterUses(); } - BuildDef(tree, opRegMask); + BuildDef(tree, BuildEvexIncompatibleMask(tree)); return srcCount; } @@ -2106,14 +2095,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1, opRegMask); - srcCount += BuildOperandUses(op2, opRegMask); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)); srcCount += BuildOperandUses(op3, RBM_EDI); buildUses = false; @@ -2128,18 +2112,12 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(isRMW); -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1, opRegMask); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount += 1; - srcCount += - op2->isContained() ? BuildOperandUses(op2, opRegMask) : BuildDelayFreeUses(op2, op1, opRegMask); + srcCount += op2->isContained() ? 
BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)) + : BuildDelayFreeUses(op2, op1, BuildEvexIncompatibleMask(op2)); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2333,15 +2311,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif - // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1, opRegMask); - srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); // op3 should always be contained assert(op3->isContained()); @@ -2362,17 +2334,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op4 = intrinsicTree->Op(4); GenTree* op5 = intrinsicTree->Op(5); -#if defined(TARGET_AMD64) - regMaskTP opRegMask = RBM_LOWSIMD; -#else - regMaskTP opRegMask = RBM_NONE; -#endif - // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1, opRegMask); - srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); - srcCount += BuildDelayFreeUses(op3, nullptr, opRegMask); - srcCount += BuildDelayFreeUses(op4, nullptr, opRegMask); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); + srcCount += BuildDelayFreeUses(op3, nullptr, BuildEvexIncompatibleMask(op3)); + srcCount += BuildDelayFreeUses(op4, nullptr, BuildEvexIncompatibleMask(op4)); // op5 should always be contained assert(op5->isContained()); @@ -2400,7 +2366,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) if (!isEvexCompatible) { - op1RegCandidates = RBM_LOWSIMD; + op1RegCandidates = BuildEvexIncompatibleMask(op1); } #endif @@ -2424,7 +2390,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) if (!isEvexCompatible) { - op2RegCandidates = RBM_LOWSIMD; + op2RegCandidates = BuildEvexIncompatibleMask(op2); } #endif if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) @@ -2468,7 +2434,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou #if defined(TARGET_AMD64) if (!isEvexCompatible) { - op3RegCandidates = RBM_LOWSIMD; + op3RegCandidates = BuildEvexIncompatibleMask(op3); } #endif srcCount += isRMW ? 
BuildDelayFreeUses(op3, op1, op3RegCandidates) @@ -2483,9 +2449,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { #if defined(TARGET_AMD64) - if (!intrinsicTree->isEvexCompatibleHWIntrinsic()) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic() && + (varTypeIsFloating(intrinsicTree->gtType) || varTypeIsSIMD(intrinsicTree->gtType))) { - dstCandidates = RBM_LOWSIMD; + dstCandidates = lowSIMDRegs(); } #endif @@ -2772,4 +2739,25 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) } } +inline regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree) +{ +#if defined(TARGET_AMD64) + if (!(varTypeIsFloating(tree->gtType) || varTypeIsSIMD(tree->gtType))) + { + return RBM_NONE; + } + + if (tree->isContained() && + (tree->OperIsIndir() || (tree->OperIs(GT_HWINTRINSIC) && tree->AsHWIntrinsic()->OperIsMemoryLoad()) || + tree->OperIs(GT_LEA))) + { + return RBM_NONE; + } + + return lowSIMDRegs(); +#else + return RBM_NONE; +#endif +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 3186cdde4e1250..25e357224aac37 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -152,8 +152,7 @@ enum _regNumber_enum : unsigned enum _regMask_enum : uint64_t { - RBM_NONE = 0, - RBM_LOWSIMD = UI64(1) << 63, + RBM_NONE = 0, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, From 564dc816fd37b40af04f9a4d12131c45e87d2bb1 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Thu, 12 Jan 2023 08:24:32 -0800 Subject: [PATCH 21/34] Move AVX512-dependent `targetamd64.h` vars into the compiler object. --- src/coreclr/jit/codegencommon.cpp | 23 ++++++++++++++++ src/coreclr/jit/codegenlinear.cpp | 13 +++++++++ src/coreclr/jit/codegenxarch.cpp | 12 +++++++++ src/coreclr/jit/compiler.cpp | 13 +-------- src/coreclr/jit/compiler.h | 30 +++++++++++++-------- src/coreclr/jit/emit.cpp | 13 +++++++++ src/coreclr/jit/emitinl.h | 23 +++++++++------- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 13 +++++++++ src/coreclr/jit/lsra.cpp | 15 ++++++++++- src/coreclr/jit/lsra.h | 13 ++++++++- src/coreclr/jit/lsrabuild.cpp | 12 +++++++++ src/coreclr/jit/optimizer.cpp | 13 +++++++++ src/coreclr/jit/targetamd64.h | 18 +++++++------ 13 files changed, 168 insertions(+), 43 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 1be119806114c4..546215c42d396b 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -29,6 +29,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "patchpointinfo.h" +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (this->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (this->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (this->cntCalleeTrashFloat) +#endif + /*****************************************************************************/ void CodeGenInterface::setFramePointerRequiredEH(bool value) @@ -775,6 +782,17 @@ void Compiler::compChangeLife(VARSET_VALARG_TP newLife) } } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE + +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + + // Need an explicit
instantiation. template void Compiler::compChangeLife(VARSET_VALARG_TP newLife); @@ -9412,3 +9430,8 @@ bool CodeGen::genCanOmitNormalizationForBswap16(GenTree* tree) return (cast->gtCastType == TYP_USHORT) || (cast->gtCastType == TYP_SHORT); } + + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index b510ce4a558dc5..b784eaa0e10135 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -17,6 +17,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + + //------------------------------------------------------------------------ // genInitializeRegisterState: Initialize the register state contained in 'regSet'. // @@ -2684,3 +2692,8 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) genProduceReg(setcc); } #endif // !TARGET_LOONGARCH64 + + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f933f685ad6488..8109b35b74790d 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -15,6 +15,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma warning(disable : 4310) // cast truncates constant value - happens for (int8_t)0xb1 #endif + #ifdef TARGET_XARCH #include "emit.h" #include "codegen.h" @@ -23,6 +24,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "gcinfoencoder.h" #include "patchpointinfo.h" +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + + //--------------------------------------------------------------------- // genSetGSSecurityCookie: Set the "GS" security cookie in the prolog. // @@ -11048,3 +11056,7 @@ bool CodeGenInterface::genCodeAddrNeedsReloc(size_t addr) } #endif // TARGET_XARCH + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 491d014675dd87..43a93c35b08033 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -10288,15 +10288,4 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const PRINT_STATS(m_stressLclFld, m_addrExposed); PRINT_STATS(m_dispatchRetBuf, m_addrExposed); } -#endif // TRACK_ENREG_STATS - -#if defined(TARGET_AMD64) -// The following are for initializing register allocator "constants" defined in targetamd64.h -// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases -// the number of simd (xmm,ymm, and zmm) registers from 16 to 32. -// As only 64-bit xarch has the capability to have the additional registers, we limit the changes -// to TARGET_AMD64 only. 
-regMaskTP rbmAllFloat; -regMaskTP rbmFltCalleeTrash; -unsigned cntCalleeTrashFloat; -#endif // TARGET_AMD64 +#endif // TRACK_ENREG_STATS \ No newline at end of file diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 512f685a0ace81..0bcaf7374cfc34 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10635,6 +10635,25 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX GenTree* fgMorphMultiregStructArg(CallArg* arg); bool killGCRefs(GenTree* tree); + +#if defined(TARGET_AMD64) +public: + // The following are for initializing register allocator "constants" defined in targetamd64.h + // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases + // the number of simd (xmm,ymm, and zmm) registers from 16 to 32. + // As only 64-bit xarch has the capability to have the additional registers, we limit the changes + // to TARGET_AMD64 only. + // + // Users of `targetamd.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, + // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. + // We did this to avoid poluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // TARGET_AMD64 requires one. + regMaskTP rbmAllFloat; + regMaskTP rbmFltCalleeTrash; + unsigned cntCalleeTrashFloat; +#endif // TARGET_AMD64 + + }; // end of class Compiler //--------------------------------------------------------------------------------------------------------------------- @@ -11553,17 +11572,6 @@ extern const BYTE genActualTypes[]; /*****************************************************************************/ -#if defined(TARGET_AMD64) -// The following are for initializing register allocator "constants" defined in targetamd64.h -// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases -// the number of simd (xmm,ymm, and zmm) registers from 16 to 32. -// As only 64-bit xarch has the capability to have the additional registers, we limit the changes -// to TARGET_AMD64 only. -extern regMaskTP rbmAllFloat; -extern regMaskTP rbmFltCalleeTrash; -extern unsigned cntCalleeTrashFloat; -#endif // TARGET_AMD64 - /*****************************************************************************/ #ifdef DEBUG diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 692eb10d12359e..1a218385eb5d2c 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -20,6 +20,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (emitComp->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (emitComp->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (emitComp->cntCalleeTrashFloat) +#endif + + /***************************************************************************** * * Represent an emitter location. 
@@ -9945,3 +9953,8 @@ void emitter::emitEnableGC() } } #endif // !defined(JIT32_GCENCODER) + + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 42a178d4f4759a..39b3f0678a3134 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -7,6 +7,13 @@ #ifdef TARGET_XARCH +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (emitComp->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (emitComp->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (emitComp->cntCalleeTrashFloat) +#endif + /* static */ inline bool emitter::instrIs3opImul(instruction ins) { @@ -207,22 +214,13 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) #endif // TARGET_XARCH -// TODO-XARCH-AVX512 the following are defined via compiler.h but re-defining via -// extern here to avoid having to introduce a dependency of compiler.h on to -// emitinl.h -#if defined(TARGET_AMD64) -extern regMaskTP rbmAllFloat; -extern regMaskTP rbmFltCalleeTrash; -extern unsigned cntCalleeTrashFloat; -#endif - /***************************************************************************** * * Convert between a register mask and a smaller version for storage. */ /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - assert((regmask & RBM_CALLEE_TRASH) == 0); + //assert((regmask & RBM_CALLEE_TRASH) == 0); unsigned encodeMask; @@ -548,6 +546,11 @@ bool emitter::emitGenNoGCLst(Callback& cb) return true; } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE + /*****************************************************************************/ #endif //_EMITINL_H_ /*****************************************************************************/ + diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index cb114b2d197010..b4caf59e096e11 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -23,6 +23,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "gcinfo.h" #include "gcinfoencoder.h" +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + + //------------------------------------------------------------------------ // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node // @@ -1997,4 +2005,9 @@ void CodeGen::genX86SerializeIntrinsic(GenTreeHWIntrinsic* node) genProduceReg(node); } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT + #endif // FEATURE_HW_INTRINSICS + diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 644c7ebea1beb6..5ec57974e07615 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -101,6 +101,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX const char* LinearScan::resolveTypeName[] = {"Split", "Join", "Critical", "SharedCritical"}; #endif // DEBUG +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define 
RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XX XX @@ -8979,6 +8986,7 @@ void dumpRegMask(regMaskTP regs) { printf("[allIntButFP]"); } + /* else if (regs == RBM_ALLFLOAT) { printf("[allFloat]"); @@ -8987,6 +8995,7 @@ void dumpRegMask(regMaskTP regs) { printf("[allDouble]"); } + */ else { dspRegMask(regs); @@ -11888,7 +11897,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } else { - callerCalleePrefs = callerSaveRegs(currentInterval->registerType); + callerCalleePrefs = callerSaveRegs(currentInterval->registerType, linearScan->compiler); } // If this has a delayed use (due to being used in a rmw position of a @@ -12052,3 +12061,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, foundRegBit = candidates; return candidates; } + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 519c29a14d8157..be550fb34c416f 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -64,6 +64,13 @@ inline bool registerTypesEquivalent(RegisterType a, RegisterType b) return varTypeIsIntegralOrI(a) == varTypeIsIntegralOrI(b); } +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + //------------------------------------------------------------------------ // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType // @@ -75,11 +82,15 @@ inline regMaskTP calleeSaveRegs(RegisterType rt) //------------------------------------------------------------------------ // callerSaveRegs: Get the set of caller-save registers of the given RegisterType // -inline regMaskTP callerSaveRegs(RegisterType rt) +inline regMaskTP callerSaveRegs(RegisterType rt, Compiler *compiler) { return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE + //------------------------------------------------------------------------ // RefInfo: Captures the necessary information for a definition that is "in-flight" // during `buildIntervals` (i.e. 
a tree-node definition has been encountered, diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 553fc192f1e546..72d777bc31d7ba 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -21,6 +21,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "lsra.h" +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif + //------------------------------------------------------------------------ // RefInfoList //------------------------------------------------------------------------ @@ -4142,3 +4149,8 @@ int LinearScan::BuildCmp(GenTree* tree) } return srcCount; } + + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 50555a7c1a256a..4e68b0365b5768 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -15,6 +15,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma hdrstop #endif +// Please see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (this->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (this->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (this->cntCalleeTrashFloat) +#endif + + /*****************************************************************************/ void Compiler::optInit() @@ -10699,3 +10707,8 @@ void Compiler::optMarkLoopRemoved(unsigned loopNum) // `fgDebugCheckLoopTable()` is called. 
#endif // DEBUG } + + +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 1576486e6c1268..bf30332723399b 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -82,8 +82,9 @@ #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT - /* NOTE: Sync with variable name defined in compiler.h */ - #define RBM_ALLFLOAT rbmAllFloat + + /* NOTE: Callee must define the use, which should point to the Compiler object rbmAllFloat field */ + #define RBM_ALLFLOAT RBM_ALLFLOAT_USE #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 @@ -127,7 +128,6 @@ /* NOTE: Sync with variable name defined in compiler.h */ #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) - #define RBM_FLT_CALLEE_TRASH rbmFltCalleeTrash #define REG_PROFILER_ENTER_ARG_0 REG_R14 #define RBM_PROFILER_ENTER_ARG_0 RBM_R14 @@ -144,15 +144,17 @@ #define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) /* NOTE: Sync with variable name defined in compiler.h */ #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) - #define RBM_FLT_CALLEE_TRASH rbmFltCalleeTrash #endif // !UNIX_AMD64_ABI + /* NOTE: Callee must define the use, which should point to the Compiler object rbmFltCalleeTrash field */ + #define RBM_FLT_CALLEE_TRASH RBM_FLT_CALLEE_TRASH_USE + #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | rbmFltCalleeTrash) + #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) @@ -227,7 +229,6 @@ #define CNT_CALLEE_TRASH_FLOAT_INIT (16) #define CNT_CALLEE_TRASH_HIGHFLOAT (16) /* NOTE: Sync with variable name defined in compiler.h */ - #define CNT_CALLEE_TRASH_FLOAT cntCalleeTrashFloat #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 @@ -240,12 +241,13 @@ #define CNT_CALLEE_TRASH_FLOAT_INIT (6) #define CNT_CALLEE_TRASH_HIGHFLOAT (16) /* NOTE: Sync with variable name defined in compiler.h */ - #define CNT_CALLEE_TRASH_FLOAT cntCalleeTrashFloat - #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 #endif // !UNIX_AMD64_ABI + /* NOTE: Callee must define the use, which should point to the Compiler object cntCalleeTrashFloat field */ + #define CNT_CALLEE_TRASH_FLOAT CNT_CALLEE_TRASH_FLOAT_USE + #define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) #define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) From 5ea489df3ec52cc9c6f802842ade72147a72005e Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 17 Jan 2023 10:13:04 -0800 Subject: [PATCH 22/34] Fixing some edge cases for 
`targetamd64.h` variables. --- src/coreclr/jit/codegencommon.cpp | 10 +++---- src/coreclr/jit/codegenlinear.cpp | 6 ++-- src/coreclr/jit/codegenxarch.cpp | 8 ++--- src/coreclr/jit/compiler.h | 5 ++-- src/coreclr/jit/emit.cpp | 8 ++--- src/coreclr/jit/emitinl.h | 11 +++---- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 8 ++--- src/coreclr/jit/lsra.cpp | 33 +++++++++++++-------- src/coreclr/jit/lsra.h | 6 ++-- src/coreclr/jit/lsrabuild.cpp | 11 ++++--- src/coreclr/jit/optimizer.cpp | 8 ++--- 11 files changed, 53 insertions(+), 61 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 546215c42d396b..3edcc728e2b7ed 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -782,8 +782,8 @@ void Compiler::compChangeLife(VARSET_VALARG_TP newLife) } } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE #undef CNT_CALLEE_TRASH_FLOAT_USE #if defined(TARGET_AMD64) @@ -792,7 +792,6 @@ void Compiler::compChangeLife(VARSET_VALARG_TP newLife) #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) #endif - // Need an explicit instantiation. template void Compiler::compChangeLife(VARSET_VALARG_TP newLife); @@ -9431,7 +9430,6 @@ bool CodeGen::genCanOmitNormalizationForBswap16(GenTree* tree) return (cast->gtCastType == TYP_USHORT) || (cast->gtCastType == TYP_SHORT); } - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE #undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index b784eaa0e10135..05b304c74605a8 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -24,7 +24,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) #endif - //------------------------------------------------------------------------ // genInitializeRegisterState: Initialize the register state contained in 'regSet'. // @@ -2693,7 +2692,6 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) } #endif // !TARGET_LOONGARCH64 - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE #undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 8109b35b74790d..9a5cd2729dd0ab 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -15,7 +15,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma warning(disable : 4310) // cast truncates constant value - happens for (int8_t)0xb1 #endif - #ifdef TARGET_XARCH #include "emit.h" #include "codegen.h" @@ -30,7 +29,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) #endif - //--------------------------------------------------------------------- // genSetGSSecurityCookie: Set the "GS" security cookie in the prolog.
// @@ -11057,6 +11055,6 @@ bool CodeGenInterface::genCodeAddrNeedsReloc(size_t addr) #endif // TARGET_XARCH -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 0bcaf7374cfc34..81d58422de0f1f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10645,15 +10645,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // to TARGET_AMD64 only. // // Users of `targetamd.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, - // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. + // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. // We did this to avoid poluting all `targetXXX.h` macro definitions with a compiler parameter, where only - // TARGET_AMD64 requires one. + // TARGET_AMD64 requires one. regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; unsigned cntCalleeTrashFloat; #endif // TARGET_AMD64 - }; // end of class Compiler //--------------------------------------------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 1a218385eb5d2c..1472864938956a 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -27,7 +27,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define CNT_CALLEE_TRASH_FLOAT_USE (emitComp->cntCalleeTrashFloat) #endif - /***************************************************************************** * * Represent an emitter location. 
@@ -3386,6 +3385,7 @@ emitter::instrDesc* emitter::emitNewInstrCallInd(int argCnt, #endif // TARGET_XARCH /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; @@ -3458,6 +3458,7 @@ emitter::instrDesc* emitter::emitNewInstrCallDir(int argCnt, assert(!id->idIsLargeCns()); /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; @@ -9954,7 +9955,6 @@ void emitter::emitEnableGC() } #endif // !defined(JIT32_GCENCODER) - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE #undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 39b3f0678a3134..9ded94c323dbac 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -212,6 +212,10 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) return id->idAddr()->iiaAddrMode.amDisp; } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE + #endif // TARGET_XARCH /***************************************************************************** @@ -220,8 +224,6 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) */ /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - //assert((regmask & RBM_CALLEE_TRASH) == 0); - unsigned encodeMask; #ifdef TARGET_X86 @@ -546,11 +548,6 @@ bool emitter::emitGenNoGCLst(Callback& cb) return true; } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE - /*****************************************************************************/ #endif //_EMITINL_H_ /*****************************************************************************/ - diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index b4caf59e096e11..409e9af56ef6a9 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -30,7 +30,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) #endif - //------------------------------------------------------------------------ // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node // @@ -2005,9 +2004,8 @@ void CodeGen::genX86SerializeIntrinsic(GenTreeHWIntrinsic* node) genProduceReg(node); } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT #endif // FEATURE_HW_INTRINSICS - diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 5ec57974e07615..4b079ad432da8b 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -1894,7 +1894,7 @@ void LinearScan::identifyCandidates() } } JITDUMP(" "); - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(compiler)); } else { @@ -5693,7 +5693,7 @@ void LinearScan::allocateRegisters() if (interval.isActive) { printf("Active "); - interval.dump(); + interval.dump(this->compiler); } } @@ -8975,8 +8975,18 @@ void LinearScan::dumpLsraStatsSummary(FILE* file) } #endif // TRACK_LSRA_STATS +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE + #ifdef DEBUG -void dumpRegMask(regMaskTP regs) +// Please 
see the comment for these instance variables in `compiler.h` +#if defined(TARGET_AMD64) +#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) +#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) +#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#endif +void dumpRegMask(regMaskTP regs, Compiler* compiler) { if (regs == RBM_ALLINT) { @@ -8986,7 +8996,6 @@ void dumpRegMask(regMaskTP regs) { printf("[allIntButFP]"); } - /* else if (regs == RBM_ALLFLOAT) { printf("[allFloat]"); @@ -8995,12 +9004,14 @@ void dumpRegMask(regMaskTP regs) { printf("[allDouble]"); } - */ else { dspRegMask(regs); } } +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT_USE static const char* getRefTypeName(RefType refType) { @@ -9072,7 +9083,7 @@ void RefPosition::dump(LinearScan* linearScan) printf(FMT_BB " ", this->bbNum); printf("regmask="); - dumpRegMask(registerAssignment); + dumpRegMask(registerAssignment, linearScan->compiler); printf(" minReg=%d", minRegCandidateCount); @@ -9135,7 +9146,7 @@ void RegRecord::dump() tinyDump(); } -void Interval::dump() +void Interval::dump(Compiler* compiler) { printf("Interval %2u:", intervalIndex); @@ -9208,7 +9219,7 @@ void Interval::dump() printf(" physReg:%s", getRegName(physReg)); printf(" Preferences="); - dumpRegMask(this->registerPreferences); + dumpRegMask(this->registerPreferences, compiler); if (relatedInterval) { @@ -9290,7 +9301,7 @@ void LinearScan::lsraDumpIntervals(const char* msg) { // only dump something if it has references // if (interval->firstRefPosition) - interval.dump(); + interval.dump(this->compiler); } printf("\n"); @@ -12061,7 +12072,3 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, foundRegBit = candidates; return candidates; } - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index be550fb34c416f..d002ed95781da6 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -82,7 +82,7 @@ inline regMaskTP calleeSaveRegs(RegisterType rt) //------------------------------------------------------------------------ // callerSaveRegs: Get the set of caller-save registers of the given RegisterType // -inline regMaskTP callerSaveRegs(RegisterType rt, Compiler *compiler) +inline regMaskTP callerSaveRegs(RegisterType rt, Compiler* compiler) { return varTypeIsIntegralOrI(rt) ? 
RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; } @@ -2003,7 +2003,7 @@ class Interval : public Referenceable #ifdef DEBUG // print out representation - void dump(); + void dump(Compiler* compiler); // concise representation for embedding void tinyDump(); // extremely concise representation @@ -2539,7 +2539,7 @@ class RefPosition }; #ifdef DEBUG -void dumpRegMask(regMaskTP regs); +void dumpRegMask(regMaskTP regs, Compiler* compiler); #endif // DEBUG /*****************************************************************************/ diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 72d777bc31d7ba..ba91d77dd3397e 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -166,7 +166,7 @@ Interval* LinearScan::newInterval(RegisterType theRegisterType) newInt->intervalIndex = static_cast(intervals.size() - 1); #endif // DEBUG - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(this->compiler)); return newInt; } @@ -3018,7 +3018,7 @@ void LinearScan::UpdatePreferencesOfDyingLocal(Interval* interval) { printf("Last use of V%02u between PUTARG and CALL. Removing occupied arg regs from preferences: ", compiler->lvaTrackedIndexToLclNum(varIndex)); - dumpRegMask(unpref); + dumpRegMask(unpref, this->compiler); printf("\n"); } #endif @@ -4150,7 +4150,6 @@ int LinearScan::BuildCmp(GenTree* tree) return srcCount; } - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 4e68b0365b5768..fc78cca58a4791 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -22,7 +22,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define CNT_CALLEE_TRASH_FLOAT_USE (this->cntCalleeTrashFloat) #endif - /*****************************************************************************/ void Compiler::optInit() @@ -10708,7 +10707,6 @@ void Compiler::optMarkLoopRemoved(unsigned loopNum) #endif // DEBUG } - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +#undef RBM_ALLFLOAT_USE +#undef RBM_FLT_CALLEE_TRASH_USE +#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file From e60aece3c5b6cc73ba01f3674f4bbf60691859c7 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 18 Jan 2023 15:31:02 -0800 Subject: [PATCH 23/34] Fix a merge/rebase bug. 
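As far as I can tell, the rebase re-added the Vector128/Vector256 `get_Count` rows to hwintrinsiclistxarch.h after they had been removed upstream; the hunks below delete them again.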
--- src/coreclr/jit/hwintrinsiclistxarch.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 55a92725760718..f474d5387333f4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -69,7 +69,6 @@ HARDWARE_INTRINSIC(Vector128, EqualsAny, HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -173,7 +172,6 @@ HARDWARE_INTRINSIC(Vector256, EqualsAny, HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) From b28a2311105eecff27d04c641f2382e7f10a9f65 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Wed, 25 Jan 2023 16:15:10 -0800 Subject: [PATCH 24/34] Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall --- src/coreclr/jit/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 81d58422de0f1f..590187b22f6171 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10644,7 +10644,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // As only 64-bit xarch has the capability to have the additional registers, we limit the changes // to TARGET_AMD64 only. // - // Users of `targetamd.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, + // Users of `targetamd64.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. // We did this to avoid poluting all `targetXXX.h` macro definitions with a compiler parameter, where only // TARGET_AMD64 requires one. From 37af7c37b7f83d8d618b325dcdd92402cc6c91e1 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Wed, 25 Jan 2023 16:15:50 -0800 Subject: [PATCH 25/34] Update src/coreclr/jit/lsra.cpp Co-authored-by: Bruce Forstall --- src/coreclr/jit/lsra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 4b079ad432da8b..f8becf198d2637 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -707,7 +707,7 @@ LinearScan::LinearScan(Compiler* theCompiler) #endif // TARGET_AMD64 || TARGET_ARM64 #if defined(TARGET_AMD64) - // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we independetly + // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we independently // allow EVEX use from the stress flag (currently, if EVEX stress is turned off, // we cannot use EVEX at all) if (compiler->DoJitStressEvexEncoding()) From e9be8f85c63d8c72bcc5ccc3da6c48501b4b0f85 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Wed, 25 Jan 2023 16:16:14 -0800 Subject: [PATCH 26/34] Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall --- src/coreclr/jit/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 590187b22f6171..e9e262c39f2ca7 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10646,7 +10646,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // // Users of `targetamd64.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. 
- // We did this to avoid poluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // We did this to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only // TARGET_AMD64 requires one. regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; From f680448a5fbde0ff1a8acd5748cd112ae7e57b72 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 25 Jan 2023 16:23:16 -0800 Subject: [PATCH 27/34] Fix nits. --- src/coreclr/jit/codegencommon.cpp | 6 +----- src/coreclr/jit/codegenlinear.cpp | 6 +----- src/coreclr/jit/codegenxarch.cpp | 6 +----- src/coreclr/jit/compiler.h | 4 ---- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 4 ---- src/coreclr/jit/lsra.cpp | 4 +++- src/coreclr/jit/lsrabuild.cpp | 6 +----- src/coreclr/jit/optimizer.cpp | 6 +----- src/coreclr/jit/targetamd64.h | 8 +++++--- 9 files changed, 13 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 3edcc728e2b7ed..f058645581e203 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -9428,8 +9428,4 @@ bool CodeGen::genCanOmitNormalizationForBswap16(GenTree* tree) } return (cast->gtCastType == TYP_USHORT) || (cast->gtCastType == TYP_SHORT); -} - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file +} \ No newline at end of file diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 05b304c74605a8..e3f697b06554e7 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -2690,8 +2690,4 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) inst_SETCC(setcc->gtCondition, setcc->TypeGet(), setcc->GetRegNum()); genProduceReg(setcc); } -#endif // !TARGET_LOONGARCH64 - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file +#endif // !TARGET_LOONGARCH64 \ No newline at end of file diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 9a5cd2729dd0ab..5bf6a4966504d6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -11053,8 +11053,4 @@ bool CodeGenInterface::genCodeAddrNeedsReloc(size_t addr) #endif // TARGET_X86 } -#endif // TARGET_XARCH - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +#endif // TARGET_XARCH \ No newline at end of file diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 81d58422de0f1f..6f1ec940e588b6 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -11569,10 +11569,6 @@ extern const BYTE genActualTypes[]; /*****************************************************************************/ -/*****************************************************************************/ - -/*****************************************************************************/ - #ifdef DEBUG void dumpConvertedVarSet(Compiler* comp, VARSET_VALARG_TP vars); #endif // DEBUG diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 409e9af56ef6a9..fdbc20c2c77c5f 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -2004,8 +2004,4 @@ void CodeGen::genX86SerializeIntrinsic(GenTreeHWIntrinsic* node) genProduceReg(node); } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT - #endif 
// FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 4b079ad432da8b..80417cfd663be9 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -8981,6 +8981,8 @@ void LinearScan::dumpLsraStatsSummary(FILE* file) #ifdef DEBUG // Please see the comment for these instance variables in `compiler.h` +// Here, the `compiler` object used must match the parameter name that +// follows. If it is changed below, change it here. #if defined(TARGET_AMD64) #define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) #define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) @@ -12071,4 +12073,4 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, assert(found && isSingleRegister(candidates)); foundRegBit = candidates; return candidates; -} +} \ No newline at end of file diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index ba91d77dd3397e..9e917f7698bb6d 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -4148,8 +4148,4 @@ int LinearScan::BuildCmp(GenTree* tree) BuildDef(tree, dstCandidates); } return srcCount; -} - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +} \ No newline at end of file diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index fc78cca58a4791..744884a32e06bd 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -10705,8 +10705,4 @@ void Compiler::optMarkLoopRemoved(unsigned loopNum) // Assume the caller is going to fix up the table and `bbNatLoopNum` block annotations before the next time // `fgDebugCheckLoopTable()` is called. #endif // DEBUG -} - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT \ No newline at end of file +} \ No newline at end of file diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index bf30332723399b..ec310a4ab2128a 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -83,7 +83,7 @@ #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT - /* NOTE: Callee must define the use, which should point to the Compiler object rbmAllFloat field */ + // NOTE: Callee must define the use, which should point to the Compiler object rbmAllFloat field #define RBM_ALLFLOAT RBM_ALLFLOAT_USE #define RBM_ALLDOUBLE RBM_ALLFLOAT @@ -217,8 +217,10 @@ #endif // !UNIX_AMD64_ABI #endif - //#define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15 - #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15,REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM22,REG_XMM23,REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31 + #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7, \ + REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15, \ + REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM21,REG_XMM22,REG_XMM23, \ + REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31 #ifdef UNIX_AMD64_ABI #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) From 73f43b69509e70dc7c38570cec28d4349bee3df6 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 31 Jan 2023 12:33:38 -0800 Subject: [PATCH 28/34] 
Trying VM changes. --- src/coreclr/vm/threadsuspend.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 73f10f1ef4ce4f..564a13d3505be6 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -1970,14 +1970,14 @@ CONTEXT* AllocateOSContextHelper(BYTE** contextBuffer) // Determine if the processor supports AVX so we could // retrieve extended registers DWORD64 FeatureMask = GetEnabledXStateFeatures(); - if ((FeatureMask & XSTATE_MASK_AVX) != 0) + if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0) { context = context | CONTEXT_XSTATE; } // Retrieve contextSize by passing NULL for Buffer DWORD contextSize = 0; - ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX; + ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_AVX512; // The initialize call should fail but return contextSize BOOL success = g_pfnInitializeContext2 ? g_pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) : @@ -2899,7 +2899,7 @@ BOOL Thread::RedirectThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt) // This should not normally fail. // The system silently ignores any feature specified in the FeatureMask // which is not enabled on the processor. - SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX); + SetXStateFeaturesMask(pCtx, (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); #endif //defined(TARGET_X86) || defined(TARGET_AMD64) // Make sure we specify CONTEXT_EXCEPTION_REQUEST to detect "trap frame reporting". @@ -3035,7 +3035,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT // Get may return 0 if no XState is set, which Set would not accept. if (srcFeatures != 0) { - success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & XSTATE_MASK_AVX); + success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); _ASSERTE(success); if (!success) return FALSE; From c6f1a90fdacd6262e9a5e983c065b4144fca7edb Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 1 Feb 2023 15:45:39 -0800 Subject: [PATCH 29/34] VM hack. --- src/coreclr/vm/threadsuspend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 564a13d3505be6..89f2f9d33f7e2c 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -1977,7 +1977,7 @@ CONTEXT* AllocateOSContextHelper(BYTE** contextBuffer) // Retrieve contextSize by passing NULL for Buffer DWORD contextSize = 0; - ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_AVX512; + ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512; // The initialize call should fail but return contextSize BOOL success = g_pfnInitializeContext2 ? g_pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) : From 91cf3db9115e94ca1d759045fe4da6a496228cfc Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 1 Feb 2023 15:45:39 -0800 Subject: [PATCH 30/34] VM hack. 
--- src/coreclr/vm/threadsuspend.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 89f2f9d33f7e2c..85160bb0359432 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -3044,6 +3044,11 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT #endif //defined(TARGET_X86) || defined(TARGET_AMD64) success = CopyContext(pCtx, pCtx->ContextFlags, pCurrentThreadCtx); + if (!success) + { + DWORD hresult = GetLastError(); + printf("Last Error: %d\n", hresult); + } _ASSERTE(success); if (!success) return FALSE; From 228c0c571fba5db8fd6eee1e5d4be9a89a4d0fe3 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 3 Feb 2023 15:10:34 -0800 Subject: [PATCH 31/34] Revert "VM hack." This reverts commit 91cf3db9115e94ca1d759045fe4da6a496228cfc. --- src/coreclr/vm/threadsuspend.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 85160bb0359432..89f2f9d33f7e2c 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -3044,11 +3044,6 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT #endif //defined(TARGET_X86) || defined(TARGET_AMD64) success = CopyContext(pCtx, pCtx->ContextFlags, pCurrentThreadCtx); - if (!success) - { - DWORD hresult = GetLastError(); - printf("Last Error: %d\n", hresult); - } _ASSERTE(success); if (!success) return FALSE; From 5490617fdf941d5cfcbd9b9e8068684852e871dd Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 6 Feb 2023 10:23:19 -0800 Subject: [PATCH 32/34] Adjust ACTUAL_REG_COUNT based on availability of AVX512. --- src/coreclr/jit/compiler.cpp | 5 +++++ src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/emit.cpp | 1 + src/coreclr/jit/lsra.cpp | 1 + src/coreclr/jit/lsrabuild.cpp | 1 + src/coreclr/jit/targetamd64.h | 1 + 6 files changed, 10 insertions(+) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 43a93c35b08033..b23be017e3421e 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3330,6 +3330,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags) rbmAllFloat = RBM_ALLFLOAT_INIT; rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + actualRegCount = ACTUAL_REG_COUNT; if (DoJitStressEvexEncoding()) { @@ -3337,6 +3338,10 @@ void Compiler::compInitOptions(JitFlags* jitFlags) rbmFltCalleeTrash |= RBM_HIGHFLOAT; cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT; } + else + { + actualRegCount -= CNT_HIGHFLOAT; + } #endif // TARGET_AMD64 } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index aaf4b110ebb8ba..66f19a7921d632 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10651,6 +10651,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; unsigned cntCalleeTrashFloat; + unsigned actualRegCount; #endif // TARGET_AMD64 }; // end of class Compiler diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 1472864938956a..4721a0608d980b 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -25,6 +25,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define RBM_ALLFLOAT_USE (emitComp->rbmAllFloat) #define RBM_FLT_CALLEE_TRASH_USE (emitComp->rbmFltCalleeTrash) #define CNT_CALLEE_TRASH_FLOAT_USE 
(emitComp->cntCalleeTrashFloat) +#define ACTUAL_REG_COUNT (emitComp->actualRegCount) #endif /***************************************************************************** diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 27e4319bcc0732..3bb7e78f64e24e 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -106,6 +106,7 @@ const char* LinearScan::resolveTypeName[] = {"Split", "Join", "Critical", "Share #define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) #define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#define ACTUAL_REG_COUNT (compiler->actualRegCount) #endif /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 9e917f7698bb6d..aa684087dda5d1 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -26,6 +26,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) #define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) #define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) +#define ACTUAL_REG_COUNT (compiler->actualRegCount) #endif //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index ec310a4ab2128a..20034a9b0216fd 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -80,6 +80,7 @@ #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define CNT_HIGHFLOAT 16 #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT From 3c7acdbb2e11f297fa772941751e83467aaa3a1c Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 7 Feb 2023 23:03:44 -0800 Subject: [PATCH 33/34] Use inline accessor functions instead of macros Convert from macros to accessor functions for RBM_ALLFLOAT, RBM_FLT_CALLEE_TRASH, CNT_CALLEE_TRASH_FLOAT. Convert LSRA use of ACTUAL_REG_COUNT to AVAILABLE_REG_COUNT, and create an accessor for that value for AMD64 as well. 
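For illustration, a minimal standalone sketch of the accessor indirection (simplified types; `Compiler` and `LinearScan` here are stand-ins for the real JIT classes, and the mask value in `main` is an arbitrary example):

    #include <cstdint>
    #include <cstdio>

    typedef uint64_t regMaskTP;

    // The target header defines the "constant" as a call to an accessor, so any
    // class that consumes RBM_ALLFLOAT must provide get_RBM_ALLFLOAT() itself.
    #define RBM_ALLFLOAT get_RBM_ALLFLOAT()

    struct Compiler
    {
        regMaskTP rbmAllFloat; // computed at startup from the available ISA
        regMaskTP get_RBM_ALLFLOAT() const
        {
            return rbmAllFloat;
        }
    };

    struct LinearScan
    {
        Compiler* compiler;
        // The consumer forwards to its Compiler, so uses of RBM_ALLFLOAT inside
        // LinearScan members read as if it were still a compile-time constant.
        regMaskTP get_RBM_ALLFLOAT() const
        {
            return compiler->rbmAllFloat;
        }
        void dump() const
        {
            printf("allFloat: 0x%016llx\n", (unsigned long long)RBM_ALLFLOAT);
        }
    };

    int main()
    {
        Compiler comp{0xFFFF}; // e.g., only the low 16 float registers
        LinearScan lsra{&comp};
        lsra.dump();
        return 0;
    }

The point of the pattern is that each consuming class supplies its own get_* forwarder, so code written against the macro needs no extra compiler parameter.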
--- src/coreclr/jit/codegen.h | 11 ++++ src/coreclr/jit/codegencommon.cpp | 19 +----- src/coreclr/jit/codegenlinear.cpp | 9 +-- src/coreclr/jit/codegenxarch.cpp | 8 +-- src/coreclr/jit/compiler.cpp | 37 ++++++++++- src/coreclr/jit/compiler.h | 37 +++++++++-- src/coreclr/jit/emit.cpp | 33 ++++++---- src/coreclr/jit/emit.h | 5 ++ src/coreclr/jit/emitinl.h | 11 ---- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 7 -- src/coreclr/jit/lsra.cpp | 72 ++++----------------- src/coreclr/jit/lsra.h | 67 ++++++++++--------- src/coreclr/jit/lsrabuild.cpp | 16 ++--- src/coreclr/jit/optimizer.cpp | 13 +--- src/coreclr/jit/target.h | 7 ++ src/coreclr/jit/targetamd64.h | 9 +-- 16 files changed, 169 insertions(+), 192 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index aa3fbefad70039..2d5051be6c0583 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -35,6 +35,17 @@ class CodeGen final : public CodeGenInterface GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } +#endif // TARGET_AMD64 + #if defined(TARGET_XARCH) // Bit masks used in negating a float or double number. // This is to avoid creating more than one data constant for these bitmasks when a diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index f058645581e203..1be119806114c4 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -29,13 +29,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "patchpointinfo.h" -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (this->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (this->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (this->cntCalleeTrashFloat) -#endif - /*****************************************************************************/ void CodeGenInterface::setFramePointerRequiredEH(bool value) @@ -782,16 +775,6 @@ void Compiler::compChangeLife(VARSET_VALARG_TP newLife) } } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE - -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif - // Need an explicit instantiation. 
template void Compiler::compChangeLife(VARSET_VALARG_TP newLife); @@ -9428,4 +9411,4 @@ bool CodeGen::genCanOmitNormalizationForBswap16(GenTree* tree) } return (cast->gtCastType == TYP_USHORT) || (cast->gtCastType == TYP_SHORT); -} \ No newline at end of file +} diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index e3f697b06554e7..b510ce4a558dc5 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -17,13 +17,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif - //------------------------------------------------------------------------ // genInitializeRegisterState: Initialize the register state contained in 'regSet'. // @@ -2690,4 +2683,4 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) inst_SETCC(setcc->gtCondition, setcc->TypeGet(), setcc->GetRegNum()); genProduceReg(setcc); } -#endif // !TARGET_LOONGARCH64 \ No newline at end of file +#endif // !TARGET_LOONGARCH64 diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5bf6a4966504d6..f933f685ad6488 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -23,12 +23,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "gcinfoencoder.h" #include "patchpointinfo.h" -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif - //--------------------------------------------------------------------- // genSetGSSecurityCookie: Set the "GS" security cookie in the prolog. // @@ -11053,4 +11047,4 @@ bool CodeGenInterface::genCodeAddrNeedsReloc(size_t addr) #endif // TARGET_X86 } -#endif // TARGET_XARCH \ No newline at end of file +#endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index b23be017e3421e..84249af733545f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3330,7 +3330,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags) rbmAllFloat = RBM_ALLFLOAT_INIT; rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; - actualRegCount = ACTUAL_REG_COUNT; + availableRegCount = ACTUAL_REG_COUNT; if (DoJitStressEvexEncoding()) { @@ -3340,7 +3340,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags) } else { - actualRegCount -= CNT_HIGHFLOAT; + availableRegCount -= CNT_HIGHFLOAT; } #endif // TARGET_AMD64 } @@ -3546,6 +3546,37 @@ bool Compiler::compPromoteFewerStructs(unsigned lclNum) return rejectThisPromo; } +//------------------------------------------------------------------------ +// dumpRegMask: display a register mask. For well-known sets of registers, display a well-known token instead of +// a potentially large number of registers. 
+// +// Arguments: +// regs - The set of registers to display +// +void Compiler::dumpRegMask(regMaskTP regs) const +{ + if (regs == RBM_ALLINT) + { + printf("[allInt]"); + } + else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) + { + printf("[allIntButFP]"); + } + else if (regs == RBM_ALLFLOAT) + { + printf("[allFloat]"); + } + else if (regs == RBM_ALLDOUBLE) + { + printf("[allDouble]"); + } + else + { + dspRegMask(regs); + } +} + #endif // DEBUG void Compiler::compInitDebuggingInfo() @@ -10293,4 +10324,4 @@ void Compiler::EnregisterStats::Dump(FILE* fout) const PRINT_STATS(m_stressLclFld, m_addrExposed); PRINT_STATS(m_dispatchRetBuf, m_addrExposed); } -#endif // TRACK_ENREG_STATS \ No newline at end of file +#endif // TRACK_ENREG_STATS diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 66f19a7921d632..8a231f698a663c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10361,6 +10361,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool compJitHaltMethod(); + void dumpRegMask(regMaskTP regs) const; + #endif /* @@ -10637,21 +10639,44 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool killGCRefs(GenTree* tree); #if defined(TARGET_AMD64) -public: +private: // The following are for initializing register allocator "constants" defined in targetamd64.h // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases - // the number of simd (xmm,ymm, and zmm) registers from 16 to 32. + // the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32. // As only 64-bit xarch has the capability to have the additional registers, we limit the changes // to TARGET_AMD64 only. // - // Users of `targetamd64.h` need to define three macros, RBM_ALLFLOAT_USE, RBM_FLT_CALLEE_TRASH_USE, - // and CNT_CALLEE_TRASH_FLOAT_USE which should point to these three variables respectively. - // We did this to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // Users of these values need to define four accessor functions: + // + // regMaskTP get_RBM_ALLFLOAT(); + // regMaskTP get_RBM_FLT_CALLEE_TRASH(); + // unsigned get_CNT_CALLEE_TRASH_FLOAT(); + // unsigned get_AVAILABLE_REG_COUNT(); + // + // which return the values of these variables. + // + // This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only // TARGET_AMD64 requires one. 
+ // regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; unsigned cntCalleeTrashFloat; - unsigned actualRegCount; + unsigned availableRegCount; + +public: + regMaskTP get_RBM_ALLFLOAT() const + { + return rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return rbmFltCalleeTrash; + } + unsigned get_CNT_CALLEE_TRASH_FLOAT() const + { + return cntCalleeTrashFloat; + } + #endif // TARGET_AMD64 }; // end of class Compiler diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 4721a0608d980b..0f12cd4644b93b 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -20,14 +20,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (emitComp->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (emitComp->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (emitComp->cntCalleeTrashFloat) -#define ACTUAL_REG_COUNT (emitComp->actualRegCount) -#endif - /***************************************************************************** * * Represent an emitter location. @@ -107,6 +99,17 @@ void emitLocation::Print(LONG compMethodID) const } #endif // DEBUG +#if defined(TARGET_AMD64) +inline regMaskTP emitter::get_RBM_FLT_CALLEE_TRASH() const +{ + return emitComp->rbmFltCalleeTrash; +} +inline unsigned emitter::get_AVAILABLE_REG_COUNT() const +{ + return emitComp->availableRegCount; +} +#endif // TARGET_AMD64 + /***************************************************************************** * * Return the name of an instruction format. @@ -3212,11 +3215,19 @@ void emitter::emitDispRegSet(regMaskTP regs) for (reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) { - if ((regs & genRegMask(reg)) == 0) + if (regs == RBM_NONE) + { + break; + } + + regMaskTP curReg = genRegMask(reg); + if ((regs & curReg) == 0) { continue; } + regs -= curReg; + if (sp) { printf(" "); @@ -9955,7 +9966,3 @@ void emitter::emitEnableGC() } } #endif // !defined(JIT32_GCENCODER) - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE \ No newline at end of file diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index e99183629c7a52..fe27b94ee62d10 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1973,6 +1973,11 @@ class emitter CORINFO_FIELD_HANDLE emitBlkConst(const void* cnsAddr, unsigned cnsSize, unsigned cnsAlign, var_types elemType); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_FLT_CALLEE_TRASH() const; + unsigned get_AVAILABLE_REG_COUNT() const; +#endif // TARGET_AMD64 + CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr); CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 9ded94c323dbac..125c1ddd0fbd3f 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -7,13 +7,6 @@ #ifdef TARGET_XARCH -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (emitComp->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (emitComp->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (emitComp->cntCalleeTrashFloat) -#endif - /* static */ inline bool emitter::instrIs3opImul(instruction ins) { @@ -212,10 +205,6 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) return 
id->idAddr()->iiaAddrMode.amDisp; } -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE - #endif // TARGET_XARCH /***************************************************************************** diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index fdbc20c2c77c5f..cb114b2d197010 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -23,13 +23,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "gcinfo.h" #include "gcinfoencoder.h" -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif - //------------------------------------------------------------------------ // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node // diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 3bb7e78f64e24e..877b5cd24d108a 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -101,14 +101,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX const char* LinearScan::resolveTypeName[] = {"Split", "Join", "Critical", "SharedCritical"}; #endif // DEBUG -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#define ACTUAL_REG_COUNT (compiler->actualRegCount) -#endif - /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XX XX @@ -4072,7 +4064,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) { // Just clear any constant registers and return. resetAvailableRegs(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -4320,7 +4312,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) resetRegState(); setRegsInUse(liveRegs); } - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); if ((liveRegs & genRegMask(reg)) == 0) @@ -4602,7 +4594,7 @@ void LinearScan::allocateRegisters() } resetRegState(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->recentRefPosition = nullptr; @@ -4765,7 +4757,7 @@ void LinearScan::allocateRegisters() #ifdef DEBUG // Validate the current state just after we've freed the registers. This ensures that any pending // freed registers will have had their state updated to reflect the intervals they were holding. 
- for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { regMaskTP regMask = genRegMask(reg); // If this isn't available or if it's still waiting to be freed (i.e. it was in @@ -6685,7 +6677,7 @@ void LinearScan::resolveRegisters() // are encountered. if (enregisterLocalVars) { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -8976,45 +8968,7 @@ void LinearScan::dumpLsraStatsSummary(FILE* file) } #endif // TRACK_LSRA_STATS -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE - #ifdef DEBUG -// Please see the comment for these instance variables in `compiler.h` -// Here, the `compiler` object used must match the parameter name that -// follows. If it is changed below, change it here. -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif -void dumpRegMask(regMaskTP regs, Compiler* compiler) -{ - if (regs == RBM_ALLINT) - { - printf("[allInt]"); - } - else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) - { - printf("[allIntButFP]"); - } - else if (regs == RBM_ALLFLOAT) - { - printf("[allFloat]"); - } - else if (regs == RBM_ALLDOUBLE) - { - printf("[allDouble]"); - } - else - { - dspRegMask(regs); - } -} -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE static const char* getRefTypeName(RefType refType) { @@ -9086,7 +9040,7 @@ void RefPosition::dump(LinearScan* linearScan) printf(FMT_BB " ", this->bbNum); printf("regmask="); - dumpRegMask(registerAssignment, linearScan->compiler); + linearScan->compiler->dumpRegMask(registerAssignment); printf(" minReg=%d", minRegCandidateCount); @@ -9222,7 +9176,7 @@ void Interval::dump(Compiler* compiler) printf(" physReg:%s", getRegName(physReg)); printf(" Preferences="); - dumpRegMask(this->registerPreferences, compiler); + compiler->dumpRegMask(this->registerPreferences); if (relatedInterval) { @@ -10440,7 +10394,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10544,7 +10498,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10869,7 +10823,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. 
- for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -11895,7 +11849,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, if (preferCalleeSave) { - regMaskTP calleeSaveCandidates = calleeSaveRegs(currentInterval->registerType); + regMaskTP calleeSaveCandidates = linearScan->calleeSaveRegs(currentInterval->registerType); if (currentInterval->isWriteThru) { // We'll only prefer a callee-save register if it's already been used. @@ -11911,7 +11865,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } else { - callerCalleePrefs = callerSaveRegs(currentInterval->registerType, linearScan->compiler); + callerCalleePrefs = linearScan->callerSaveRegs(currentInterval->registerType); } // If this has a delayed use (due to being used in a rmw position of a @@ -12074,4 +12028,4 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, assert(found && isSingleRegister(candidates)); foundRegBit = candidates; return candidates; -} \ No newline at end of file +} diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index d002ed95781da6..e6ca1d06e0955b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -64,33 +64,6 @@ inline bool registerTypesEquivalent(RegisterType a, RegisterType b) return varTypeIsIntegralOrI(a) == varTypeIsIntegralOrI(b); } -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#endif - -//------------------------------------------------------------------------ -// calleeSaveRegs: Get the set of callee-save registers of the given RegisterType -// -inline regMaskTP calleeSaveRegs(RegisterType rt) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; -} - -//------------------------------------------------------------------------ -// callerSaveRegs: Get the set of caller-save registers of the given RegisterType -// -inline regMaskTP callerSaveRegs(RegisterType rt, Compiler* compiler) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; -} - -#undef RBM_ALLFLOAT_USE -#undef RBM_FLT_CALLEE_TRASH_USE -#undef CNT_CALLEE_TRASH_FLOAT_USE - //------------------------------------------------------------------------ // RefInfo: Captures the necessary information for a definition that is "in-flight" // during `buildIntervals` (i.e. 
a tree-node definition has been encountered, @@ -1953,6 +1926,40 @@ class LinearScan : public LinearScanInterface int BuildPutArgSplit(GenTreePutArgSplit* tree); #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); + +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } + unsigned get_AVAILABLE_REG_COUNT() const + { + return compiler->availableRegCount; + } +#endif // TARGET_AMD64 + + //------------------------------------------------------------------------ + // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType + // + // NOTE: we currently don't need a LinearScan `this` pointer for this definition, and some callers + // don't have one available, so make it static. + // + static regMaskTP calleeSaveRegs(RegisterType rt) + { + return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; + } + + //------------------------------------------------------------------------ + // callerSaveRegs: Get the set of caller-save registers of the given RegisterType + // + regMaskTP callerSaveRegs(RegisterType rt) const + { + return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; + } }; /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -2213,7 +2220,7 @@ class Interval : public Referenceable if (preferCalleeSave) { - regMaskTP calleeSaveMask = (calleeSaveRegs(this->registerType) & (newPreferences)); + regMaskTP calleeSaveMask = (LinearScan::calleeSaveRegs(this->registerType) & newPreferences); if (calleeSaveMask != RBM_NONE) { newPreferences = calleeSaveMask; @@ -2538,10 +2545,6 @@ class RefPosition #endif // DEBUG }; -#ifdef DEBUG -void dumpRegMask(regMaskTP regs, Compiler* compiler); -#endif // DEBUG - /*****************************************************************************/ #endif //_LSRA_H_ /*****************************************************************************/ diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index aa684087dda5d1..e6988402f5ce66 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -21,14 +21,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "lsra.h" -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (compiler->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (compiler->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (compiler->cntCalleeTrashFloat) -#define ACTUAL_REG_COUNT (compiler->actualRegCount) -#endif - //------------------------------------------------------------------------ // RefInfoList //------------------------------------------------------------------------ @@ -1220,7 +1212,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo // If there are no callee-saved registers, the call could kill all the registers. // This is a valid state, so in that case assert should not trigger. The RA will spill in order // to free a register later.
- assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType)) == RBM_NONE); + assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE)); } } } @@ -1879,7 +1871,7 @@ const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt); // void LinearScan::buildPhysRegRecords() { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* curr = &physRegs[reg]; curr->init(reg); @@ -3019,7 +3011,7 @@ void LinearScan::UpdatePreferencesOfDyingLocal(Interval* interval) { printf("Last use of V%02u between PUTARG and CALL. Removing occupied arg regs from preferences: ", compiler->lvaTrackedIndexToLclNum(varIndex)); - dumpRegMask(unpref, this->compiler); + compiler->dumpRegMask(unpref); printf("\n"); } #endif @@ -4149,4 +4141,4 @@ int LinearScan::BuildCmp(GenTree* tree) BuildDef(tree, dstCandidates); } return srcCount; -} \ No newline at end of file +} diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 744884a32e06bd..35786b781d544a 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -15,13 +15,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma hdrstop #endif -// Please see the comment for these instance variables in `compiler.h` -#if defined(TARGET_AMD64) -#define RBM_ALLFLOAT_USE (this->rbmAllFloat) -#define RBM_FLT_CALLEE_TRASH_USE (this->rbmFltCalleeTrash) -#define CNT_CALLEE_TRASH_FLOAT_USE (this->cntCalleeTrashFloat) -#endif - /*****************************************************************************/ void Compiler::optInit() @@ -6973,7 +6966,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that are not heavy: tree->GetCostEx() < (2*IND_COST_EX) if (tree->GetCostEx() < (2 * IND_COST_EX)) { - JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availRegCount %u)\n", tree->GetCostEx(), 2 * IND_COST_EX, loopVarCount, availRegCount); return false; } @@ -6992,7 +6985,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that barely meet CSE cost requirements: tree->GetCostEx() == MIN_CSE_COST if (tree->GetCostEx() <= MIN_CSE_COST + 1) { - JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availRegCount %u)\n", tree->GetCostEx(), 2 * MIN_CSE_COST + 1, varInOutCount, availRegCount) return false; } @@ -10705,4 +10698,4 @@ void Compiler::optMarkLoopRemoved(unsigned loopNum) // Assume the caller is going to fix up the table and `bbNatLoopNum` block annotations before the next time // `fgDebugCheckLoopTable()` is called. #endif // DEBUG -} \ No newline at end of file +} diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 25e357224aac37..cc97831c9f5287 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -186,6 +186,13 @@ enum _regMask_enum : unsigned #error Unsupported target architecture #endif +#if defined(TARGET_AMD64) +// AVAILABLE_REG_COUNT is defined to be dynamic, based on whether AVX-512 high registers are available. 
+#define AVAILABLE_REG_COUNT get_AVAILABLE_REG_COUNT() +#else +#define AVAILABLE_REG_COUNT ACTUAL_REG_COUNT +#endif + /*****************************************************************************/ // TODO-Cleanup: The types defined below are mildly confusing: why are there both? diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 20034a9b0216fd..64af2659bd592d 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -84,8 +84,7 @@ #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT - // NOTE: Callee must define the use, which should point to the Compiler object rbmAllFloat field - #define RBM_ALLFLOAT RBM_ALLFLOAT_USE + #define RBM_ALLFLOAT get_RBM_ALLFLOAT() #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 @@ -147,8 +146,7 @@ #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) #endif // !UNIX_AMD64_ABI - /* NOTE: Callee must define the use, which should point to the Compiler object rbmFltCalleeTrash field */ - #define RBM_FLT_CALLEE_TRASH RBM_FLT_CALLEE_TRASH_USE + #define RBM_FLT_CALLEE_TRASH get_RBM_FLT_CALLEE_TRASH() #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) @@ -248,8 +246,7 @@ #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 #endif // !UNIX_AMD64_ABI - /* NOTE: Callee must define the use, which should point to the Compiler object cntCalleeTrashFloat field */ - #define CNT_CALLEE_TRASH_FLOAT CNT_CALLEE_TRASH_FLOAT_USE + #define CNT_CALLEE_TRASH_FLOAT get_CNT_CALLEE_TRASH_FLOAT() #define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) #define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) From 10b470368fd4f8c3c3f2871848cba02ba62a7675 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 8 Feb 2023 10:57:37 -0800 Subject: [PATCH 34/34] Clarifying comments. --- src/coreclr/jit/lsraxarch.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index ef499520822196..74d46520215a1f 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2739,6 +2739,22 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) } } +//------------------------------------------------------------------------------ +// BuildEvexIncompatibleMask: Returns RBM_NONE or a mask representing the +// lower SIMD registers for a node that lowers to an instruction that does not +// have an EVEX form (thus cannot use the upper SIMD registers). +// The caller invokes this function when it knows the node is EVEX incompatible. +// +// Simply using lowSIMDRegs() on an incompatible node's operand will incorrectly mask +// some cases, e.g., memory loads. +// +// Arguments: +// tree - tree to check for EVEX lowering compatibility +// +// Return Value: +// RBM_NONE if compatible with EVEX (or not a floating/SIMD register), +// lowSIMDRegs() (XMM0-XMM15) otherwise. +// inline regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree) { #if defined(TARGET_AMD64) @@ -2747,6 +2763,8 @@ inline regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree) return RBM_NONE; } + // If a node is contained and is a memory load etc., use RBM_NONE as it will use an integer register for the + // load, not a SIMD register. if (tree->isContained() && (tree->OperIsIndir() || (tree->OperIs(GT_HWINTRINSIC) && tree->AsHWIntrinsic()->OperIsMemoryLoad()) || tree->OperIs(GT_LEA)))
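
For reference, a minimal call-site sketch of how this mask is meant to be consumed, assuming an LSRA build method for a node already known to lack an EVEX form; `tree`, `op1`, and `srcCount` follow the conventions of the surrounding lsraxarch.cpp build methods, and `BuildOperandUses`/`BuildDef` are the existing LSRA helpers:

    // Restrict both the operand use and the definition to the lower 16 SIMD
    // registers, so the encoder never needs the EVEX.R'/V'/X bits for an
    // instruction that has no EVEX form.
    srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
    BuildDef(tree, BuildEvexIncompatibleMask(tree));

On EVEX-compatible nodes (or non-SIMD operands) the mask is RBM_NONE, which leaves the candidate set unrestricted.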