diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index f776d7defe4d1a..30b5ea482838b8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -5195,7 +5195,11 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code,
         {
             ssize_t compressedDsp;

-            if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte))
+            // Only the scaling factor of the original EVEX instructions can be changed by embedded broadcast.
+            // If the instruction does not have tuple type info, say extended EVEX from APX, the scaling factor is
+            // constantly 1, so this optimization cannot be performed, and whether disp8 or disp32 should be applied
+            // depends only on dspInByte.
+            if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins))
             {
                 SetEvexCompressedDisplacement(id);
             }
@@ -5368,7 +5372,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         {
             ssize_t compressedDsp;

-            if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte))
+            if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins))
             {
                 SetEvexCompressedDisplacement(id);
             }
@@ -14672,13 +14676,16 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(isCompressed && dspInByte);
         dsp = compressedDsp;
     }
-    else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
+    else if (TakesEvexPrefix(id) && !IsBMIInstruction(ins))
     {
-        assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte));
+        assert(!(TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins)));
         dspInByte = false;
     }
     else
     {
+        // TODO-XArch-APX: for now, extended EVEX instructions will not have compressed displacement; more
+        // accurately, extended EVEX may not use the compressed displacement optimization as the scaling
+        // factor is constantly 1.
         dspInByte = ((signed char)dsp == (ssize_t)dsp);
     }
     dspIsZero = (dsp == 0);
@@ -15556,7 +15563,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(isCompressed && dspInByte);
         dsp = (int)compressedDsp;
     }
-    else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
+    else if (TakesEvexPrefix(id) && !IsBMIInstruction(ins))
     {
 #if FEATURE_FIXED_OUT_ARGS
         // TODO-AMD64-CQ: We should be able to accurately predict this when FEATURE_FIXED_OUT_ARGS
@@ -17904,6 +17911,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
         idAmd->idCodeSize(sz);

         code = insCodeRM(ins);
+        code = AddX86PrefixIfNeeded(id, code, id->idOpSize());
         code |= (insEncodeReg345(id, id->idReg1(), EA_PTRSIZE, &code) << 8);

         dst = emitOutputAM(dst, idAmd, code, nullptr);
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index e9cf023943f493..844a621b5cb563 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -692,7 +692,7 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption
         assert(INS_setge == (INS_setge_apx + offset));
         assert(INS_setle == (INS_setle_apx + offset));
         assert(INS_setg == (INS_setg_apx + offset));

-        ins = (instruction)(ins + offset);
+        ins = (instruction)(ins - offset);
     }
 #endif
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 47c232540580d9..a947e5546e58c3 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1157,12 +1157,12 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
 #endif
         if (!source->isContained())
     {
-        tgtPrefUse = BuildUse(source, srcCandidates);
+        tgtPrefUse = BuildUse(source, ForceLowGprForApxIfNeeded(source, srcCandidates, getEvexIsSupported()));
         srcCount++;
     }
     else
     {
-        srcCount += BuildOperandUses(source, srcCandidates);
+        srcCount += BuildOperandUses(source, ForceLowGprForApxIfNeeded(source, srcCandidates, getEvexIsSupported()));
     }

     if (!tree->isContained())
@@ -1172,6 +1172,9 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
             srcCount += BuildDelayFreeUses(shiftBy, source, SRBM_RCX);
             buildKillPositionsForNode(tree, currentLoc + 1, SRBM_RCX);
         }
+        dstCandidates = (tree->GetRegNum() == REG_NA)
+                            ? ForceLowGprForApxIfNeeded(tree, dstCandidates, getEvexIsSupported())
+                            : dstCandidates;
         BuildDef(tree, dstCandidates);
     }
     else
@@ -3280,8 +3283,8 @@ int LinearScan::BuildMul(GenTree* tree)
         srcCandidates1 = SRBM_RDX;
     }

-    srcCount = BuildOperandUses(op1, srcCandidates1);
-    srcCount += BuildOperandUses(op2, srcCandidates2);
+    srcCount = BuildOperandUses(op1, ForceLowGprForApxIfNeeded(op1, srcCandidates1, getEvexIsSupported()));
+    srcCount += BuildOperandUses(op2, ForceLowGprForApxIfNeeded(op2, srcCandidates2, getEvexIsSupported()));

 #if defined(TARGET_X86)
     if (tree->OperIs(GT_MUL_LONG))
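Note on the hasTupleTypeInfo(ins) guard (reviewer sketch, not the emitter's API): EVEX disp8*N compression scales the one-byte displacement by a factor N derived from the instruction's tuple type (and, under embedded broadcast, the element size), so it only saves encoding bytes when N > 1. For APX extended EVEX there is no tuple type info, N is constantly 1, and the check degenerates into the ordinary "fits in a signed byte" test that the plain disp8 path already performs. A minimal standalone C++ sketch of that arithmetic, with tryCompressDisp8 and its parameters as illustrative names only:

    #include <cstdint>
    #include <cstdio>

    // Illustrative sketch of EVEX disp8*N compression (not the emitter's implementation):
    // a displacement can be encoded as a single scaled byte only when it is an exact
    // multiple of N and the quotient fits in a signed 8-bit field.
    static bool tryCompressDisp8(int64_t dsp, int64_t n, int64_t* compressed)
    {
        if (dsp % n != 0)
        {
            return false;
        }
        int64_t scaled = dsp / n;
        if (scaled < INT8_MIN || scaled > INT8_MAX)
        {
            return false;
        }
        *compressed = scaled;
        return true;
    }

    int main()
    {
        int64_t compressed;

        // Tuple-type-aware EVEX: a 64-byte full-vector access with dsp = 256 compresses
        // to disp8 = 4 (256 / 64), saving three displacement bytes.
        if (tryCompressDisp8(256, 64, &compressed))
        {
            printf("dsp 256, N 64 -> disp8 %lld\n", (long long)compressed);
        }

        // No tuple type info (e.g. APX extended EVEX): N is constantly 1, so the
        // "compressed" form is just the ordinary disp8 range check and 256 needs disp32.
        if (tryCompressDisp8(256, 1, &compressed))
        {
            printf("dsp 256, N 1  -> disp8 %lld\n", (long long)compressed);
        }
        else
        {
            printf("dsp 256, N 1  -> needs disp32\n");
        }
        return 0;
    }

Running the sketch prints disp8 = 4 for the tuple-scaled case and falls back to disp32 when N = 1, which is why the patch only calls SetEvexCompressedDisplacement when hasTupleTypeInfo(ins) is true.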