@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
60746074 }
60756075#endif // defined(DEBUG) && defined(TARGET_X86)
60766076
6077- // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
6078- // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
6079- // transition penalty, assuming the user function contains legacy SSE instruction.
6080- // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
6081- // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
6082- // when there's preceding 256-bit AVX to legacy SSE transition penalty.
6083- // This applies to 512bit AVX512 instructions as well.
6084- if (call->IsPInvoke () && (call->gtCallType == CT_USER_FUNC) && (GetEmitter ()->Contains256bitOrMoreAVX ()))
6085- {
6086- assert (compiler->canUseVexEncoding ());
6077+ if (GetEmitter ()->Contains256bitOrMoreAVX () && call->NeedsVzeroupper (compiler))
6078+ {
6079+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
6080+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
6081+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
6082+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
6083+ // register) and before any call to an unknown function.
6084+
6085+ // This method contains a call that needs vzeroupper but also uses 256-bit or higher
6086+ // AVX itself. This means we couldn't optimize to only emitting a single vzeroupper in
6087+ // the method prologue and instead need to insert one before each call that needs it.
6088+
60876089 instGen (INS_vzeroupper);
60886090 }
60896091
@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1118811190// funclet frames: this will be FuncletInfo.fiSpDelta.
1118911191void CodeGen::genPreserveCalleeSavedFltRegs (unsigned lclFrameSize)
1119011192{
11191- genVzeroupperIfNeeded (false );
1119211193 regMaskTP regMask = compiler->compCalleeFPRegsSavedMask ;
1119311194
1119411195 // Only callee saved floating point registers should be in regMask
1119511196 assert ((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1119611197
11198+ if (GetEmitter ()->ContainsCallNeedingVzeroupper () && !GetEmitter ()->Contains256bitOrMoreAVX ())
11199+ {
11200+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11201+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11202+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11203+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11204+ // register) and before any call to an unknown function.
11205+
11206+ // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11207+ // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11208+ // This reduces the overall amount of codegen, particularly for more common paths not using any
11209+ // SIMD or floating-point.
11210+
11211+ instGen (INS_vzeroupper);
11212+ }
11213+
1119711214 // fast path return
1119811215 if (regMask == RBM_NONE)
1119911216 {
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1124111258 // Only callee saved floating point registers should be in regMask
1124211259 assert ((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1124311260
11261+ if (GetEmitter ()->Contains256bitOrMoreAVX ())
11262+ {
11263+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11264+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11265+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11266+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11267+ // register) and before any call to an unknown function.
11268+
11269+ instGen (INS_vzeroupper);
11270+ }
11271+
1124411272 // fast path return
1124511273 if (regMask == RBM_NONE)
1124611274 {
11247- genVzeroupperIfNeeded ();
1124811275 return ;
1124911276 }
1125011277
@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1128711314 offset -= XMM_REGSIZE_BYTES;
1128811315 }
1128911316 }
11290- genVzeroupperIfNeeded ();
11291- }
11292-
11293- // Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the
11294- // AVX/Legacy SSE transition penalties can be avoided. This function is been used in genPreserveCalleeSavedFltRegs
11295- // (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains
11296- // 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native
11297- // code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog
11298- // if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty.
11299- //
11300- // Params
11301- // check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
11302- // instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
11303- //
11304- void CodeGen::genVzeroupperIfNeeded (bool check256bitOnly /* = true*/ )
11305- {
11306- bool emitVzeroUpper = false ;
11307- if (check256bitOnly)
11308- {
11309- emitVzeroUpper = GetEmitter ()->Contains256bitOrMoreAVX ();
11310- }
11311- else
11312- {
11313- emitVzeroUpper = GetEmitter ()->ContainsAVX ();
11314- }
11315-
11316- if (emitVzeroUpper)
11317- {
11318- assert (compiler->canUseVexEncoding ());
11319- instGen (INS_vzeroupper);
11320- }
1132111317}
1132211318
1132311319// -----------------------------------------------------------------------------------