Skip to content

Commit 581d4d2

Browse files
Hide 'align' instruction behind jmp (#60787)
* Hide align behind a jmp fix the alignBytesRemoved Some fixes and working model Some fixes and redesign Some more fixes more fixes fix Add the check for fgFirstBB misc changes code cleanup + JitHideAlignBehindJmp switch validatePadding only if align are before the loop IG More cleanup, remove commented code jit format * Fix a problem where curIG==0 and loop might be emitted in curIG, adjust the targetIG to prevIG Add IGF_REMOVED_ALIGN flag for special scenarios * Add stress mode to emit int3 for xarch * Add stress mode to emit bkpt for arm64 * Add a loop align instruction placement phase * review comments * Change from unsigned short to unsigned * review comments around cleanup * emitForceNewIG * Remove emitPrevIG * Revert change to forceNewIG for align instruction * Use loopAlignCandidates * Use loopHeadIG reference * jit format * Remove unneeded method * Misc changes * Review feedback * Do not include align behind Jmp in PerfScore calculation * jit format and fix a bug * fix the loopCandidates == 0 scenario * Add unmarkLoopAlign(), add check for fgFirstBB * merge conflict fix * Add missing } * Grammar nit Co-authored-by: Bruce Forstall <[email protected]> Co-authored-by: Bruce Forstall <[email protected]>
1 parent 7f874ee commit 581d4d2

File tree

14 files changed

+490
-175
lines changed

14 files changed

+490
-175
lines changed

src/coreclr/jit/block.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,3 +1662,22 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other)
16621662
bbsDstTab[i] = other->bbsDstTab[i];
16631663
}
16641664
}
1665+
1666+
//------------------------------------------------------------------------
1667+
// unmarkLoopAlign: Unmarks the LOOP_ALIGN flag from the block and reduce the
1668+
// loop alignment count.
1669+
//
1670+
// Arguments:
1671+
// compiler - Compiler instance
1672+
// reason - Reason to print in JITDUMP
1673+
//
1674+
void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason))
1675+
{
1676+
// Make sure we unmark and count just once.
1677+
if (isLoopAlign())
1678+
{
1679+
compiler->loopAlignCandidates--;
1680+
bbFlags &= ~BBF_LOOP_ALIGN;
1681+
JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.", bbNum, reason);
1682+
}
1683+
}

src/coreclr/jit/block.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ enum BasicBlockFlags : unsigned __int64
552552
BBF_PATCHPOINT = MAKE_BBFLAG(36), // Block is a patchpoint
553553
BBF_HAS_CLASS_PROFILE = MAKE_BBFLAG(37), // BB contains a call needing a class profile
554554
BBF_PARTIAL_COMPILATION_PATCHPOINT = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint
555+
BBF_HAS_ALIGN = MAKE_BBFLAG(39), // BB ends with 'align' instruction
555556

556557
// The following are sets of flags.
557558

@@ -653,11 +654,19 @@ struct BasicBlock : private LIR::Range
653654
{
654655
return ((bbFlags & BBF_LOOP_HEAD) != 0);
655656
}
657+
656658
bool isLoopAlign() const
657659
{
658660
return ((bbFlags & BBF_LOOP_ALIGN) != 0);
659661
}
660662

663+
void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason));
664+
665+
bool hasAlign() const
666+
{
667+
return ((bbFlags & BBF_HAS_ALIGN) != 0);
668+
}
669+
661670
#ifdef DEBUG
662671
void dspFlags(); // Print the flags
663672
unsigned dspCheapPreds(); // Print the predecessors (bbCheapPreds)

src/coreclr/jit/codegenlinear.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ void CodeGen::genCodeForBBlist()
174174

175175
for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext)
176176
{
177+
177178
#ifdef DEBUG
178179
if (compiler->verbose)
179180
{
@@ -782,21 +783,29 @@ void CodeGen::genCodeForBBlist()
782783
}
783784

784785
#if FEATURE_LOOP_ALIGN
786+
if (block->hasAlign())
787+
{
788+
// If this block has 'align' instruction in the end (identified by BBF_HAS_ALIGN),
789+
// then need to add align instruction in the current "block".
790+
//
791+
// For non-adaptive alignment, add alignment instruction of size depending on the
792+
// compJitAlignLoopBoundary.
793+
// For adaptive alignment, alignment instruction will always be of 15 bytes for xarch
794+
// and 16 bytes for arm64.
795+
assert(ShouldAlignLoops());
785796

786-
// If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
787-
// then need to add align instruction in current "block". Also mark the
788-
// corresponding IG with IGF_LOOP_ALIGN to know that there will be align
789-
// instructions at the end of that IG.
790-
//
791-
// For non-adaptive alignment, add alignment instruction of size depending on the
792-
// compJitAlignLoopBoundary.
793-
// For adaptive alignment, alignment instruction will always be of 15 bytes.
797+
GetEmitter()->emitLoopAlignment(DEBUG_ARG1(block->bbJumpKind == BBJ_ALWAYS));
798+
}
794799

795800
if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
796801
{
797-
assert(ShouldAlignLoops());
798-
799-
GetEmitter()->emitLoopAlignment();
802+
if (compiler->opts.compJitHideAlignBehindJmp)
803+
{
804+
// The current IG is the one that is just before the IG having loop start.
805+
// Establish a connection of recent align instruction emitted to the loop
806+
// it actually is aligning using 'idaLoopHeadPredIG'.
807+
GetEmitter()->emitConnectAlignInstrWithCurIG();
808+
}
800809
}
801810
#endif
802811

src/coreclr/jit/compiler.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2548,11 +2548,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
25482548

25492549
opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1;
25502550
opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
2551+
opts.compJitHideAlignBehindJmp = JitConfig.JitHideAlignBehindJmp() == 1;
25512552
#else
25522553
opts.compJitAlignLoopAdaptive = true;
25532554
opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
25542555
opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
25552556
opts.compJitAlignLoopMaxCodeSize = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN;
2557+
opts.compJitHideAlignBehindJmp = true;
25562558
#endif
25572559

25582560
#ifdef TARGET_XARCH
@@ -5153,6 +5155,11 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
51535155
fgDebugCheckLinks();
51545156
#endif
51555157

5158+
#if FEATURE_LOOP_ALIGN
5159+
// Place loop alignment instructions
5160+
DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::placeLoopAlignInstructions);
5161+
#endif
5162+
51565163
// Generate code
51575164
codeGen->genGenerateCode(methodCodePtr, methodCodeSize);
51585165

@@ -5209,6 +5216,82 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
52095216
#endif // FUNC_INFO_LOGGING
52105217
}
52115218

5219+
#if FEATURE_LOOP_ALIGN
5220+
5221+
//------------------------------------------------------------------------
5222+
// placeLoopAlignInstructions: Iterate over all the blocks and determine
5223+
// the best position to place the 'align' instruction. Inserting 'align'
5224+
// instructions after an unconditional branch is preferred over inserting
5225+
// in the block before the loop. In case there are multiple blocks
5226+
// having 'jmp', the one that has lower weight is preferred.
5227+
// If the block having 'jmp' is hotter than the block before the loop,
5228+
// the align will still be placed after 'jmp' because the processor should
5229+
// be smart enough to not fetch extra instruction beyond jmp.
5230+
//
5231+
void Compiler::placeLoopAlignInstructions()
5232+
{
5233+
if (loopAlignCandidates == 0)
5234+
{
5235+
return;
5236+
}
5237+
5238+
int loopsToProcess = loopAlignCandidates;
5239+
5240+
// Add align only if there were any loops that needed alignment
5241+
weight_t minBlockSoFar = BB_MAX_WEIGHT;
5242+
BasicBlock* bbHavingAlign = nullptr;
5243+
for (BasicBlock* const block : Blocks())
5244+
{
5245+
if ((block == fgFirstBB) && block->isLoopAlign())
5246+
{
5247+
// Adding align instruction in prolog is not supported
5248+
// hence skip the align block if it is the first block.
5249+
loopsToProcess--;
5250+
continue;
5251+
}
5252+
5253+
// If there is a unconditional jump
5254+
if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS))
5255+
{
5256+
if (block->bbWeight < minBlockSoFar)
5257+
{
5258+
minBlockSoFar = block->bbWeight;
5259+
bbHavingAlign = block;
5260+
JITDUMP(FMT_BB ", bbWeight=" FMT_WT " ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight);
5261+
}
5262+
}
5263+
5264+
if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
5265+
{
5266+
// If jmp was not found, then block before the loop start is where align instruction will be added.
5267+
if (bbHavingAlign == nullptr)
5268+
{
5269+
bbHavingAlign = block;
5270+
JITDUMP("Marking " FMT_BB " before the loop with BBF_HAS_ALIGN for loop at " FMT_BB "\n", block->bbNum,
5271+
block->bbNext->bbNum);
5272+
}
5273+
else
5274+
{
5275+
JITDUMP("Marking " FMT_BB " that ends with unconditional jump with BBF_HAS_ALIGN for loop at " FMT_BB
5276+
"\n",
5277+
bbHavingAlign->bbNum, block->bbNext->bbNum);
5278+
}
5279+
5280+
bbHavingAlign->bbFlags |= BBF_HAS_ALIGN;
5281+
minBlockSoFar = BB_MAX_WEIGHT;
5282+
bbHavingAlign = nullptr;
5283+
5284+
if (--loopsToProcess == 0)
5285+
{
5286+
break;
5287+
}
5288+
}
5289+
}
5290+
5291+
assert(loopsToProcess == 0);
5292+
}
5293+
#endif
5294+
52125295
//------------------------------------------------------------------------
52135296
// generatePatchpointInfo: allocate and fill in patchpoint info data,
52145297
// and report it to the VM

src/coreclr/jit/compiler.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3666,6 +3666,7 @@ class Compiler
36663666
#endif
36673667

36683668
BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind);
3669+
void placeLoopAlignInstructions();
36693670

36703671
/*
36713672
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
@@ -6871,13 +6872,13 @@ class Compiler
68716872
bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability
68726873

68736874
public:
6874-
LoopDsc* optLoopTable; // loop descriptor table
6875-
unsigned char optLoopCount; // number of tracked loops
6875+
LoopDsc* optLoopTable; // loop descriptor table
6876+
unsigned char optLoopCount; // number of tracked loops
6877+
unsigned char loopAlignCandidates; // number of loops identified for alignment
68766878

68776879
#ifdef DEBUG
6878-
unsigned char loopAlignCandidates; // number of loops identified for alignment
6879-
unsigned char loopsAligned; // number of loops actually aligned
6880-
#endif // DEBUG
6880+
unsigned char loopsAligned; // number of loops actually aligned
6881+
#endif // DEBUG
68816882

68826883
bool optRecordLoop(BasicBlock* head,
68836884
BasicBlock* top,
@@ -9688,6 +9689,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
96889689
// If set, perform adaptive loop alignment that limits number of padding based on loop size.
96899690
bool compJitAlignLoopAdaptive;
96909691

9692+
// If set, tries to hide alignment instructions behind unconditional jumps.
9693+
bool compJitHideAlignBehindJmp;
9694+
96919695
#ifdef LATE_DISASM
96929696
bool doLateDisasm; // Run the late disassembler
96939697
#endif // LATE_DISASM

src/coreclr/jit/compphases.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ CompPhaseNameMacro(PHASE_INSERT_GC_POLLS, "Insert GC Polls",
8787
CompPhaseNameMacro(PHASE_DETERMINE_FIRST_COLD_BLOCK, "Determine first cold block", "COLD-BLK", false, -1, true)
8888
CompPhaseNameMacro(PHASE_RATIONALIZE, "Rationalize IR", "RAT", false, -1, false)
8989
CompPhaseNameMacro(PHASE_SIMPLE_LOWERING, "Do 'simple' lowering", "SMP-LWR", false, -1, false)
90+
CompPhaseNameMacro(PHASE_ALIGN_LOOPS, "Place 'align' instructions", "LOOP-ALIGN", false, -1, false)
9091

9192
CompPhaseNameMacro(PHASE_LCLVARLIVENESS, "Local var liveness", "LIVENESS", true, -1, false)
9293
CompPhaseNameMacro(PHASE_LCLVARLIVENESS_INIT, "Local var liveness init", "LIV-INIT", false, PHASE_LCLVARLIVENESS, false)

0 commit comments

Comments
 (0)