From 9e111bedf2701daa3610bd5eddd328cf9f2688fa Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 26 Jul 2024 12:02:52 +0200 Subject: [PATCH 1/2] JIT: Fix placement of `GT_START_NOGC` for tailcalls in face of bulk copy with write barrier calls When the JIT generates code for a tailcall it must generate code to write the arguments into the incoming parameter area. Since the GC ness of the arguments of the tailcall may not match the GC ness of the parameters, we have to disable GC before we start writing these. This is done by finding the earliest `GT_PUTARG_STK` node and placing the start of the NOGC region right before it. In addition, there is logic to take care of potential overlap between the arguments and parameters. For example, if the call has an operand that uses one of the parameters, then we must take care that we do not override that parameter with the tailcall argument before the use of it. To do so, we sometimes may need to introduce copies from the parameter locals to locals on the stack frame. This used to work fine, however, with #101761 we started transforming block copies into managed calls in certain scenarios. It was possible for the JIT to decide to introduce a copy to a local and for this transformation to then kick in. This would cause us to end up with the managed helper call after starting the nogc region. In checked builds this would hit an assert during GC scan; in release builds, it would end up with corrupted data. The fix here is to make sure we insert the `GT_START_NOGC` after all the potential temporary copies we may introduce as part of the tailcat stll logic. There was an additional assumption that the first `PUTARG_STK` operand was the earliest one in execution order. That is not guaranteed, so this change stops relying on that as well by introducing a new `LIR::FirstNode` and using that to determine the earliest `PUTARG_STK` node. Fix #102370 Fix #104123 Fix #105441 --- src/coreclr/jit/lir.cpp | 16 +++++++++ src/coreclr/jit/lir.h | 1 + src/coreclr/jit/lower.cpp | 73 ++++++++++++++++++--------------------- 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/src/coreclr/jit/lir.cpp b/src/coreclr/jit/lir.cpp index feabec03d02cad..44570ea9864041 100644 --- a/src/coreclr/jit/lir.cpp +++ b/src/coreclr/jit/lir.cpp @@ -1872,6 +1872,22 @@ GenTree* LIR::LastNode(GenTree** nodes, size_t numNodes) return lastNode; } +//------------------------------------------------------------------------ +// LIR::FirstNode: +// Given two nodes in the same block range, find which node appears first. +// +// Arguments: +// node1 - The first node +// node2 - The second node +// +// Returns: +// Node that appears first. +// +GenTree* LIR::FirstNode(GenTree* node1, GenTree* node2) +{ + return LastNode(node1, node2) == node1 ? node2 : node1; +} + #ifdef DEBUG void GenTree::dumpLIRFlags() { diff --git a/src/coreclr/jit/lir.h b/src/coreclr/jit/lir.h index 8a3a9a507a38bb..a3271e832fa8de 100644 --- a/src/coreclr/jit/lir.h +++ b/src/coreclr/jit/lir.h @@ -317,6 +317,7 @@ class LIR final static GenTree* LastNode(GenTree* node1, GenTree* node2); static GenTree* LastNode(GenTree** nodes, size_t numNodes); + static GenTree* FirstNode(GenTree* node1, GenTree* node2); }; inline void GenTree::SetUnusedValue() diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index acf3f052f62d21..0a5b73efdbc886 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -3057,52 +3057,22 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) // call could over-write the stack arg that is setup earlier. ArrayStack putargs(comp->getAllocator(CMK_ArrayStack)); - for (CallArg& arg : call->gtArgs.EarlyArgs()) - { - if (arg.GetEarlyNode()->OperIs(GT_PUTARG_STK)) - { - putargs.Push(arg.GetEarlyNode()); - } - } - - for (CallArg& arg : call->gtArgs.LateArgs()) + for (CallArg& arg : call->gtArgs.Args()) { - if (arg.GetLateNode()->OperIs(GT_PUTARG_STK)) + if (arg.GetNode()->OperIs(GT_PUTARG_STK)) { - putargs.Push(arg.GetLateNode()); + putargs.Push(arg.GetNode()); } } GenTree* startNonGCNode = nullptr; - if (!putargs.Empty()) + if (putargs.Height() > 0) { - // Get the earliest operand of the first PUTARG_STK node. We will make - // the required copies of args before this node. - bool unused; - GenTree* insertionPoint = BlockRange().GetTreeRange(putargs.Bottom(), &unused).FirstNode(); - // Insert GT_START_NONGC node before we evaluate the PUTARG_STK args. - // Note that if there are no args to be setup on stack, no need to - // insert GT_START_NONGC node. - startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID); - BlockRange().InsertBefore(insertionPoint, startNonGCNode); - - // Gc-interruptability in the following case: - // foo(a, b, c, d, e) { bar(a, b, c, d, e); } - // bar(a, b, c, d, e) { foo(a, b, d, d, e); } - // - // Since the instruction group starting from the instruction that sets up first - // stack arg to the end of the tail call is marked as non-gc interruptible, - // this will form a non-interruptible tight loop causing gc-starvation. To fix - // this we insert GT_NO_OP as embedded stmt before GT_START_NONGC, if the method - // has a single basic block and is not a GC-safe point. The presence of a single - // nop outside non-gc interruptible region will prevent gc starvation. - if ((comp->fgBBcount == 1) && !comp->compCurBB->HasFlag(BBF_GC_SAFE_POINT)) + GenTree* firstPutargStk = putargs.Bottom(0); + for (int i = 1; i < putargs.Height(); i++) { - assert(comp->fgFirstBB == comp->compCurBB); - GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID); - BlockRange().InsertBefore(startNonGCNode, noOp); + firstPutargStk = LIR::FirstNode(firstPutargStk, putargs.Bottom(i)); } - // Since this is a fast tailcall each PUTARG_STK will place the argument in the // _incoming_ arg space area. This will effectively overwrite our already existing // incoming args that live in that area. If we have later uses of those args, this @@ -3172,10 +3142,10 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) GenTree* lookForUsesFrom = put->gtNext; if (overwrittenStart != argStart) { - lookForUsesFrom = insertionPoint; + lookForUsesFrom = firstPutargStk; } - RehomeArgForFastTailCall(callerArgLclNum, insertionPoint, lookForUsesFrom, call); + RehomeArgForFastTailCall(callerArgLclNum, firstPutargStk, lookForUsesFrom, call); // The above call can introduce temps and invalidate the pointer. callerArgDsc = comp->lvaGetDesc(callerArgLclNum); @@ -3189,10 +3159,33 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) unsigned int fieldsEnd = fieldsFirst + callerArgDsc->lvFieldCnt; for (unsigned int j = fieldsFirst; j < fieldsEnd; j++) { - RehomeArgForFastTailCall(j, insertionPoint, lookForUsesFrom, call); + RehomeArgForFastTailCall(j, firstPutargStk, lookForUsesFrom, call); } } } + + // Now insert GT_START_NONGC node before we evaluate the first PUTARG_STK. + // Note that if there are no args to be setup on stack, no need to + // insert GT_START_NONGC node. + startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID); + BlockRange().InsertBefore(firstPutargStk, startNonGCNode); + + // Gc-interruptability in the following case: + // foo(a, b, c, d, e) { bar(a, b, c, d, e); } + // bar(a, b, c, d, e) { foo(a, b, d, d, e); } + // + // Since the instruction group starting from the instruction that sets up first + // stack arg to the end of the tail call is marked as non-gc interruptible, + // this will form a non-interruptible tight loop causing gc-starvation. To fix + // this we insert GT_NO_OP as embedded stmt before GT_START_NONGC, if the method + // has a single basic block and is not a GC-safe point. The presence of a single + // nop outside non-gc interruptible region will prevent gc starvation. + if ((comp->fgBBcount == 1) && !comp->compCurBB->HasFlag(BBF_GC_SAFE_POINT)) + { + assert(comp->fgFirstBB == comp->compCurBB); + GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID); + BlockRange().InsertBefore(startNonGCNode, noOp); + } } // Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be From fd8300f3229b22ec8b5f4d78684c1f1eba1b28d3 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 26 Jul 2024 12:36:26 +0200 Subject: [PATCH 2/2] Nit --- src/coreclr/jit/lower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 0a5b73efdbc886..30d2a1fbb80ef8 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -3066,7 +3066,7 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) } GenTree* startNonGCNode = nullptr; - if (putargs.Height() > 0) + if (!putargs.Empty()) { GenTree* firstPutargStk = putargs.Bottom(0); for (int i = 1; i < putargs.Height(); i++)