[AMDGPU] Reset minOccupancy if unclustered schedule was not run for any region. #162025
Conversation
[AMDGPU] Reset minOccupancy if unclustered schedule was not run for any region.
During init of the unclustered schedule stage, MinOccupancy is temporarily increased. But if none of the regions are subsequently scheduled because they do not meet the conditions of initGCNRegion, MinOccupancy should be reset to the initial occupancy. This change detects that situation and resets MinOccupancy during finalization.
@llvm/pr-subscribers-backend-amdgpu

Author: Dhruva Chakrabarti (dhruvachak)

Changes

During init of the unclustered schedule stage, MinOccupancy is temporarily increased. But if none of the regions are subsequently scheduled because they do not meet the conditions of initGCNRegion, MinOccupancy should be reset to the initial occupancy. This change detects that situation and resets MinOccupancy during finalization.

Patch is 69.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162025.diff

3 Files Affected:
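For readers skimming the diff, here is a minimal, self-contained C++ sketch of the control flow this patch introduces. The class and member names (`Scheduler`, `Stage`, `initStage`, `initRegion`, `finalizeStage`) are simplified stand-ins for `GCNScheduleDAGMILive` and `GCNSchedStage`, not the actual LLVM code; only the shape of the logic mirrors the patch.

```cpp
#include <iostream>
#include <vector>

// Simplified stand-in for GCNScheduleDAGMILive (hypothetical names).
struct Scheduler {
  unsigned MinOccupancy = 4;
};

// Simplified stand-in for UnclusteredHighRPStage.
struct Stage {
  Scheduler &DAG;
  unsigned InitialOccupancy = 0;
  explicit Stage(Scheduler &DAG) : DAG(DAG) {}

  // Stage init optimistically raises MinOccupancy, as
  // UnclusteredHighRPStage::initGCNSchedStage does.
  bool initStage() {
    InitialOccupancy = DAG.MinOccupancy;
    ++DAG.MinOccupancy;
    return true;
  }

  // Analogue of initGCNRegion: a region may be skipped entirely.
  bool initRegion(bool RegionQualifies) { return RegionQualifies; }

  // The patch threads a flag into finalization: if no region was
  // actually scheduled, the optimistic bump is rolled back.
  void finalizeStage(bool IsAnyRegionScheduled) {
    if (DAG.MinOccupancy > InitialOccupancy && !IsAnyRegionScheduled) {
      DAG.MinOccupancy = InitialOccupancy;
      std::cout << "No regions scheduled, resetting min occupancy to "
                << InitialOccupancy << "\n";
    }
  }
};

int main() {
  Scheduler DAG;
  Stage S(DAG);
  std::vector<bool> Regions = {false, false, false}; // no region qualifies

  if (S.initStage()) {
    bool IsAnyRegionScheduled = false;
    for (bool Qualifies : Regions) {
      if (!S.initRegion(Qualifies))
        continue;
      IsAnyRegionScheduled = true; // the real code calls schedule() here
    }
    S.finalizeStage(IsAnyRegionScheduled);
  }
  std::cout << "MinOccupancy = " << DAG.MinOccupancy << "\n"; // back to 4
}
```

Without the flag, the sketch would leave `MinOccupancy` at 5 even though nothing was rescheduled; with it, finalization restores the pre-stage value, which is exactly what the new FileCheck line in the MIR test below verifies.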
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc08101c7119..6ed24c272c92c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -966,6 +966,7 @@ void GCNScheduleDAGMILive::runSchedStages() {
if (!Stage->initGCNSchedStage())
continue;
+ bool IsAnyRegionScheduled = false;
for (auto Region : Regions) {
RegionBegin = Region.first;
RegionEnd = Region.second;
@@ -989,11 +990,12 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->getRegionIdx()));
}
+ IsAnyRegionScheduled = true;
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
}
- Stage->finalizeGCNSchedStage();
+ Stage->finalizeGCNSchedStage(IsAnyRegionScheduled);
}
}
@@ -1134,21 +1136,28 @@ bool PreRARematStage::initGCNSchedStage() {
return true;
}
-void GCNSchedStage::finalizeGCNSchedStage() {
+void GCNSchedStage::finalizeGCNSchedStage(bool IsAnyRegionScheduled) {
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
-void UnclusteredHighRPStage::finalizeGCNSchedStage() {
+void UnclusteredHighRPStage::finalizeGCNSchedStage(bool IsAnyRegionScheduled) {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
- LLVM_DEBUG(dbgs() << StageID
- << " stage successfully increased occupancy to "
- << DAG.MinOccupancy << '\n');
+ if (IsAnyRegionScheduled) {
+ LLVM_DEBUG(dbgs() << StageID
+ << " stage successfully increased occupancy to "
+ << DAG.MinOccupancy << '\n');
+ } else {
+ DAG.MinOccupancy = InitialOccupancy;
+ LLVM_DEBUG(dbgs() << StageID
+ << ": No regions scheduled, resetting min occupancy to "
+ << InitialOccupancy << "\n");
+ }
}
- GCNSchedStage::finalizeGCNSchedStage();
+ GCNSchedStage::finalizeGCNSchedStage(IsAnyRegionScheduled);
}
bool GCNSchedStage::initGCNRegion() {
@@ -1962,7 +1971,7 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
return true;
}
-void PreRARematStage::finalizeGCNSchedStage() {
+void PreRARematStage::finalizeGCNSchedStage(bool IsAnyRegionScheduled) {
// We consider that reducing spilling is always beneficial so we never
// rollback rematerializations in such cases. It's also possible that
// rescheduling lowers occupancy over the one achieved just through remats, in
@@ -2015,7 +2024,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
for (auto &[I, OriginalRP] : ImpactedRegions)
DAG.Pressure[I] = OriginalRP;
- GCNSchedStage::finalizeGCNSchedStage();
+ GCNSchedStage::finalizeGCNSchedStage(IsAnyRegionScheduled);
}
void GCNScheduleDAGMILive::updateRegionBoundaries(
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea42677454e4..a54c761135387 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -346,7 +346,7 @@ class GCNSchedStage {
virtual bool initGCNSchedStage();
// Finalize state after finishing a scheduling pass on the function.
- virtual void finalizeGCNSchedStage();
+ virtual void finalizeGCNSchedStage(bool IsAnyRegionScheduled);
// Setup for scheduling a region. Returns false if the current region should
// be skipped.
@@ -406,7 +406,7 @@ class UnclusteredHighRPStage : public GCNSchedStage {
public:
bool initGCNSchedStage() override;
- void finalizeGCNSchedStage() override;
+ void finalizeGCNSchedStage(bool IsAnyRegionScheduled) override;
bool initGCNRegion() override;
@@ -494,7 +494,7 @@ class PreRARematStage : public GCNSchedStage {
/// If remat alone did not increase occupancy to the target one, rollbacks all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
- void finalizeGCNSchedStage() override;
+ void finalizeGCNSchedStage(bool IsAnyRegionScheduled) override;
public:
bool initGCNSchedStage() override;
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
new file mode 100644
index 0000000000000..345dfa24fc0eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
@@ -0,0 +1,735 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s
+
+--- |
+ define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
+ ret void
+ }
+
+ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is still high,
+# and RA is unable to allocate without spills. By running the high-RP-reschedule stage we would have further decreased RP, which provides increased
+# flexibility for RA.
+
+# If Unclustered High RP Reschedule gets run, the following CHECK will have to be removed.
+# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, resetting min occupancy
+
+---
+name: no_sched_metric_due_to_spills
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 4
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15
+
+ %0:sgpr_32 = COPY $sgpr15
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %2:vgpr_32(s32) = COPY $vgpr0
+ %3:sgpr_128 = S_LOAD_DWORDX4_IMM %1(p4), 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
+ undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1(p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 84, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+ %8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 216, 0 :: (dereferenceable invariant load (s64), addrspace 4)
+ %13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc
+ %14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc
+ %15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc
+ %16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc
+ %17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc
+ %18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc
+ %19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc
+ %20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc
+ %21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc
+ %22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc
+ %23:sreg_32 = nsw S_MUL_I32 %22, %17
+ %24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc
+ %25:sreg_32 = S_ASHR_I32 %23, 31, implicit-def dead $scc
+ %26:sreg_32 = S_ADD_I32 %0, %24, implicit-def dead $scc
+ %27:sreg_32 = S_ADD_I32 %23, %25, implicit-def dead $scc
+ %28:sreg_32 = S_XOR_B32 %26, %24, implicit-def dead $scc
+ %29:sreg_32 = S_XOR_B32 %27, %25, implicit-def dead $scc
+ %30:vgpr_32 = V_CVT_F32_U32_e64 %29, 0, 0, implicit $mode, implicit $exec
+ %31:vgpr_32 = V_RCP_IFLAG_F32_e64 0, %30, 0, 0, implicit $mode, implicit $exec
+ %32:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %31, 0, 0, implicit $mode, implicit $exec
+ %33:vgpr_32 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $mode, implicit $exec
+ undef %34.sub0:sgpr_256 = S_MOV_B32 0
+ %35:sreg_32 = S_SUB_I32 0, %29, implicit-def dead $scc
+ %36:sreg_32_xm0 = V_READFIRSTLANE_B32 %33, implicit $exec
+ %37:sreg_32 = S_MUL_I32 %35, %36
+ %38:sreg_32 = S_MUL_HI_U32 %36, %37
+ %39:sreg_32 = S_ADD_I32 %36, %38, implicit-def dead $scc
+ %40:sreg_32 = S_MUL_HI_U32 %28, %39
+ %41:sreg_32 = S_MUL_I32 %40, %29
+ %42:sreg_32 = S_SUB_I32 %28, %41, implicit-def dead $scc
+ %43:sreg_32 = S_SUB_I32 %42, %29, implicit-def dead $scc
+ S_CMP_GE_U32 %42, %29, implicit-def $scc
+ %44:sreg_32 = S_CSELECT_B32 %43, %42, implicit killed $scc
+ %45:sreg_32 = S_SUB_I32 %44, %29, implicit-def dead $scc
+ S_CMP_GE_U32 %44, %29, implicit-def $scc
+ %46:sreg_32 = S_CSELECT_B32 %45, %44, implicit killed $scc
+ %47:sreg_32 = S_XOR_B32 %46, %24, implicit-def dead $scc
+ %48:sreg_32 = S_SUB_I32 %47, %24, implicit-def dead $scc
+ %49:sreg_32 = S_ASHR_I32 %48, 31, implicit-def dead $scc
+ %50:sreg_32 = S_ASHR_I32 %22, 31, implicit-def dead $scc
+ %51:sreg_32 = S_XOR_B32 %49, %50, implicit-def dead $scc
+ %52:sreg_32 = S_ADD_I32 %48, %49, implicit-def dead $scc
+ %53:sreg_32 = S_ADD_I32 %22, %50, implicit-def dead $scc
+ %54:sreg_32 = S_XOR_B32 %52, %49, implicit-def dead $scc
+ %55:sreg_32 = S_XOR_B32 %53, %50, implicit-def dead $scc
+ %56:vgpr_32 = V_CVT_F32_U32_e64 %55, 0, 0, implicit $mode, implicit $exec
+ %57:vgpr_32 = V_RCP_IFLAG_F32_e64 0, %56, 0, 0, implicit $mode, implicit $exec
+ %58:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %57, 0, 0, implicit $mode, implicit $exec
+ %59:vgpr_32 = V_CVT_U32_F32_e64 0, %58, 0, 0, implicit $mode, implicit $exec
+ %60:sreg_32 = S_SUB_I32 0, %55, implicit-def dead $scc
+ %61:sreg_32_xm0 = V_READFIRSTLANE_B32 %59, implicit $exec
+ %62:sreg_32 = S_MUL_I32 %60, %61
+ %63:sreg_32 = S_MUL_HI_U32 %61, %62
+ %64:sreg_32 = S_ADD_I32 %61, %63, implicit-def dead $scc
+ %65:sreg_32 = S_MUL_HI_U32 %54, %64
+ %66:sreg_32 = S_MUL_I32 %65, %55
+ %67:sreg_32 = S_SUB_I32 %54, %66, implicit-def dead $scc
+ %68:sreg_32 = S_ADD_I32 %65, 1, implicit-def dead $scc
+ %69:sreg_32 = S_SUB_I32 %67, %55, implicit-def dead $scc
+ S_CMP_GE_U32 %67, %55, implicit-def $scc
+ %70:sreg_32 = S_CSELECT_B32 %68, %65, implicit $scc
+ %71:sreg_32 = S_CSELECT_B32 %69, %67, implicit killed $scc
+ %72:sreg_32 = S_ADD_I32 %70, 1, implicit-def dead $scc
+ S_CMP_GE_U32 %71, %55, implicit-def $scc
+ %73:sreg_32 = S_CSELECT_B32 %72, %70, implicit killed $scc
+ %74:sreg_32 = S_XOR_B32 %73, %51, implicit-def dead $scc
+ %75:sreg_32 = S_SUB_I32 %74, %51, implicit-def dead $scc
+ %76:sreg_32 = S_ASHR_I32 %16, 31, implicit-def dead $scc
+ %77:sreg_32 = S_ASHR_I32 %11, 31, implicit-def dead $scc
+ %78:sreg_32 = S_ADD_I32 %17, %76, implicit-def dead $scc
+ %79:sreg_32 = S_ADD_I32 %11, %77, implicit-def dead $scc
+ %80:sreg_32 = S_XOR_B32 %78, %76, implicit-def dead $scc
+ %81:sreg_32 = S_XOR_B32 %79, %77, implicit-def dead $scc
+ %82:vgpr_32 = V_CVT_F32_U32_e64 %81, 0, 0, implicit $mode, implicit $exec
+ %83:vgpr_32 = V_RCP_IFLAG_F32_e64 0, %82, 0, 0, implicit $mode, implicit $exec
+ %84:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %83, 0, 0, implicit $mode, implicit $exec
+ %85:vgpr_32 = V_CVT_U32_F32_e64 0, %84, 0, 0, implicit $mode, implicit $exec
+ %86:sreg_32 = S_SUB_I32 0, %81, implicit-def dead $scc
+ %87:sreg_32_xm0 = V_READFIRSTLANE_B32 %85, implicit $exec
+ %88:sreg_32 = S_MUL_I32 %86, %87
+ %89:sreg_32 = S_MUL_HI_U32 %87, %88
+ %90:sreg_32 = S_ADD_I32 %87, %89, implicit-def dead $scc
+ %91:sreg_32 = S_MUL_HI_U32 %80, %90
+ %92:sreg_32 = S_MUL_I32 %91, %81
+ %93:sreg_32 = S_SUB_I32 %80, %92, implicit-def dead $scc
+ %94:sreg_32 = S_SUB_I32 %93, %81, implicit-def dead $scc
+ S_CMP_GE_U32 %93, %81, implicit-def $scc
+ %95:sreg_32 = S_CSELECT_B32 %94, %93, implicit killed $scc
+ %96:sreg_32 = S_SUB_I32 %95, %81, implicit-def dead $scc
+ S_CMP_GE_U32 %95, %81, implicit-def $scc
+ %97:sreg_32 = S_CSELECT_B32 %96, %95, implicit killed $scc
+ %98:sreg_32 = S_XOR_B32 %97, %76, implicit-def dead $scc
+ %99:sreg_32 = S_SUB_I32 %98, %76, implicit-def dead $scc
+ %100:sreg_32 = nsw S_SUB_I32 %17, %99, implicit-def dead $scc
+ S_CMP_LT_I32 %75, %100, implicit-def $scc
+ %101:sreg_32 = S_CSELECT_B32 %11, %99, implicit killed $scc
+ %102:sreg_32 = S_MUL_I32 %75, %22
+ %103:sreg_32 = S_SUB_I32 %48, %102, implicit-def dead $scc
+ %104:sreg_32 = S_ASHR_I32 %75, 31, implicit-def dead $scc
+ %105:sreg_32 = S_ADD_I32 %75, %104, implicit-def dead $scc
+ %106:sreg_32 = S_XOR_B32 %105, %104, implicit-def dead $scc
+ %107:sreg_32 = S_MUL_HI_U32 %106, %90
+ %108:sreg_32 = S_MUL_I32 %107, %81
+ %109:sreg_32 = S_SUB_I32 %106, %108, implicit-def dead $scc
+ %110:sreg_32 = S_SUB_I32 %109, %81, implicit-def dead $scc
+ S_CMP_GE_U32 %109, %81, implicit-def $scc
+ %111:sreg_32 = S_CSELECT_B32 %110, %109, implicit killed $scc
+ %112:sreg_32 = S_SUB_I32 %111, %81, implicit-def dead $scc
+ S_CMP_GE_U32 %111, %81, implicit-def $scc
+ %113:sreg_32 = S_CSELECT_B32 %112, %111, implicit killed $scc
+ %114:sreg_32 = S_XOR_B32 %113, %104, implicit-def dead $scc
+ %115:sreg_32 = S_SUB_I32 %114, %104, implicit-def dead $scc
+ %116:sreg_32 = nsw S_MUL_I32 %115, %22
+ %117:sreg_32 = nsw S_ADD_I32 %116, %103, implicit-def dead $scc
+ %118:sreg_32 = S_ASHR_I32 %117, 31, implicit-def dead $scc
+ %119:sreg_32 = S_ASHR_I32 %101, 31, implicit-def dead $scc
+ %120:sreg_32 = S_XOR_B32 %118, %119, implicit-def dead $scc
+ %121:sreg_32 = S_ADD_I32 %117, %118, implicit-def dead $scc
+ %122:sreg_32 = S_ADD_I32 %101, %119, implicit-def dead $scc
+ %123:sreg_32 = S_XOR_B32 %121, %118, implicit-def dead $scc
+ %124:sreg_32 = S_XOR_B32 %122, %119, implicit-def dead $scc
+ %125:vgpr_32 = V_CVT_F32_U32_e64 %124, 0, 0, implicit $mode, implicit $exec
+ %126:vgpr_32 = V_RCP_IFLAG_F32_e64 0, %125, 0, 0, implicit $mode, implicit $exec
+ %127:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %126, 0, 0, implicit $mode, implicit $exec
+ %128:vgpr_32 = V_CVT_U32_F32_e64 0, %127, 0, 0, implicit $mode, implicit $exec
+ %129:sreg_32 = S_SUB_I32 0, %124, implicit-def dead $scc
+ %130:sreg_32_xm0 = V_READFIRSTLANE_B32 %128, implicit $exec
+ %131:sreg_32 = S_MUL_I32 %129, %130
+ %132:sreg_32 = S_MUL_HI_U32 %130, %131
+ %133:sreg_32 = S_ADD_I32 %130, %132, implicit-def dead $scc
+ %134:sreg_32 = S_MUL_HI_U32 %123, %133
+ %135:sreg_32 = S_MUL_I32 %134, %124
+ %136:sreg_32 = S_SUB_I32 %123, %135, implicit-def dead $scc
+ %137:sreg_32 = S_ADD_I32 %134, 1, implicit-def dead $scc
+ %138:sreg_32 = S_SUB_I32 %136, %124, implicit-def dead $scc
+ S_CMP_GE_U32 %136, %124, implicit-def $scc
+ %139:sreg_32 = S_CSELECT_B32 %137, %134, implicit $scc
+ %140:sreg_32 = S_CSELECT_B32 %138, %136, implicit killed $scc
+ %141:sreg_32 = S_ADD_I32 %139, 1, implicit-def dead $scc
+ S_CMP_GE_U32 %140, %124, implicit-def $scc
+ %142:sreg_32 = S_CSELECT_B32 %141, %139, implicit killed $scc
+ %143:sreg_32 = S_XOR_B32 %142, %120, implicit-def dead $scc
+ %144:sreg_32 = S_SUB_I32 %143, %120, implicit-def dead $scc
+ %145:sreg_32 = S_MUL_I32 %144, %101
+ %146:sreg_32 = S_SUB_I32 %117, %145, implicit-def dead $scc
+ %147:sreg_32 = nsw S_SUB_I32 %75, %115, implicit-def dead $scc
+ %148:sreg_32 = S_ADD_I32 %147, %146, implicit-def dead $scc
+ %149:sreg_32 = S_LSHL_B32 %148, 7, implicit-def dead $scc
+ %150:sreg_32 = nsw S_LSHL_B32 %144, 8, implicit-def dead $scc
+ %151:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 1, %2(s32), implicit $exec
+ %152:vgpr_32 = V_AND_B32_e64 6, %151, implicit $exec
+ %153:vgpr_32 = V_LSHRREV_B32_e64 1, %2(s32), implicit $exec
+ %154:vgpr_32 = V_AND_B32_e64 126, %153, implicit $exec
+ %155:vgpr_32 = nsw V_ADD_U32_e64 %149, %154, 0, implicit $exec
+ undef %156.sub0:vreg_64 = nuw nsw V_LSHLREV_B32_e64 3, %152, implicit $exec
+ early-clobber %157:vreg_64, $sgpr_null = V_MAD_U64_U32_gfx11_e64 %155, %5, %156, 0, implicit $exec
+ %158:vgpr_32 = V_MUL_U32_U24_e64 1032, %152, 0, implicit $exec
+ %159:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, %154, implicit $exec
+ %160:vgpr_32 = V_AND_B32_e64 252, %2(s32), implicit $exec
+ %161:vgpr_32 = nsw V_ADD_U32_e64 %150, %160, 0, implicit $exec
+ early-clobber %162:vreg_64, $sgpr_null = V_MAD_U64_U32_gfx11_e64 %161, %7, %156, 0, implicit $exec
+ %163:vgpr_32 = V_MUL_U32_U24_e64 2056, %152, 0, implicit $exec
+ %164:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, %160, implicit $exec
+ %165:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, %2(s32), implicit $exec
+ %166:vgpr_32 = V_BFE_U32_e64 %2(s32), 1, 3, implicit $exec
+ %167:vgpr_32 = V_AND_OR_B32_e64 %165, 8, %166, implicit $exec
+ %168:vgpr_32 = V_AND_B32_e64 128, %2(s32), implicit $exec
+ %169:vgpr_32 = V_AND_B32_e64 15, %2(s32), implicit $exec
+ %170:vgpr_32 = V_AND_OR_B32_e64 %153, 48, %169, implicit $exec
+ undef %171.sub2:sgpr_128 = S_LSHL_B32 %6, 1, implicit-def dead $scc
+ %171.sub3:sgpr_128 = S_MOV_B32 268566528
+ %171.sub0:sgpr_128 = COPY %3.sub0
+ %171.sub1:sgpr_128 = COPY %3.sub1
+ %172:vgpr_32 = V_LSHLREV_B32_e64 1, %157.sub0, implicit $exec
+ %173:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %172, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %174:vgpr_32 = V_ADD_U32_e64 8, %157.sub0, 0, implicit $exec
+ %175:vgpr_32 = V_LSHLREV_B32_e64 1, %174, implicit $exec
+ %176:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %175, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %177:vgpr_32 = V_ADD_LSHL_U32_e64 %174, %5, 1, implicit $exec
+ %178:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %177, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %179:vgpr_32 = V_ADD_LSHL_U32_e64 %157.sub0, %5, 1, implicit $exec
+ %180:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %179, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %171.sub2:sgpr_128 = S_LSHL_B32 %8, 1, implicit-def dead $scc
+ %171.sub0:sgpr_128 = COPY %3.sub2
+ %171.sub1:sgpr_128 = COPY %3.sub3
+ %181:vgpr_32 = V_LSHLREV_B32_e64 1, %162.sub0, implicit $exec
+ %182:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %181, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %183:vgpr_32 = V_ADD_U32_e64 8, %162.sub0, 0, implicit $exec
+ %184:vgpr_32 = V_LSHLREV_B32_e64 1, %183, implicit $exec
+ %185:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %184, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %186:vgpr_32 = V_ADD_LSHL_U32_e64 %183, %7, 1, implicit $exec
+ %187:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %186, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %188:vgpr_32 = V_ADD_U32_e64 %7, %162.sub0, 0, implicit $exec
+ %189:vgpr_32 = V_LSHLREV_B32_e64 1, %188, implicit $exec
+ %190:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %189, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %191:vgpr_32 = V_ADD_U32_e64 %7, %188, 0, implicit $exec
+ %192:vgpr_32 = V_LSHLREV_B32_e64 1, %191, implicit $exec
+ %193:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %192, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %194:vgpr_32 = V_ADD_U32_e64 8, %191, 0, implicit $exec
+ %195:vgpr_32 = V_LSHLREV_B32_e64 1, %194, implicit $exec
+ %196:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %195, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %197:vgpr_32 = V_ADD_LSHL_U32_e64 %194, %7, 1, implicit $exec
+ %198:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %197, %171, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
+ %199:vgpr_32 = V_ADD_L...
[truncated]