Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
Position Pos) const override;

bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;

bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override {
Expand Down Expand Up @@ -2174,17 +2176,19 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

/// Inserts the wait needed immediately before the barrier instruction \p MI
/// when the subtarget runs in CU mode.
///
/// \param MI Iterator to the barrier instruction; the wait is inserted in
///           front of it and inherits its debug location.
/// \returns true if a wait instruction was inserted, false otherwise.
bool SIGfx10CacheControl::insertBarrierStart(
    MachineBasicBlock::iterator &MI) const {
  // Only CU mode gets the extra wait; otherwise nothing is inserted here.
  if (!ST.isCuModeEnabled())
    return false;

  // GFX10/11 CU MODE Workgroup fences do not emit anything.
  // In the presence of barriers, we want to make sure previous memory
  // operations are actually visible and can be released at a wider scope by
  // another thread upon exiting the barrier. To make this possible, we must
  // wait on previous stores.

  // Emit the "soft" vscnt wait with a zero count, using the null SGPR as the
  // (undef) register operand.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
  return true;
}

Expand Down Expand Up @@ -2570,6 +2574,23 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}

/// Inserts a store-counter wait in front of the barrier instruction \p MI when
/// the subtarget is in CU mode and does not have GFX1250 instructions.
///
/// \param MI Iterator to the barrier instruction; the wait is placed directly
///           before it and reuses its debug location.
/// \returns true if a wait instruction was inserted, false otherwise.
bool SIGfx12CacheControl::insertBarrierStart(
    MachineBasicBlock::iterator &MI) const {
  // GFX12 CU MODE Workgroup fences do not emit anything (except in GFX12.5).
  // When a barrier is involved, earlier memory operations must be visible so
  // that another thread can release them at a wider scope once it leaves the
  // barrier. That requires waiting on previous stores.
  const bool NeedsStoreWait = ST.isCuModeEnabled() && !ST.hasGFX1250Insts();
  if (!NeedsStoreWait)
    return false;

  MachineBasicBlock &MBB = *MI->getParent();
  BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT_soft))
      .addImm(0);
  return true;
}

bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX10-CU-LABEL: test_s_barrier:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX11-CU-LABEL: test_s_barrier:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -38,7 +38,7 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX12-CU-LABEL: test_s_barrier:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand All @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -94,8 +94,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand Down
Loading