@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
3030; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
3131; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
3232; MUBUF-NEXT: ; %bb.2: ; %bb.1
33- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
34- ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
35- ; MUBUF-NEXT: s_mov_b32 s32, s6
33+ ; MUBUF-NEXT: s_mov_b32 s6, s32
3634; MUBUF-NEXT: v_mov_b32_e32 v1, 0
37- ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
38- ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
35+ ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
36+ ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
37+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
38+ ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
39+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
3940; MUBUF-NEXT: s_add_i32 s6, s6, s7
40- ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
41- ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
4241; MUBUF-NEXT: v_mov_b32_e32 v2, s6
4342; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
4443; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
6665; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
6766; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
6867; FLATSCR-NEXT: ; %bb.2: ; %bb.1
69- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
68+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
7069; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
7170; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
7271; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
73- ; FLATSCR-NEXT: s_mov_b32 s32, s2
72+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
7473; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
7574; FLATSCR-NEXT: s_add_i32 s2, s2, s3
7675; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
131130; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
132131; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
133132; MUBUF-NEXT: ; %bb.1: ; %bb.0
134- ; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
135- ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
136- ; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137- ; MUBUF-NEXT: s_mov_b32 s32, s4
133+ ; MUBUF-NEXT: s_mov_b32 s4, s32
138134; MUBUF-NEXT: v_mov_b32_e32 v1, 0
139- ; MUBUF-NEXT: v_mov_b32_e32 v2, s4
140- ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
135+ ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
136+ ; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137+ ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
138+ ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
139+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
141140; MUBUF-NEXT: s_add_i32 s4, s4, s5
142- ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
143- ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
144141; MUBUF-NEXT: v_mov_b32_e32 v2, s4
145142; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
146143; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
165162; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
166163; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
167164; FLATSCR-NEXT: ; %bb.1: ; %bb.0
168- ; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
169165; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
170- ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
166+ ; FLATSCR-NEXT: s_mov_b32 s0, s32
171167; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
172168; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
173- ; FLATSCR-NEXT: s_mov_b32 s32, s0
169+ ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
174170; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
175171; FLATSCR-NEXT: s_add_i32 s0, s0, s1
176172; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
230226; MUBUF-NEXT: s_and_b64 exec, exec, vcc
231227; MUBUF-NEXT: s_cbranch_execz .LBB2_3
232228; MUBUF-NEXT: ; %bb.2: ; %bb.1
233- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
229+ ; MUBUF-NEXT: s_mov_b32 s6, s32
234230; MUBUF-NEXT: v_mov_b32_e32 v2, 0
235- ; MUBUF-NEXT: v_mov_b32_e32 v3, s6
236- ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
231+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
237232; MUBUF-NEXT: v_mov_b32_e32 v2, 1
238- ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
233+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
239234; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
240235; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
241236; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
242- ; MUBUF-NEXT: s_mov_b32 s32, s6
237+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
243238; MUBUF-NEXT: s_waitcnt vmcnt(0)
244239; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
245240; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
266261; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
267262; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
268263; FLATSCR-NEXT: ; %bb.2: ; %bb.1
269- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
264+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
270265; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
271266; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
272267; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
273268; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
274269; FLATSCR-NEXT: scratch_load_dword v2, v2, off
275270; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
276- ; FLATSCR-NEXT: s_mov_b32 s32, s2
271+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
277272; FLATSCR-NEXT: s_waitcnt vmcnt(0)
278273; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
279274; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
324319; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
325320; MUBUF-NEXT: s_cbranch_execz .LBB3_2
326321; MUBUF-NEXT: ; %bb.1: ; %bb.0
327- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
328- ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
322+ ; MUBUF-NEXT: s_mov_b32 s6, s32
329323; MUBUF-NEXT: v_mov_b32_e32 v2, 0
330- ; MUBUF-NEXT: v_mov_b32_e32 v4, s6
331- ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
324+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
332325; MUBUF-NEXT: v_mov_b32_e32 v2, 1
333- ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
326+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
334327; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
335328; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
336329; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
337- ; MUBUF-NEXT: s_mov_b32 s32, s6
330+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
338331; MUBUF-NEXT: s_waitcnt vmcnt(0)
339332; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
340333; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
358351; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
359352; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
360353; FLATSCR-NEXT: ; %bb.1: ; %bb.0
361- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
362- ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
354+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
363355; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
364356; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
365357; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
366358; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
367359; FLATSCR-NEXT: scratch_load_dword v2, v2, off
368360; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
369- ; FLATSCR-NEXT: s_mov_b32 s32, s2
361+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
370362; FLATSCR-NEXT: s_waitcnt vmcnt(0)
371363; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
372364; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
0 commit comments