Skip to content

Commit 6636f32

Browse files
authored
[AMDGPU] Include WWM register spill into BB Prolog (#111496)
With #93526 we split the regalloc pipeline further to have a standalone allocation for wwm registers and per-lane VGPRs. Currently, the presence of the wwm-spill reloads inserted at the top of a basic block prevents the isBasicBlockPrologue function, during the per-lane VGPR regalloc, from skipping past the exec manipulation instruction, which ends up causing incorrect codegen. The wwm-spill inserted during the wwm-regalloc pipeline should also be included in the bb-prolog so that the per-lane VGPR regalloc pipeline can identify the appropriate insertion points for its spills and copies.
1 parent 39cdfdf commit 6636f32

14 files changed

+583
-520
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -8903,7 +8903,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89038903

89048904
uint16_t Opcode = MI.getOpcode();
89058905
return IsNullOrVectorRegister &&
8906-
(isSGPRSpill(Opcode) ||
8906+
(isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
89078907
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
89088908
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
89098909
}

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

+7-7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
6868
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
6969
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7070
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
71+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
72+
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
73+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7174
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
7275
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
7376
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -84,10 +87,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
8487
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
8588
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
8689
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
87-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
88-
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
89-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
90-
; CHECK-NEXT: s_waitcnt vmcnt(1)
90+
; CHECK-NEXT: s_waitcnt vmcnt(0)
9191
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
9292
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
9393
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -104,7 +104,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
104104
; CHECK-NEXT: s_mov_b32 s17, s6
105105
; CHECK-NEXT: s_mov_b32 s18, s5
106106
; CHECK-NEXT: s_mov_b32 s19, s4
107-
; CHECK-NEXT: s_waitcnt vmcnt(0)
108107
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
109108
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
110109
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
@@ -138,8 +137,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
138137
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
139138
; CHECK-NEXT: s_mov_b32 exec_lo, s21
140139
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
141-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
142-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
143140
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
144141
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
145142
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -157,6 +154,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
157154
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
158155
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
159156
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
157+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
158+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
159+
; CHECK-NEXT: s_waitcnt vmcnt(0)
160160
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
161161
; CHECK-NEXT: s_waitcnt vmcnt(0)
162162
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
2929
; REGALLOC-NEXT: bb.1.Flow:
3030
; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
3131
; REGALLOC-NEXT: {{ $}}
32-
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
3332
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
3433
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0, implicit-def $sgpr4_sgpr5
3534
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1
3635
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec
36+
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
3737
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
3838
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
3939
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr63, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
@@ -62,11 +62,11 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
6262
; REGALLOC-NEXT: S_BRANCH %bb.1
6363
; REGALLOC-NEXT: {{ $}}
6464
; REGALLOC-NEXT: bb.4.bb.3:
65-
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
6665
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
6766
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2, implicit-def $sgpr4_sgpr5
6867
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 3
6968
; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
69+
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
7070
; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec
7171
; REGALLOC-NEXT: SI_RETURN implicit killed $vgpr0
7272
bb.0:

0 commit comments

Comments
 (0)