Skip to content

Commit b946e20

Browse files
cdevadas authored and kzhuravl committed
[AMDGPU] Include WWM register spill into BB Prolog (llvm#111496)
With llvm#93526 we split the regalloc pipeline further to have a standalone allocation for wwm registers and per-lane VGPRs. Currently, the presence of the wwm-spill reloads inserted at the bb-top prevents the isBasicBlockPrologue function, during the per-lane vgpr regalloc, from skipping past the exec manipulation instruction, which ends up causing incorrect codegen. The wwm-spill inserted during the wwm-regalloc pipeline should also be included in the bb-prolog so that the per-lane vgpr regalloc pipeline can identify the appropriate insertion points for their spills and copies. Change-Id: Icb5596a4ca8204414d54b4b30b614b46927accc2
1 parent ca1f20e commit b946e20

11 files changed

+344
-314
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -8938,7 +8938,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89388938

89398939
uint16_t Opcode = MI.getOpcode();
89408940
return IsNullOrVectorRegister &&
8941-
(isSGPRSpill(Opcode) ||
8941+
(isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
89428942
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
89438943
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
89448944
}

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

+7-7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
6868
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
6969
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7070
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
71+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
72+
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
73+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7174
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
7275
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
7376
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -84,10 +87,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
8487
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
8588
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
8689
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
87-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
88-
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
89-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
90-
; CHECK-NEXT: s_waitcnt vmcnt(1)
90+
; CHECK-NEXT: s_waitcnt vmcnt(0)
9191
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
9292
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
9393
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -104,7 +104,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
104104
; CHECK-NEXT: s_mov_b32 s17, s6
105105
; CHECK-NEXT: s_mov_b32 s18, s5
106106
; CHECK-NEXT: s_mov_b32 s19, s4
107-
; CHECK-NEXT: s_waitcnt vmcnt(0)
108107
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
109108
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
110109
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
@@ -138,8 +137,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
138137
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
139138
; CHECK-NEXT: s_mov_b32 exec_lo, s21
140139
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
141-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
142-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
143140
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
144141
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
145142
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -157,6 +154,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
157154
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
158155
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
159156
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
157+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
158+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
159+
; CHECK-NEXT: s_waitcnt vmcnt(0)
160160
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
161161
; CHECK-NEXT: s_waitcnt vmcnt(0)
162162
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

+47-42
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll

+4-8
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
; VMEM: [[ENDIF]]:
4848

4949
; Reload and restore exec mask
50-
; VGPR: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
5150
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
5251
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
5352

@@ -59,7 +58,7 @@
5958
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
6059

6160
; Restore val
62-
; VMEM: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
61+
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
6362

6463
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
6564

@@ -121,7 +120,6 @@ endif:
121120
; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
122121

123122
; GCN: [[END]]:
124-
; VGPR: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
125123
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
126124
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
127125

@@ -131,7 +129,7 @@ endif:
131129
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
132130

133131
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
134-
; VMEM: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
132+
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
135133

136134
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
137135

@@ -191,7 +189,6 @@ end:
191189
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
192190

193191
; GCN: [[FLOW]]: ; %Flow
194-
; VGPR: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
195192
; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
196193
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
197194
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -203,7 +200,7 @@ end:
203200

204201
; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]]
205202

206-
; VMEM: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
203+
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
207204

208205
; Regular spill value restored after exec modification
209206
; Followed by spill
@@ -237,7 +234,6 @@ end:
237234
; GCN-NEXT: s_branch [[FLOW]]
238235

239236
; GCN: [[ENDIF]]:
240-
; VGPR: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
241237
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
242238
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
243239

@@ -249,7 +245,7 @@ end:
249245

250246
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
251247

252-
; VMEM: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
248+
; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
253249

254250
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
255251
define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 {

0 commit comments

Comments
 (0)