Skip to content

Commit 09c4124

Browse files
authored
[AMDGPU] Fix restores in chain functions (#116193)
When spilling a VGPR in `emitPrologue`, chain functions prefer to use offsets to access the stack instead of the SP. This patch fixes `emitEpilogue` to do the same.

It also brings back some test coverage that was lost in #93526, when WWM registers started being shifted to the lowest available range (which meant that tests that were originally spilling v8 would shift to spill v0, which is a scratch register for chain functions and didn't get spilled).

Change-Id: Icb07fccd859b563cd45f74c25ae578ecb38bdeeb
1 parent 9bccf61 commit 09c4124

File tree

3 files changed

+41
-51
lines changed

3 files changed

+41
-51
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -1299,7 +1299,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
12991299
MIB.setMIFlag(MachineInstr::FrameDestroy);
13001300
} else {
13011301
// Insert the CSR spill restores with SP as the base register.
1302-
emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1302+
emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
1303+
FuncInfo->isChainFunction() ? Register() : StackPtrReg,
13031304
FramePtrRegScratchCopy);
13041305
}
13051306
}

llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir

+27-7
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,24 @@ body: |
6767
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
6868
6969
; GCN-LABEL: name: preserve_all_lanes_wwm_above_args
70-
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
70+
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
7171
; GCN-NEXT: {{ $}}
72-
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
72+
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
73+
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
74+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
75+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
76+
; GCN-NEXT: $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
7377
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
74-
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
75-
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 10, implicit $exec
76-
; GCN-NEXT: $vgpr8 = COPY killed $vgpr0
78+
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
79+
; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 10, implicit $exec
80+
; GCN-NEXT: $vgpr8 = COPY killed $vgpr10
7781
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
7882
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
83+
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
84+
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
85+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
7986
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
87+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
8088
$vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
8189
$sgpr35 = S_MOV_B32 5
8290
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
@@ -104,10 +112,12 @@ body: |
104112
; GCN-LABEL: name: dont_preserve_args
105113
; GCN: liveins: $sgpr0, $vgpr8, $vgpr9
106114
; GCN-NEXT: {{ $}}
115+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
107116
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
108117
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
109118
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
110119
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
120+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
111121
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
112122
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
113123
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
@@ -131,15 +141,23 @@ body: |
131141
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
132142
133143
; GCN-LABEL: name: preserve_inactive_lanes_wwm_args
134-
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr10
144+
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
135145
; GCN-NEXT: {{ $}}
146+
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
147+
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
148+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
149+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
136150
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
137151
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
138152
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
139153
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
140154
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
141155
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
142-
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr0
156+
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
157+
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
158+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
159+
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
160+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
143161
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
144162
$sgpr35 = S_MOV_B32 5
145163
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
@@ -168,13 +186,15 @@ body: |
168186
; GCN-LABEL: name: dont_preserve_if_no_chain_calls
169187
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9
170188
; GCN-NEXT: {{ $}}
189+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
171190
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
172191
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
173192
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
174193
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
175194
; GCN-NEXT: $vgpr9 = V_MOV_B32_e32 20, implicit $exec
176195
; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 30, implicit $exec
177196
; GCN-NEXT: S_ENDPGM 0
197+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
178198
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
179199
$sgpr35 = S_MOV_B32 5
180200
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0

llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir

+12-43
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
declare amdgpu_gfx void @gfx_callee()
99

1010
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
11-
define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void}
1211
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
1312
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
1413
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
@@ -36,55 +35,23 @@ body: |
3635
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
3736
3837
; GCN-LABEL: name: preserve_inactive_wwm
39-
; GCN: liveins: $sgpr0, $sgpr35
40-
; GCN-NEXT: {{ $}}
41-
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
42-
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
43-
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1
44-
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
45-
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
46-
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
47-
48-
...
49-
50-
# Check that it also works for SGPR to VGPR spills.
51-
52-
---
53-
name: preserve_inactive_detected_wwm
54-
tracksRegLiveness: true
55-
frameInfo:
56-
hasTailCall: true
57-
machineFunctionInfo:
58-
stackPtrOffsetReg: '$sgpr32'
59-
returnsVoid: true
60-
body: |
61-
bb.0:
62-
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
63-
64-
; GCN-LABEL: name: preserve_inactive_detected_wwm
6538
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
6639
; GCN-NEXT: {{ $}}
67-
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
68-
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
69-
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
70-
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
71-
; GCN-NEXT: $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
72-
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
73-
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
74-
; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
40+
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
41+
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
42+
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
43+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
7544
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
7645
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
46+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
47+
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
48+
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
49+
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
50+
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
7751
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
78-
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
79-
$sgpr35 = S_MOV_B32 5
80-
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
81-
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
82-
renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
83-
$sgpr35 = S_MOV_B32 5
84-
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
85-
renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
8652
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
8753
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
54+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
8855
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
8956
9057
...
@@ -110,11 +77,13 @@ body: |
11077
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
11178
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
11279
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
80+
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
11381
; GCN-NEXT: S_ENDPGM 0
11482
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
11583
$sgpr35 = S_MOV_B32 5
11684
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
11785
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
86+
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
11887
S_ENDPGM 0
11988
...
12089

0 commit comments

Comments
 (0)