Skip to content

Commit 8efa509

Browse files
committed
[AMDGPU] RA inserted scalar instructions can be at the BB top
During register allocation (RA), we adjust the insertion point at the top of a basic block (BB) for spills/copies to ensure they are placed after the exec-restore instructions required for divergent control-flow execution. This adjustment is, however, required only for vector operations; insertions for scalar registers are independent of the exec mask and can still go at the BB top.
1 parent 8f7e9f3 commit 8efa509

10 files changed

+310
-76
lines changed

llvm/include/llvm/CodeGen/MachineBasicBlock.h

+5-2
Original file line numberDiff line numberDiff line change
@@ -846,8 +846,11 @@ class MachineBasicBlock
846846

847847
/// Return the first instruction in MBB after I that is not a PHI, label or
848848
/// debug. This is the correct point to insert copies at the beginning of a
849-
/// basic block.
850-
iterator SkipPHIsLabelsAndDebug(iterator I, bool SkipPseudoOp = true);
849+
/// basic block. \p Reg is an optional argument passed during register
850+
/// allocation to apply additional target-specific checks for its spill/copy
851+
/// insertion.
852+
iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg = Register(),
853+
bool SkipPseudoOp = true);
851854

852855
/// Returns an iterator to the first terminator instruction of this basic
853856
/// block. If a terminator does not exist, it returns end().

llvm/include/llvm/CodeGen/TargetInstrInfo.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -1988,8 +1988,10 @@ class TargetInstrInfo : public MCInstrInfo {
19881988

19891989
/// True if the instruction is bound to the top of its basic block and no
19901990
/// other instructions shall be inserted before it. This can be implemented
1991-
/// to prevent register allocator to insert spills before such instructions.
1992-
virtual bool isBasicBlockPrologue(const MachineInstr &MI) const {
1991+
/// to prevent the register allocator from inserting spills for \p Reg before such
1992+
/// instructions.
1993+
virtual bool isBasicBlockPrologue(const MachineInstr &MI,
1994+
Register Reg = Register()) const {
19931995
return false;
19941996
}
19951997

llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,8 @@ class StatepointState {
461461

462462
if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) {
463463
RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad);
464-
auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin());
464+
auto EHPadInsertPoint =
465+
EHPad->SkipPHIsLabelsAndDebug(EHPad->begin(), Reg);
465466
insertReloadBefore(Reg, EHPadInsertPoint, EHPad);
466467
LLVM_DEBUG(dbgs() << "...also reload at EHPad "
467468
<< printMBBReference(*EHPad) << "\n");

llvm/lib/CodeGen/InlineSpiller.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
469469
MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
470470
MachineBasicBlock::iterator MII;
471471
if (SrcVNI->isPHIDef())
472-
MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin());
472+
MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin(), SrcReg);
473473
else {
474474
MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
475475
assert(DefMI && "Defining instruction disappeared");

llvm/lib/CodeGen/MachineBasicBlock.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,13 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
223223

224224
MachineBasicBlock::iterator
225225
MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I,
226-
bool SkipPseudoOp) {
226+
Register Reg, bool SkipPseudoOp) {
227227
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
228228

229229
iterator E = end();
230230
while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
231231
(SkipPseudoOp && I->isPseudoProbe()) ||
232-
TII->isBasicBlockPrologue(*I)))
232+
TII->isBasicBlockPrologue(*I, Reg)))
233233
++I;
234234
// FIXME: This needs to change if we wish to bundle labels / dbg_values
235235
// inside the bundle.

llvm/lib/CodeGen/SplitKit.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -795,8 +795,10 @@ SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
795795
return Start;
796796
}
797797

798-
VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
799-
MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
798+
unsigned RegIdx = 0;
799+
Register Reg = LIS.getInterval(Edit->get(RegIdx)).reg();
800+
VNInfo *VNI = defFromParent(RegIdx, ParentVNI, Start, MBB,
801+
MBB.SkipPHIsLabelsAndDebug(MBB.begin(), Reg));
800802
RegAssign.insert(Start, VNI->def, OpenIdx);
801803
LLVM_DEBUG(dump());
802804
return VNI->def;

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+11-3
Original file line numberDiff line numberDiff line change
@@ -8474,16 +8474,24 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
84748474
return AMDGPU::COPY;
84758475
}
84768476

8477-
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
8477+
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
8478+
Register Reg) const {
84788479
// We need to handle instructions which may be inserted during register
84798480
// allocation to handle the prolog. The initial prolog instruction may have
84808481
// been separated from the start of the block by spills and copies inserted
8481-
// needed by the prolog.
8482+
// needed by the prolog. However, the insertions for scalar registers can
8483+
// always be placed at the BB top as they are independent of the exec mask
8484+
// value.
84828485
uint16_t Opc = MI.getOpcode();
8486+
const MachineFunction *MF = MI.getParent()->getParent();
8487+
const MachineRegisterInfo &MRI = MF->getRegInfo();
8488+
bool IsNullOrVectorRegister =
8489+
!Reg || !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
84838490

84848491
// FIXME: Copies inserted in the block prolog for live-range split should also
84858492
// be included.
8486-
return (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
8493+
return IsNullOrVectorRegister &&
8494+
(isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
84878495
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
84888496
}
84898497

llvm/lib/Target/AMDGPU/SIInstrInfo.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -1179,7 +1179,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11791179
unsigned getLiveRangeSplitOpcode(Register Reg,
11801180
const MachineFunction &MF) const override;
11811181

1182-
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
1182+
bool isBasicBlockPrologue(const MachineInstr &MI,
1183+
Register Reg = Register()) const override;
11831184

11841185
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
11851186
MachineBasicBlock::iterator InsPt,

llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll

+60-59
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ define void @main(i1 %arg) #0 {
168168
; CHECK-NEXT: s_mov_b64 vcc, vcc
169169
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
170170
; CHECK-NEXT: .LBB0_3: ; %Flow14
171-
; CHECK-NEXT: s_or_saveexec_b64 s[20:21], s[26:27]
172171
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
173172
; CHECK-NEXT: v_readlane_b32 s12, v5, 32
174173
; CHECK-NEXT: v_readlane_b32 s13, v5, 33
@@ -178,39 +177,39 @@ define void @main(i1 %arg) #0 {
178177
; CHECK-NEXT: v_readlane_b32 s17, v5, 37
179178
; CHECK-NEXT: v_readlane_b32 s18, v5, 38
180179
; CHECK-NEXT: v_readlane_b32 s19, v5, 39
181-
; CHECK-NEXT: v_writelane_b32 v5, s4, 56
182-
; CHECK-NEXT: v_writelane_b32 v5, s5, 57
183-
; CHECK-NEXT: v_writelane_b32 v5, s6, 58
184-
; CHECK-NEXT: v_writelane_b32 v5, s7, 59
185-
; CHECK-NEXT: v_writelane_b32 v5, s8, 60
186-
; CHECK-NEXT: v_writelane_b32 v5, s9, 61
187-
; CHECK-NEXT: v_writelane_b32 v5, s10, 62
188-
; CHECK-NEXT: v_writelane_b32 v5, s11, 63
189-
; CHECK-NEXT: v_writelane_b32 v5, s52, 40
190-
; CHECK-NEXT: v_writelane_b32 v5, s53, 41
191-
; CHECK-NEXT: v_writelane_b32 v5, s54, 42
192-
; CHECK-NEXT: v_writelane_b32 v5, s55, 43
193-
; CHECK-NEXT: v_writelane_b32 v5, s56, 44
194-
; CHECK-NEXT: v_writelane_b32 v5, s57, 45
195-
; CHECK-NEXT: v_writelane_b32 v5, s58, 46
196-
; CHECK-NEXT: v_writelane_b32 v5, s59, 47
197-
; CHECK-NEXT: v_writelane_b32 v4, s12, 0
198-
; CHECK-NEXT: v_writelane_b32 v5, s60, 48
199-
; CHECK-NEXT: v_writelane_b32 v4, s13, 1
200-
; CHECK-NEXT: v_writelane_b32 v5, s61, 49
201-
; CHECK-NEXT: v_writelane_b32 v4, s14, 2
202-
; CHECK-NEXT: v_writelane_b32 v5, s62, 50
203-
; CHECK-NEXT: v_writelane_b32 v4, s15, 3
204-
; CHECK-NEXT: v_writelane_b32 v5, s63, 51
205-
; CHECK-NEXT: v_writelane_b32 v4, s16, 4
206-
; CHECK-NEXT: v_writelane_b32 v5, s64, 52
207-
; CHECK-NEXT: v_writelane_b32 v4, s17, 5
208-
; CHECK-NEXT: v_writelane_b32 v5, s65, 53
209-
; CHECK-NEXT: v_writelane_b32 v4, s18, 6
210-
; CHECK-NEXT: v_writelane_b32 v5, s66, 54
211-
; CHECK-NEXT: v_writelane_b32 v4, s19, 7
212-
; CHECK-NEXT: v_writelane_b32 v5, s67, 55
213-
; CHECK-NEXT: s_xor_b64 exec, exec, s[20:21]
180+
; CHECK-NEXT: v_writelane_b32 v5, s4, 40
181+
; CHECK-NEXT: v_writelane_b32 v5, s5, 41
182+
; CHECK-NEXT: v_writelane_b32 v5, s6, 42
183+
; CHECK-NEXT: v_writelane_b32 v5, s7, 43
184+
; CHECK-NEXT: v_writelane_b32 v5, s8, 44
185+
; CHECK-NEXT: v_writelane_b32 v5, s9, 45
186+
; CHECK-NEXT: v_writelane_b32 v5, s10, 46
187+
; CHECK-NEXT: v_writelane_b32 v5, s11, 47
188+
; CHECK-NEXT: v_writelane_b32 v5, s12, 48
189+
; CHECK-NEXT: v_writelane_b32 v5, s13, 49
190+
; CHECK-NEXT: v_writelane_b32 v5, s14, 50
191+
; CHECK-NEXT: v_writelane_b32 v5, s15, 51
192+
; CHECK-NEXT: v_writelane_b32 v5, s16, 52
193+
; CHECK-NEXT: v_writelane_b32 v5, s17, 53
194+
; CHECK-NEXT: v_writelane_b32 v5, s18, 54
195+
; CHECK-NEXT: v_writelane_b32 v5, s19, 55
196+
; CHECK-NEXT: v_writelane_b32 v5, s52, 56
197+
; CHECK-NEXT: v_writelane_b32 v4, s60, 0
198+
; CHECK-NEXT: v_writelane_b32 v5, s53, 57
199+
; CHECK-NEXT: v_writelane_b32 v4, s61, 1
200+
; CHECK-NEXT: v_writelane_b32 v5, s54, 58
201+
; CHECK-NEXT: v_writelane_b32 v4, s62, 2
202+
; CHECK-NEXT: v_writelane_b32 v5, s55, 59
203+
; CHECK-NEXT: v_writelane_b32 v4, s63, 3
204+
; CHECK-NEXT: v_writelane_b32 v5, s56, 60
205+
; CHECK-NEXT: v_writelane_b32 v4, s64, 4
206+
; CHECK-NEXT: v_writelane_b32 v5, s57, 61
207+
; CHECK-NEXT: v_writelane_b32 v4, s65, 5
208+
; CHECK-NEXT: v_writelane_b32 v5, s58, 62
209+
; CHECK-NEXT: v_writelane_b32 v4, s66, 6
210+
; CHECK-NEXT: v_writelane_b32 v5, s59, 63
211+
; CHECK-NEXT: v_writelane_b32 v4, s67, 7
212+
; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27]
214213
; CHECK-NEXT: s_cbranch_execz .LBB0_10
215214
; CHECK-NEXT: ; %bb.4: ; %bb32
216215
; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25]
@@ -265,59 +264,61 @@ define void @main(i1 %arg) #0 {
265264
; CHECK-NEXT: s_waitcnt vmcnt(1)
266265
; CHECK-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0
267266
; CHECK-NEXT: .LBB0_6: ; %Flow12
268-
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23]
267+
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23]
268+
; CHECK-NEXT: v_readlane_b32 s52, v5, 40
269+
; CHECK-NEXT: v_readlane_b32 s53, v5, 41
270+
; CHECK-NEXT: v_readlane_b32 s54, v5, 42
271+
; CHECK-NEXT: v_readlane_b32 s55, v5, 43
272+
; CHECK-NEXT: v_readlane_b32 s56, v5, 44
273+
; CHECK-NEXT: v_readlane_b32 s57, v5, 45
274+
; CHECK-NEXT: v_readlane_b32 s58, v5, 46
275+
; CHECK-NEXT: v_readlane_b32 s59, v5, 47
276+
; CHECK-NEXT: v_readlane_b32 s60, v5, 48
277+
; CHECK-NEXT: v_readlane_b32 s61, v5, 49
278+
; CHECK-NEXT: v_readlane_b32 s62, v5, 50
279+
; CHECK-NEXT: v_readlane_b32 s63, v5, 51
280+
; CHECK-NEXT: v_readlane_b32 s64, v5, 52
281+
; CHECK-NEXT: v_readlane_b32 s65, v5, 53
282+
; CHECK-NEXT: v_readlane_b32 s66, v5, 54
283+
; CHECK-NEXT: v_readlane_b32 s67, v5, 55
284+
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
269285
; CHECK-NEXT: s_cbranch_execz .LBB0_9
270286
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
271287
; CHECK-NEXT: s_mov_b32 s8, 0
272288
; CHECK-NEXT: s_mov_b32 s6, s8
273-
; CHECK-NEXT: v_readlane_b32 s36, v5, 40
274289
; CHECK-NEXT: s_mov_b32 s7, s8
275290
; CHECK-NEXT: v_mov_b32_e32 v2, s6
276-
; CHECK-NEXT: v_readlane_b32 s37, v5, 41
291+
; CHECK-NEXT: v_readlane_b32 s36, v5, 56
277292
; CHECK-NEXT: s_mov_b32 s9, s8
278293
; CHECK-NEXT: s_mov_b32 s10, s8
279294
; CHECK-NEXT: s_mov_b32 s11, s8
280295
; CHECK-NEXT: v_mov_b32_e32 v3, s7
281-
; CHECK-NEXT: v_readlane_b32 s38, v5, 42
282-
; CHECK-NEXT: v_readlane_b32 s39, v5, 43
283-
; CHECK-NEXT: v_readlane_b32 s40, v5, 44
284-
; CHECK-NEXT: v_readlane_b32 s41, v5, 45
285-
; CHECK-NEXT: v_readlane_b32 s42, v5, 46
286-
; CHECK-NEXT: v_readlane_b32 s43, v5, 47
287-
; CHECK-NEXT: v_readlane_b32 s44, v5, 48
288-
; CHECK-NEXT: v_readlane_b32 s45, v5, 49
289-
; CHECK-NEXT: v_readlane_b32 s46, v5, 50
290-
; CHECK-NEXT: v_readlane_b32 s47, v5, 51
291-
; CHECK-NEXT: v_readlane_b32 s48, v5, 52
292-
; CHECK-NEXT: v_readlane_b32 s49, v5, 53
293-
; CHECK-NEXT: v_readlane_b32 s50, v5, 54
294-
; CHECK-NEXT: v_readlane_b32 s51, v5, 55
295-
; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
296-
; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
297-
; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
298-
; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
299-
; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
300-
; CHECK-NEXT: v_readlane_b32 s36, v5, 56
301296
; CHECK-NEXT: v_readlane_b32 s37, v5, 57
302297
; CHECK-NEXT: v_readlane_b32 s38, v5, 58
303298
; CHECK-NEXT: v_readlane_b32 s39, v5, 59
304299
; CHECK-NEXT: v_readlane_b32 s40, v5, 60
305300
; CHECK-NEXT: v_readlane_b32 s41, v5, 61
306301
; CHECK-NEXT: v_readlane_b32 s42, v5, 62
307302
; CHECK-NEXT: v_readlane_b32 s43, v5, 63
303+
; CHECK-NEXT: s_nop 4
304+
; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
305+
; CHECK-NEXT: image_sample_lz v7, v[2:3], s[52:59], s[8:11] dmask:0x1
308306
; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3
307+
; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
309308
; CHECK-NEXT: s_and_b64 vcc, exec, 0
310309
; CHECK-NEXT: v_readlane_b32 s44, v4, 0
311310
; CHECK-NEXT: v_readlane_b32 s45, v4, 1
312311
; CHECK-NEXT: v_readlane_b32 s46, v4, 2
313312
; CHECK-NEXT: v_readlane_b32 s47, v4, 3
314-
; CHECK-NEXT: image_sample_lz v7, v[2:3], s[36:43], s[8:11] dmask:0x1
315313
; CHECK-NEXT: v_readlane_b32 s48, v4, 4
316314
; CHECK-NEXT: v_readlane_b32 s49, v4, 5
317315
; CHECK-NEXT: v_readlane_b32 s50, v4, 6
318316
; CHECK-NEXT: v_readlane_b32 s51, v4, 7
317+
; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
318+
; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
319+
; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
319320
; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
320-
; CHECK-NEXT: ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
321+
; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
321322
; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
322323
; CHECK-NEXT: s_waitcnt vmcnt(0)
323324
; CHECK-NEXT: v_sub_f32_e32 v2, v7, v6

0 commit comments

Comments
 (0)