Skip to content

Commit 3b93451

Browse files
committed
[AMDGPU] Split vgpr regalloc pipeline (llvm#93526)
Allocating wwm-registers and regular VGPR operands together imposes many challenges in the way the registers are reused during allocation. There are times when regalloc reuses the registers of regular VGPR operations for wwm-operations in a small range, unintentionally clobbering their inactive lanes and causing correctness issues that are hard to trace. This patch splits the VGPR allocation pipeline further to allocate wwm-registers first and the regular VGPR operands in a separate pipeline. The split ensures that the physical registers used for wwm allocations won't take part in the next allocation pipeline, avoiding any such clobbering. Change-Id: I372693d8545df6cad7968e59c06da88552df1d01
1 parent 2b7da5f commit 3b93451

File tree

83 files changed

+9146
-9650
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+9146
-9650
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
184184
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
185185
}
186186

187+
/// Return a const reference to the MachineFunction that the MF member points
/// to.
const MachineFunction &getMF() const { return *MF; }
188+
187189
//===--------------------------------------------------------------------===//
188190
// Function State
189191
//===--------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPU.h

+4
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
5656
FunctionPass *createAMDGPUCodeGenPreparePass();
5757
FunctionPass *createAMDGPULateCodeGenPreparePass();
5858
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
59+
FunctionPass *createAMDGPUReserveWWMRegsPass();
5960
FunctionPass *createAMDGPURewriteOutArgumentsPass();
6061
ModulePass *
6162
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -149,6 +150,9 @@ struct AMDGPULowerBufferFatPointersPass
149150
const TargetMachine &TM;
150151
};
151152

153+
void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
154+
extern char &AMDGPUReserveWWMRegsID;
155+
152156
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
153157
extern char &AMDGPURewriteOutArgumentsID;
154158

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass should be invoked at the end of wwm-regalloc pipeline.
11+
/// It identifies the WWM regs allocated during this pipeline and adds
12+
/// them to the list of reserved registers so that they won't be available for
13+
/// regular VGPR allocation in the subsequent regalloc pipeline.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "AMDGPU.h"
18+
#include "GCNSubtarget.h"
19+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20+
#include "SIMachineFunctionInfo.h"
21+
#include "llvm/CodeGen/LiveIntervals.h"
22+
#include "llvm/CodeGen/MachineFunctionPass.h"
23+
#include "llvm/CodeGen/VirtRegMap.h"
24+
#include "llvm/InitializePasses.h"
25+
26+
using namespace llvm;
27+
28+
#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
29+
30+
namespace {
31+
32+
class AMDGPUReserveWWMRegs : public MachineFunctionPass {
33+
public:
34+
static char ID;
35+
36+
AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
37+
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
38+
}
39+
40+
bool runOnMachineFunction(MachineFunction &MF) override;
41+
42+
StringRef getPassName() const override {
43+
return "AMDGPU Reserve WWM Registers";
44+
}
45+
46+
void getAnalysisUsage(AnalysisUsage &AU) const override {
47+
AU.setPreservesAll();
48+
MachineFunctionPass::getAnalysisUsage(AU);
49+
}
50+
};
51+
52+
} // End anonymous namespace.
53+
54+
INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
55+
"AMDGPU Reserve WWM Registers", false, false)
56+
57+
char AMDGPUReserveWWMRegs::ID = 0;
58+
59+
char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
60+
61+
/// Reserve the register taken from operand 0 of every SI_SPILL_S32_TO_VGPR and
/// from operand 1 of every SI_RESTORE_S32_FROM_VGPR, clear the renamable flag
/// on all operands that reference a reserved WWM register, and finally clear
/// the non-WWM regalloc mask kept in SIMachineFunctionInfo.
///
/// \returns true if a register was reserved or an operand flag was cleared.
bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      unsigned Opc = MI.getOpcode();
      if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR &&
          Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        continue;

      // The register of interest is operand 0 of the spill pseudo and
      // operand 1 of the restore pseudo.
      Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR
                         ? MI.getOperand(0).getReg()
                         : MI.getOperand(1).getReg();

      assert(Reg.isPhysical() &&
             "All WWM registers should have been allocated by now.");

      MFI->reserveWWMRegister(Reg);
      Changed = true;
    }
  }

  // The renamable flag can't be set for reserved registers. Reset the flag for
  // MOs involving wwm-regs as they will be reserved during vgpr-regalloc
  // pipeline.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register Reg : MFI->getWWMReservedRegs()) {
    for (MachineOperand &MO : MRI.reg_operands(Reg)) {
      if (!MO.isRenamable())
        continue;
      MO.setIsRenamable(false);
      // Clearing the flag modifies the instruction stream, so report it even
      // when no spill/restore pseudo was seen above.
      Changed = true;
    }
  }

  // Now clear the NonWWMRegMask earlier set during wwm-regalloc.
  MFI->clearNonWWMRegAllocMask();

  return Changed;
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

+90-4
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
8484
: RegisterRegAllocBase(N, D, C) {}
8585
};
8686

87+
/// Registry entry type for the allocators selectable via -wwm-regalloc.
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  /// Forwards the name, description and factory to the registry base class.
  WWMRegisterRegAlloc(const char *Name, const char *Desc,
                      FunctionPassCtor Ctor)
      : RegisterRegAllocBase(Name, Desc, Ctor) {}
};
92+
8793
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
8894
const MachineRegisterInfo &MRI,
8995
const Register Reg) {
@@ -98,13 +104,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
98104
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
99105
}
100106

101-
/// -{sgpr|vgpr}-regalloc=... command line option.
107+
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
108+
const MachineRegisterInfo &MRI,
109+
const Register Reg) {
110+
const SIMachineFunctionInfo *MFI =
111+
MRI.getMF().getInfo<SIMachineFunctionInfo>();
112+
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
113+
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
114+
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
115+
}
116+
117+
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
102118
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
103119

104120
/// A dummy default pass factory indicates whether the register allocator is
105121
/// overridden on the command line.
106122
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
107123
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
124+
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
108125

109126
static SGPRRegisterRegAlloc
110127
defaultSGPRRegAlloc("default",
@@ -121,6 +138,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
121138
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
122139
cl::desc("Register allocator to use for VGPRs"));
123140

141+
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
142+
RegisterPassParser<WWMRegisterRegAlloc>>
143+
WWMRegAlloc("wwm-regalloc", cl::Hidden,
144+
cl::init(&useDefaultRegisterAllocator),
145+
cl::desc("Register allocator to use for WWM registers"));
124146

125147
static void initializeDefaultSGPRRegisterAllocatorOnce() {
126148
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -140,6 +162,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
140162
}
141163
}
142164

165+
static void initializeDefaultWWMRegisterAllocatorOnce() {
166+
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
167+
168+
if (!Ctor) {
169+
Ctor = WWMRegAlloc;
170+
WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
171+
}
172+
}
173+
143174
static FunctionPass *createBasicSGPRRegisterAllocator() {
144175
return createBasicRegisterAllocator(onlyAllocateSGPRs);
145176
}
@@ -164,6 +195,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
164195
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
165196
}
166197

198+
/// Basic register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

/// Greedy register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

/// Fast register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs,
                                     /*ClearVirtRegs=*/false);
}
209+
167210
static SGPRRegisterRegAlloc basicRegAllocSGPR(
168211
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
169212
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -180,6 +223,15 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
180223

181224
static VGPRRegisterRegAlloc fastRegAllocVGPR(
182225
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
226+
227+
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
228+
"basic register allocator",
229+
createBasicWWMRegisterAllocator);
230+
static WWMRegisterRegAlloc
231+
greedyRegAllocWWMReg("greedy", "greedy register allocator",
232+
createGreedyWWMRegisterAllocator);
233+
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
234+
createFastWWMRegisterAllocator);
183235
} // anonymous namespace
184236

185237
static cl::opt<bool>
@@ -437,6 +489,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
437489
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
438490
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
439491
initializeAMDGPULowerBufferFatPointersPass(*PR);
492+
initializeAMDGPUReserveWWMRegsPass(*PR);
440493
initializeAMDGPURewriteOutArgumentsPass(*PR);
441494
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
442495
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -995,6 +1048,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
9951048

9961049
FunctionPass *createSGPRAllocPass(bool Optimized);
9971050
FunctionPass *createVGPRAllocPass(bool Optimized);
1051+
FunctionPass *createWWMRegAllocPass(bool Optimized);
9981052
FunctionPass *createRegAllocPass(bool Optimized) override;
9991053

10001054
bool addRegAssignAndRewriteFast() override;
@@ -1409,7 +1463,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
14091463
}
14101464

14111465
bool GCNPassConfig::addPreRewrite() {
1412-
addPass(&SILowerWWMCopiesID);
14131466
if (EnableRegReassign)
14141467
addPass(&GCNNSAReassignID);
14151468
return true;
@@ -1445,12 +1498,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
14451498
return createFastVGPRRegisterAllocator();
14461499
}
14471500

1501+
/// Build the register allocator pass for WWM registers.
///
/// An allocator registered as the WWMRegisterRegAlloc default is used when it
/// differs from the placeholder useDefaultRegisterAllocator; otherwise the
/// greedy allocator is created for \p Optimized builds and the fast allocator
/// for the rest.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Set up the global default exactly once.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Selected =
      WWMRegisterRegAlloc::getDefault();

  // The placeholder means no explicit allocator was chosen.
  if (Selected == useDefaultRegisterAllocator)
    return Optimized ? createGreedyWWMRegisterAllocator()
                     : createFastWWMRegisterAllocator();

  return Selected();
}
1515+
14481516
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
14491517
llvm_unreachable("should not be used");
14501518
}
14511519

14521520
static const char RegAllocOptNotSupportedMessage[] =
1453-
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1521+
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1522+
"and -vgpr-regalloc";
14541523

14551524
bool GCNPassConfig::addRegAssignAndRewriteFast() {
14561525
if (!usingDefaultRegAlloc())
@@ -1462,11 +1531,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
14621531

14631532
// Equivalent of PEI for SGPRs.
14641533
addPass(&SILowerSGPRSpillsID);
1534+
1535+
// To Allocate wwm registers used in whole quad mode operations (for shaders).
14651536
addPass(&SIPreAllocateWWMRegsID);
14661537

1467-
addPass(createVGPRAllocPass(false));
1538+
// For allocating other wwm register operands.
1539+
addPass(createWWMRegAllocPass(false));
14681540

14691541
addPass(&SILowerWWMCopiesID);
1542+
addPass(&AMDGPUReserveWWMRegsID);
1543+
1544+
// For allocating regular VGPRs.
1545+
addPass(createVGPRAllocPass(false));
1546+
14701547
return true;
14711548
}
14721549

@@ -1486,8 +1563,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14861563

14871564
// Equivalent of PEI for SGPRs.
14881565
addPass(&SILowerSGPRSpillsID);
1566+
1567+
// To Allocate wwm registers used in whole quad mode operations (for shaders).
14891568
addPass(&SIPreAllocateWWMRegsID);
14901569

1570+
// For allocating other whole wave mode registers.
1571+
addPass(createWWMRegAllocPass(true));
1572+
addPass(&SILowerWWMCopiesID);
1573+
addPass(createVirtRegRewriter(false));
1574+
addPass(&AMDGPUReserveWWMRegsID);
1575+
1576+
// For allocating regular VGPRs.
14911577
addPass(createVGPRAllocPass(true));
14921578

14931579
addPreRewrite();

llvm/lib/Target/AMDGPU/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ add_llvm_target(AMDGPUCodeGen
9999
AMDGPURegBankSelect.cpp
100100
AMDGPURegisterBankInfo.cpp
101101
AMDGPURemoveIncompatibleFunctions.cpp
102+
AMDGPUReserveWWMRegs.cpp
102103
AMDGPUResourceUsageAnalysis.cpp
103104
AMDGPURewriteOutArguments.cpp
104105
AMDGPURewriteUndefForPHI.cpp

0 commit comments

Comments
 (0)