Skip to content

Commit ac0f64f

Browse files
authored
[AMDGPU] Split vgpr regalloc pipeline (#93526)
Allocating wwm-registers and per-thread VGPR operands together imposes many challenges in the way the registers are reused during allocation. At times, regalloc reuses the registers of regular VGPR operations for wwm-operations within a small range, unintentionally clobbering their inactive lanes and causing correctness issues that are hard to trace. This patch splits the VGPR allocation pipeline further to allocate wwm-registers first and the regular VGPR operands in a separate pipeline. The split ensures that the physical registers used for wwm allocations do not take part in the next allocation pipeline, which avoids any such clobbering.
1 parent bfde178 commit ac0f64f

File tree

90 files changed

+10690
-11159
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+10690
-11159
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
184184
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
185185
}
186186

187+
/// Read-only access to the MachineFunction that the MF member points to.
const MachineFunction &getMF() const {
  return *MF;
}
188+
187189
//===--------------------------------------------------------------------===//
188190
// Function State
189191
//===--------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPU.h

+4
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
5757
ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
5858
FunctionPass *createAMDGPUCodeGenPreparePass();
5959
FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass();
60+
FunctionPass *createAMDGPUReserveWWMRegsPass();
6061
FunctionPass *createAMDGPURewriteOutArgumentsPass();
6162
ModulePass *
6263
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -154,6 +155,9 @@ struct AMDGPULowerBufferFatPointersPass
154155
const TargetMachine &TM;
155156
};
156157

158+
void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
159+
extern char &AMDGPUReserveWWMRegsID;
160+
157161
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
158162
extern char &AMDGPURewriteOutArgumentsID;
159163

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass should be invoked at the end of wwm-regalloc pipeline.
11+
/// It identifies the WWM regs allocated during this pipeline and adds
12+
/// them to the list of reserved registers so that they won't be available for
13+
/// per-thread VGPR allocation in the subsequent regalloc pipeline.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "AMDGPU.h"
18+
#include "GCNSubtarget.h"
19+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20+
#include "SIMachineFunctionInfo.h"
21+
#include "llvm/CodeGen/MachineFunctionPass.h"
22+
#include "llvm/CodeGen/VirtRegMap.h"
23+
#include "llvm/InitializePasses.h"
24+
25+
using namespace llvm;
26+
27+
#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
28+
29+
namespace {
30+
31+
class AMDGPUReserveWWMRegs : public MachineFunctionPass {
32+
public:
33+
static char ID;
34+
35+
AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
36+
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
37+
}
38+
39+
bool runOnMachineFunction(MachineFunction &MF) override;
40+
41+
StringRef getPassName() const override {
42+
return "AMDGPU Reserve WWM Registers";
43+
}
44+
45+
void getAnalysisUsage(AnalysisUsage &AU) const override {
46+
AU.setPreservesAll();
47+
MachineFunctionPass::getAnalysisUsage(AU);
48+
}
49+
};
50+
51+
} // End anonymous namespace.
52+
53+
INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
54+
"AMDGPU Reserve WWM Registers", false, false)
55+
56+
char AMDGPUReserveWWMRegs::ID = 0;
57+
58+
char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
59+
60+
/// Reserve the physical registers referenced by SI_SPILL_S32_TO_VGPR and
/// SI_RESTORE_S32_FROM_VGPR, clear the renamable flag on every operand of a
/// reserved WWM register, and finally clear the non-WWM regalloc mask.
///
/// \param MF the machine function to process.
/// \returns true if a register was reserved or a renamable flag was cleared.
bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      const unsigned Opc = MI.getOpcode();
      const bool IsSpill = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR;
      if (!IsSpill && Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        continue;

      // The register of interest is operand 0 of the spill pseudo and
      // operand 1 of the restore pseudo.
      Register Reg = MI.getOperand(IsSpill ? 0 : 1).getReg();

      assert(Reg.isPhysical() &&
             "All WWM registers should have been allocated by now.");

      MFI->reserveWWMRegister(Reg);
      Changed = true;
    }
  }

  // The renamable flag can't be set for reserved registers. Reset the flag for
  // MOs involving wwm-regs as they will be reserved during vgpr-regalloc
  // pipeline. Clearing a flag modifies the function, so it is reported
  // through the return value as well.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register Reg : MFI->getWWMReservedRegs()) {
    for (MachineOperand &MO : MRI.reg_operands(Reg)) {
      Changed |= MO.isRenamable();
      MO.setIsRenamable(false);
    }
  }

  // Now clear the NonWWMRegMask earlier set during wwm-regalloc.
  MFI->clearNonWWMRegAllocMask();

  return Changed;
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

+89-4
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
108108
: RegisterRegAllocBase(N, D, C) {}
109109
};
110110

111+
/// Registration node for allocator factories that can be selected for the
/// WWM register allocation phase. All behaviour comes from
/// RegisterRegAllocBase; the constructor only forwards its arguments.
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *Name, const char *Description,
                      FunctionPassCtor Ctor)
      : RegisterRegAllocBase(Name, Description, Ctor) {}
};
116+
111117
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
112118
const MachineRegisterInfo &MRI,
113119
const Register Reg) {
@@ -122,13 +128,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
122128
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
123129
}
124130

125-
/// -{sgpr|vgpr}-regalloc=... command line option.
131+
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
132+
const MachineRegisterInfo &MRI,
133+
const Register Reg) {
134+
const SIMachineFunctionInfo *MFI =
135+
MRI.getMF().getInfo<SIMachineFunctionInfo>();
136+
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
137+
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
138+
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
139+
}
140+
141+
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
126142
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
127143

128144
/// A dummy default pass factory indicates whether the register allocator is
129145
/// overridden on the command line.
130146
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
131147
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
148+
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
132149

133150
static SGPRRegisterRegAlloc
134151
defaultSGPRRegAlloc("default",
@@ -145,6 +162,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
145162
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
146163
cl::desc("Register allocator to use for VGPRs"));
147164

165+
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
166+
RegisterPassParser<WWMRegisterRegAlloc>>
167+
WWMRegAlloc("wwm-regalloc", cl::Hidden,
168+
cl::init(&useDefaultRegisterAllocator),
169+
cl::desc("Register allocator to use for WWM registers"));
148170

149171
static void initializeDefaultSGPRRegisterAllocatorOnce() {
150172
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -164,6 +186,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
164186
}
165187
}
166188

189+
static void initializeDefaultWWMRegisterAllocatorOnce() {
190+
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
191+
192+
if (!Ctor) {
193+
Ctor = WWMRegAlloc;
194+
WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
195+
}
196+
}
197+
167198
static FunctionPass *createBasicSGPRRegisterAllocator() {
168199
return createBasicRegisterAllocator(onlyAllocateSGPRs);
169200
}
@@ -188,6 +219,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
188219
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
189220
}
190221

222+
/// Factory for the basic register allocator, filtered by onlyAllocateWWMRegs.
static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(&onlyAllocateWWMRegs);
}
225+
226+
/// Factory for the greedy register allocator, filtered by onlyAllocateWWMRegs.
static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(&onlyAllocateWWMRegs);
}
229+
230+
/// Factory for the fast register allocator, filtered by onlyAllocateWWMRegs.
/// NOTE(review): the second argument is false here while the VGPR variant
/// passes true; presumably virtual registers must survive for the later
/// per-thread VGPR allocation -- confirm against createFastRegisterAllocator.
static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(&onlyAllocateWWMRegs,
                                     /*ClearVirtRegs=*/false);
}
233+
191234
static SGPRRegisterRegAlloc basicRegAllocSGPR(
192235
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
193236
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -204,6 +247,14 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
204247

205248
static VGPRRegisterRegAlloc fastRegAllocVGPR(
206249
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
250+
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
251+
"basic register allocator",
252+
createBasicWWMRegisterAllocator);
253+
static WWMRegisterRegAlloc
254+
greedyRegAllocWWMReg("greedy", "greedy register allocator",
255+
createGreedyWWMRegisterAllocator);
256+
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
257+
createFastWWMRegisterAllocator);
207258
} // anonymous namespace
208259

209260
static cl::opt<bool>
@@ -440,6 +491,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
440491
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
441492
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
442493
initializeAMDGPULowerBufferFatPointersPass(*PR);
494+
initializeAMDGPUReserveWWMRegsPass(*PR);
443495
initializeAMDGPURewriteOutArgumentsPass(*PR);
444496
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
445497
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -989,6 +1041,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
9891041

9901042
FunctionPass *createSGPRAllocPass(bool Optimized);
9911043
FunctionPass *createVGPRAllocPass(bool Optimized);
1044+
FunctionPass *createWWMRegAllocPass(bool Optimized);
9921045
FunctionPass *createRegAllocPass(bool Optimized) override;
9931046

9941047
bool addRegAssignAndRewriteFast() override;
@@ -1382,7 +1435,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
13821435
}
13831436

13841437
bool GCNPassConfig::addPreRewrite() {
1385-
addPass(&SILowerWWMCopiesID);
13861438
if (EnableRegReassign)
13871439
addPass(&GCNNSAReassignID);
13881440
return true;
@@ -1418,12 +1470,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
14181470
return createFastVGPRRegisterAllocator();
14191471
}
14201472

1473+
/// Create the register allocator pass for the WWM allocation phase.
///
/// \param Optimized when no allocator was selected explicitly, picks the
/// greedy allocator if true and the fast allocator otherwise.
/// \returns the newly created allocator pass.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Set up the global default exactly once.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  const RegisterRegAlloc::FunctionPassCtor SelectedCtor =
      WWMRegisterRegAlloc::getDefault();

  // The placeholder factory means nothing was selected explicitly, so choose
  // by optimization level.
  if (SelectedCtor == useDefaultRegisterAllocator)
    return Optimized ? createGreedyWWMRegisterAllocator()
                     : createFastWWMRegisterAllocator();

  return SelectedCtor();
}
1487+
14211488
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
14221489
llvm_unreachable("should not be used");
14231490
}
14241491

14251492
static const char RegAllocOptNotSupportedMessage[] =
1426-
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1493+
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1494+
"and -vgpr-regalloc";
14271495

14281496
bool GCNPassConfig::addRegAssignAndRewriteFast() {
14291497
if (!usingDefaultRegAlloc())
@@ -1435,11 +1503,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
14351503

14361504
// Equivalent of PEI for SGPRs.
14371505
addPass(&SILowerSGPRSpillsLegacyID);
1506+
1507+
// To allocate wwm registers used in whole quad mode operations (for shaders).
14381508
addPass(&SIPreAllocateWWMRegsID);
14391509

1440-
addPass(createVGPRAllocPass(false));
1510+
// For allocating other wwm register operands.
1511+
addPass(createWWMRegAllocPass(false));
14411512

14421513
addPass(&SILowerWWMCopiesID);
1514+
addPass(&AMDGPUReserveWWMRegsID);
1515+
1516+
// For allocating per-thread VGPRs.
1517+
addPass(createVGPRAllocPass(false));
1518+
14431519
return true;
14441520
}
14451521

@@ -1459,8 +1535,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14591535

14601536
// Equivalent of PEI for SGPRs.
14611537
addPass(&SILowerSGPRSpillsLegacyID);
1538+
1539+
// To allocate wwm registers used in whole quad mode operations (for shaders).
14621540
addPass(&SIPreAllocateWWMRegsID);
14631541

1542+
// For allocating other whole wave mode registers.
1543+
addPass(createWWMRegAllocPass(true));
1544+
addPass(&SILowerWWMCopiesID);
1545+
addPass(createVirtRegRewriter(false));
1546+
addPass(&AMDGPUReserveWWMRegsID);
1547+
1548+
// For allocating per-thread VGPRs.
14641549
addPass(createVGPRAllocPass(true));
14651550

14661551
addPreRewrite();

llvm/lib/Target/AMDGPU/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
9595
AMDGPURegBankSelect.cpp
9696
AMDGPURegisterBankInfo.cpp
9797
AMDGPURemoveIncompatibleFunctions.cpp
98+
AMDGPUReserveWWMRegs.cpp
9899
AMDGPUResourceUsageAnalysis.cpp
99100
AMDGPURewriteOutArguments.cpp
100101
AMDGPURewriteUndefForPHI.cpp

0 commit comments

Comments
 (0)