[AMDGPU] Split vgpr regalloc pipeline #93526
Conversation
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-x86
Author: Christudasan Devadasan (cdevadas)
Changes
Allocating wwm-registers and per-thread VGPR operands together imposes many challenges in the way the registers are reused during allocation. There are times when regalloc reuses registers of regular VGPR operations for wwm-operations within a small range, unintentionally clobbering their inactive lanes and causing correctness issues that are hard to trace.
This patch splits the VGPR allocation pipeline further to allocate wwm-registers first and the regular VGPR operands in a separate pipeline. The split ensures that the physical registers used for wwm allocations won't take part in the subsequent allocation pipeline, avoiding any such clobbering.
Patch is 1.41 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93526.diff 85 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 09d9a0b4ec402..01d91982ae1c7 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
}
+ const MachineFunction &getMF() const { return *MF; }
+
//===--------------------------------------------------------------------===//
// Function State
//===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index 757ca8e112eec..f3423083eef3a 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -10,22 +10,25 @@
#define LLVM_CODEGEN_REGALLOCCOMMON_H
#include <functional>
+#include <llvm/CodeGen/Register.h>
namespace llvm {
class TargetRegisterClass;
class TargetRegisterInfo;
+class MachineRegisterInfo;
typedef std::function<bool(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC)> RegClassFilterFunc;
+ const MachineRegisterInfo &MRI, const Register Reg)>
+ RegClassFilterFunc;
/// Default register class filter function for register allocation. All virtual
/// registers should be allocated.
static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
- const TargetRegisterClass &) {
+ const MachineRegisterInfo &,
+ const Register) {
return true;
}
-
}
#endif // LLVM_CODEGEN_REGALLOCCOMMON_H
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index d0dec372f6896..a4645ed93029d 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -181,8 +181,7 @@ void RegAllocBase::enqueue(const LiveInterval *LI) {
if (VRM->hasPhys(Reg))
return;
- const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
- if (ShouldAllocateClass(*TRI, RC)) {
+ if (ShouldAllocateClass(*TRI, *MRI, Reg)) {
LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
enqueueImpl(LI);
} else {
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 6740e1f0edb4f..f6419daba6a2d 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -417,8 +417,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
bool RegAllocFast::shouldAllocateRegister(const Register Reg) const {
assert(Reg.isVirtual());
- const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
- return ShouldAllocateClass(*TRI, RC);
+ return ShouldAllocateClass(*TRI, *MRI, Reg);
}
void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 348277224c7ae..b3bf1899ceeaf 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2306,9 +2306,9 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
if (Reg.isPhysical())
continue;
- // This may be a skipped class
+ // This may be a skipped register.
if (!VRM->hasPhys(Reg)) {
- assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+ assert(!ShouldAllocateClass(*TRI, *MRI, Reg) &&
"We have an unallocated variable which should have been handled");
continue;
}
@@ -2698,7 +2698,7 @@ bool RAGreedy::hasVirtRegAlloc() {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
if (!RC)
continue;
- if (ShouldAllocateClass(*TRI, *RC))
+ if (ShouldAllocateClass(*TRI, *MRI, Reg))
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6016bd5187d88..cd9f3fb162fd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -149,6 +150,9 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};
+void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
+extern char &AMDGPUReserveWWMRegsID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
new file mode 100644
index 0000000000000..5ed8cd4231d00
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to the reserved regs list
+//---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass should be invoked at the end of wwm-regalloc pipeline.
+/// It identifies the WWM regs allocated during this pipeline and add
+/// them to the list of reserved registers so that they won't be available for
+/// per-thread VGPR allocation in the subsequent regalloc pipeline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
+
+namespace {
+
+class AMDGPUReserveWWMRegs : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
+ initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Reserve WWM Registers";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
+ "AMDGPU Reserve WWM Registers", false, false)
+
+char AMDGPUReserveWWMRegs::ID = 0;
+
+char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
+
+bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR &&
+ Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
+ continue;
+
+ Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR
+ ? MI.getOperand(0).getReg()
+ : MI.getOperand(1).getReg();
+
+ assert(Reg.isPhysical() &&
+ "All WWM registers should have been allocated by now.");
+
+ MFI->reserveWWMRegister(Reg);
+ Changed |= true;
+ }
+ }
+
+ // Reset the renamable flag for MOs involving wwm-regs to get rid of the MIR
+ // Verifier error.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() &&
+ llvm::is_contained(MFI->getWWMReservedRegs(), Reg))
+ MO.setIsRenamable(false);
+ }
+ }
+ }
+
+ // Now clear the NonWWMRegMask earlier set during wwm-regalloc.
+ MFI->clearNonWWMRegAllocMask();
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..e3375c758b8d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -82,24 +82,44 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
: RegisterRegAllocBase(N, D, C) {}
};
+class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
+public:
+ WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC) {
- return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC) {
- return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
+static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const SIMachineFunctionInfo *MFI =
+ MRI.getMF().getInfo<SIMachineFunctionInfo>();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
+ MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+}
-/// -{sgpr|vgpr}-regalloc=... command line option.
+/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
@@ -116,6 +136,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));
+static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<WWMRegisterRegAlloc>>
+ WWMRegAlloc("wwm-regalloc", cl::Hidden,
+ cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for WWM registers"));
static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -135,6 +160,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
}
}
+static void initializeDefaultWWMRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = WWMRegAlloc;
+ WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
+ }
+}
+
static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
@@ -159,6 +193,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
+static FunctionPass *createBasicWWMRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createGreedyWWMRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createFastWWMRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
+}
+
static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -175,7 +221,15 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
-}
+static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
+ "basic register allocator",
+ createBasicWWMRegisterAllocator);
+static WWMRegisterRegAlloc
+ greedyRegAllocWWMReg("greedy", "greedy register allocator",
+ createGreedyWWMRegisterAllocator);
+static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
+ createFastWWMRegisterAllocator);
+} // namespace
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
@@ -424,6 +478,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
+ initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -923,6 +978,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
+ FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;
bool addRegAssignAndRewriteFast() override;
@@ -1331,7 +1387,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
- addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
@@ -1367,12 +1422,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
return createFastVGPRRegisterAllocator();
}
+FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
+ initializeDefaultWWMRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyWWMRegisterAllocator();
+
+ return createFastWWMRegisterAllocator();
+}
+
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}
static const char RegAllocOptNotSupportedMessage[] =
- "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+ "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
+ "and -vgpr-regalloc";
bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
@@ -1384,11 +1455,20 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+
+ // To Allocate wwm registers used in whole quad mode operations (for pixel
+ // shaders).
addPass(&SIPreAllocateWWMRegsID);
- addPass(createVGPRAllocPass(false));
+ // For allocating other wwm register operands.
+ addPass(createWWMRegAllocPass(false));
addPass(&SILowerWWMCopiesID);
+ addPass(&AMDGPUReserveWWMRegsID);
+
+ // For allocating per-thread VGPRs.
+ addPass(createVGPRAllocPass(false));
+
return true;
}
@@ -1408,8 +1488,18 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+
+ // To Allocate wwm registers used in whole quad mode operations (for pixel
+ // shaders).
addPass(&SIPreAllocateWWMRegsID);
+ // For allocating other whole wave mode registers.
+ addPass(createWWMRegAllocPass(true));
+ addPass(&SILowerWWMCopiesID);
+ addPass(createVirtRegRewriter(false));
+ addPass(&AMDGPUReserveWWMRegsID);
+
+ // For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));
addPreRewrite();
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c992352cb78da..178af07048571 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -94,6 +94,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPURemoveIncompatibleFunctions.cpp
+ AMDGPUReserveWWMRegs.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eae666ab0e7d7..fbb6c3d9fe24b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1555,6 +1555,17 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
}
}
+// Mark all WWM VGPRs as BB LiveIns.
+static void addWwmRegBBLiveIn(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &Reg : MFI->getWWMReservedRegs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+}
+
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedVGPRs,
@@ -1567,11 +1578,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
return;
- MFI->shiftSpillPhysVGPRsToLowestRange(MF);
-
TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
- if (MFI->isEntryFunction())
- return;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1581,19 +1588,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
MachineInstr *ReturnMI = nullptr;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- // WRITELANE instructions used for SGPR spills can overwrite the inactive
- // lanes of VGPRs and callee must spill and restore them even if they are
- // marked Caller-saved.
-
- // TODO: Handle this elsewhere at an early point. Walking through all MBBs
- // here would be a bad heuristic. A better way should be by calling
- // allocateWWMSpill during the regalloc pipeline whenever a physical
- // register is allocated for the intended virtual registers.
- if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
- MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
- else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
- MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
- else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
+ // TODO: Walking through all MBBs here would be a bad heuristic. Better
+ // handle them elsewhere.
+ if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
@@ -1608,6 +1605,25 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ SmallVector<Register> SortedWWMVGPRs;
+ for (auto &Reg : MFI->getWWMReservedRegs()) {
+ // The shift-back is needed only for the VGPRs used for SGPR spills and they
+ // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM
+ // reserved registers.
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+ if (TRI->getRegSizeInBits(*RC) > 32)
+ continue;
+ SortedWWMVGPRs.push_back(Reg);
+...
[truncated]
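The core of the change is visible in the RegAllocCommon.h hunk above: the RegClassFilterFunc callback now receives the MachineRegisterInfo and a Register instead of a bare register class, so a target can filter on per-virtual-register state such as the WWM_REG flag checked by onlyAllocateWWMRegs. Below is a minimal standalone model of that idea; it is not LLVM code, and every type and name in it is a simplified stand-in. It illustrates three points from the patch: filters select individual registers rather than classes, the wwm phase runs before the per-thread VGPR phase, and physical registers taken by the wwm phase stay reserved afterwards.

// Toy model of the split regalloc pipeline -- NOT LLVM code. Every type and
// name here is a simplified stand-in for illustration only.
#include <functional>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct VirtReg {
  std::string Name;
  bool IsSGPR = false;
  bool IsWWM = false; // stands in for the WWM_REG virtual-register flag
};

// Mirrors the shape of the new filter: decide per register, not per class.
using RegFilter = std::function<bool(const VirtReg &)>;

// One allocation phase: give the lowest free physical id to every virtual
// register the filter accepts, skipping ids reserved by earlier phases.
void runPhase(const std::string &Phase, const std::vector<VirtReg> &Regs,
              const RegFilter &Filter, std::set<int> &Reserved) {
  int NextPhys = 0;
  for (const VirtReg &VR : Regs) {
    if (!Filter(VR))
      continue; // left for a later phase, like the "skipped register" case
    while (Reserved.count(NextPhys))
      ++NextPhys;
    std::cout << Phase << ": " << VR.Name << " -> v" << NextPhys << "\n";
    Reserved.insert(NextPhys); // the analogue of AMDGPUReserveWWMRegs
  }
}

int main() {
  // One wwm register (e.g. holding an SGPR spill) and two per-thread values.
  std::vector<VirtReg> Regs = {
      {"%wwm0", false, true}, {"%a", false, false}, {"%b", false, false}};
  std::set<int> Reserved;
  // Phase order matches the patch: wwm registers first, per-thread VGPRs next.
  // (SGPR allocation is omitted here; it targets a separate register file.)
  runPhase("wwm-regalloc", Regs,
           [](const VirtReg &R) { return !R.IsSGPR && R.IsWWM; }, Reserved);
  runPhase("vgpr-regalloc", Regs,
           [](const VirtReg &R) { return !R.IsSGPR && !R.IsWWM; }, Reserved);
}

Compiled as C++17, this assigns v0 to the wwm register first and then v1/v2 to the per-thread values; v0 is never reused, which is the guarantee the split pipeline provides.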
This patch replaces PR #86012; the two generic features introduced there were strongly discouraged. It instead splits the VGPR allocation further to handle the wwm-allocation in a target-specific way.
Some tests are marked XFAIL for now; they should be fixed.
Looks good to me
Added a negative test llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll to capture the error during wwm-regalloc.
Title should be fixed before merge.
Can this finally land after 735a5f6?
Yes. Will be landing in a few minutes.
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/6661
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/146/builds/1277
The failure was due to a Blender performance degradation of about 10%. Since this patch is critical for correctness, we won't revert it over the perf degradation; instead, we will address it with a follow-up fix. For now, the failure has been silenced by rebasing the perf results in the buildbot.
With llvm#93526 we split the regalloc pipeline further to have a standalone allocation for wwm registers and per-lane VGPRs. Currently, wwm-spill reloads inserted at the top of a basic block prevent the isBasicPrologue check, during the per-lane VGPR regalloc, from skipping past the exec manipulation instructions, which ended up causing incorrect codegen. The wwm-spills inserted during the wwm-regalloc pipeline should also be included in the bb-prolog so that the per-lane VGPR regalloc pipeline can identify the appropriate insertion points for its spills and copies.
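As a rough illustration of the shape of that fix (simplified stand-in types, not the real MachineInstr/SIInstrInfo interfaces), the block-prologue scan needs to accept wwm-spill reloads in addition to exec-mask setup, so that insertion points are computed below both:

// Toy sketch -- NOT the real SIInstrInfo change. Kinds and types are
// hypothetical stand-ins used only to illustrate the prologue-scan fix.
#include <cstddef>
#include <vector>

enum class Kind { ExecSetup, WwmReload, Ordinary };
struct Inst { Kind K; };

// Returns the index of the first non-prologue instruction in a block.
// Before the fix (IncludeWwmReloads == false), the scan would stop at a
// wwm reload near the block top, so later spill and copy insertion points
// could be computed incorrectly relative to the exec-mask setup.
std::size_t firstNonPrologue(const std::vector<Inst> &Block,
                             bool IncludeWwmReloads) {
  std::size_t I = 0;
  while (I < Block.size() &&
         (Block[I].K == Kind::ExecSetup ||
          (IncludeWwmReloads && Block[I].K == Kind::WwmReload)))
    ++I;
  return I;
}

The real change lives in the AMDGPU target's prologue detection; the point here is only the additional instruction kind the scan must accept.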
When spilling a VGPR in `emitPrologue`, chain functions prefer to use offsets to access the stack instead of the SP. This patch fixes `emitEpilogue` to do the same. It also brings back some test coverage that was lost in #93526, when WWM registers started being shifted to the lowest available range (which meant that tests that were originally spilling v8 would shift to spill v0, which is a scratch register for chain functions and didn't get spilled). Change-Id: Icb07fccd859b563cd45f74c25ae578ecb38bdeeb