Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Change CF intrinsics lowering to reconverge on predecessors #108596

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
// GFX90A-CAS: flat_atomic_cmpswap
// GFX90A-CAS: s_cbranch_execnz
// GFX90A-CAS: s_cbranch_scc1
__device__ float atomic_add_cas(float *p) {
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
}
8 changes: 5 additions & 3 deletions llvm/include/llvm/Analysis/CFGPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,11 @@ struct DOTGraphTraits<DOTFuncInfo *> : public DefaultDOTGraphTraits {
unsigned OpNo = I.getSuccessorIndex();
const Instruction *TI = Node->getTerminator();
BasicBlock *SuccBB = TI->getSuccessor(OpNo);
auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB);
double WeightPercent = ((double)BranchProb.getNumerator()) /
((double)BranchProb.getDenominator());
// auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB);
// double WeightPercent = ((double)BranchProb.getNumerator()) /
// ((double)BranchProb.getDenominator());
double WeightPercent = 0.5;

std::string TTAttr =
formatv("tooltip=\"{0} -> {1}\\nProbability {2:P}\" ", getBBName(Node),
getBBName(SuccBB), WeightPercent);
Expand Down
3 changes: 1 addition & 2 deletions llvm/include/llvm/CodeGen/MachineBasicBlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,8 +879,7 @@ class MachineBasicBlock
/// debug. This is the correct point to insert copies at the beginning of a
/// basic block. \p Reg is the register being used by a spill or defined for a
/// restore/split during register allocation.
iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg = Register(),
bool SkipPseudoOp = true);
iterator SkipPHIsLabelsAndDebug(iterator I, bool SkipPseudoOp = true);

/// Returns an iterator to the first terminator instruction of this basic
/// block. If a terminator does not exist, it returns end().
Expand Down
3 changes: 1 addition & 2 deletions llvm/include/llvm/CodeGen/TargetInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -2058,8 +2058,7 @@ class TargetInstrInfo : public MCInstrInfo {
/// other instructions shall be inserted before it. This can be implemented
/// to prevent register allocator to insert spills for \p Reg before such
/// instructions.
virtual bool isBasicBlockPrologue(const MachineInstr &MI,
Register Reg = Register()) const {
virtual bool isBasicBlockPrologue(const MachineInstr &MI) const {
return false;
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3098,7 +3098,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
[llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
[IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,7 @@ class StatepointState {

if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) {
RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad);
auto EHPadInsertPoint =
EHPad->SkipPHIsLabelsAndDebug(EHPad->begin(), Reg);
auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin());
insertReloadBefore(Reg, EHPadInsertPoint, EHPad);
LLVM_DEBUG(dbgs() << "...also reload at EHPad "
<< printMBBReference(*EHPad) << "\n");
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/InlineSpiller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
MachineBasicBlock::iterator MII;
if (SrcVNI->isPHIDef())
MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin(), SrcReg);
MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin());
else {
MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
assert(DefMI && "Defining instruction disappeared");
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/CodeGen/MachineBasicBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,13 +223,13 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {

MachineBasicBlock::iterator
MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I,
Register Reg, bool SkipPseudoOp) {
bool SkipPseudoOp) {
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();

iterator E = end();
while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
(SkipPseudoOp && I->isPseudoProbe()) ||
TII->isBasicBlockPrologue(*I, Reg)))
TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels / dbg_values
// inside the bundle.
Expand Down
6 changes: 2 additions & 4 deletions llvm/lib/CodeGen/SplitKit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -806,10 +806,8 @@ SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
return Start;
}

unsigned RegIdx = 0;
Register Reg = LIS.getInterval(Edit->get(RegIdx)).reg();
VNInfo *VNI = defFromParent(RegIdx, ParentVNI, Start, MBB,
MBB.SkipPHIsLabelsAndDebug(MBB.begin(), Reg));
VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
RegAssign.insert(Start, VNI->def, OpenIdx);
LLVM_DEBUG(dump());
return VNI->def;
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1551,11 +1551,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
.add(MI.getOperand(1));

Register Reg = MI.getOperand(1).getReg();
Expand Down Expand Up @@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_wave_reconverge:
return selectWaveReconvergeIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;

bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
Expand Down
22 changes: 10 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned MovExecOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const unsigned MovExecTermOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
Expand Down Expand Up @@ -949,27 +947,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(

B.setInsertPt(*BodyBB, BodyBB->end());

Register LoopMask = MRI.createVirtualRegister(
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
.addReg(ExecReg)
.addReg(NewExec);
.addDef(LoopMask)
.addReg(ExecReg)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?

// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
.addReg(LoopMask)
.addReg(NewExec)
.addMBB(LoopBB);

// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
.addReg(ExecReg);

// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
B.buildInstr(MovExecTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);

// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
Expand Down Expand Up @@ -4967,7 +4965,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_end_cf: {
case Intrinsic::amdgcn_wave_reconverge: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
Expand Down
83 changes: 46 additions & 37 deletions llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
Expand Down Expand Up @@ -55,7 +56,7 @@ class SIAnnotateControlFlow {
Function *Else;
Function *IfBreak;
Function *Loop;
Function *EndCf;
Function *WaveReconverge;

DominatorTree *DT;
StackVector Stack;
Expand Down Expand Up @@ -88,7 +89,7 @@ class SIAnnotateControlFlow {

bool handleLoop(BranchInst *Term);

bool closeControlFlow(BasicBlock *BB);
bool tryWaveReconverge(BasicBlock *BB);

public:
SIAnnotateControlFlow(Module &M, const GCNSubtarget &ST, DominatorTree &DT,
Expand Down Expand Up @@ -123,7 +124,8 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
{ IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
WaveReconverge = Intrinsic::getDeclaration(
&M, Intrinsic::amdgcn_wave_reconverge, {IntMask});
}

/// Is the branch condition uniform or did the StructurizeCFG pass
Expand Down Expand Up @@ -185,8 +187,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {

/// Open a new "If" block
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return false;

IRBuilder<> IRB(Term);
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
Expand Down Expand Up @@ -287,43 +287,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
}

/// Close the last opened control flow
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);

assert(Stack.back().first == BB);
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {

if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
if (succ_empty(BB))
return false;

BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (Term->getNumSuccessors() == 1) {
// The current BBs single successor is a top of the stack. We need to
// reconverge over thaqt path.
BasicBlock *SingleSucc = *succ_begin(BB);
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
if (isTopOfStack(SingleSucc)) {
Value *Exec = Stack.back().second;
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
}
} else {
// We have a uniform conditional branch terminating the block.
// THis block may be the last in the Then path of the enclosing divergent
// IF.
if (!isUniform(Term))
// Divergent loop is going to be further processed in another place
return false;

for (auto Succ : Term->successors()) {
if (isTopOfStack(Succ)) {
// Just split to make a room for further WAVE_RECONVERGE insertion
SmallVector<BasicBlock *, 2> Preds;
for (BasicBlock *Pred : predecessors(BB)) {
if (!is_contained(Latches, Pred))
Preds.push_back(Pred);
for (auto P : predecessors(Succ)) {
if (DT->dominates(BB, P))
Preds.push_back(P);
}

BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr,
false);
}

Value *Exec = popSaved();
BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
Instruction *ExecDef = cast<Instruction>(Exec);
BasicBlock *DefBB = ExecDef->getParent();
if (!DT->dominates(DefBB, BB)) {
// Split edge to make Def dominate Use
FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
}
IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
// TODO: StructurizeCFG 'Flow' blocks have debug locations from the
// condition, for now just avoid copying these DebugLocs so that stepping
// out of the then/else block in a debugger doesn't step to the condition.
IRB.SetCurrentDebugLocation(DebugLoc());
IRB.CreateCall(EndCf, {Exec});
}

return true;
Expand All @@ -341,14 +341,18 @@ bool SIAnnotateControlFlow::run(Function &F) {

if (!Term || Term->isUnconditional()) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();
Changed |= tryWaveReconverge(BB);

continue;
}

if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();
// Let's take care of uniform loop latch that may be closing the Then
// path of the enclosing divergent branch.
Changed |= tryWaveReconverge(BB);

if (DT->dominates(Term->getSuccessor(1), BB))
Changed |= handleLoop(Term);
Expand All @@ -363,9 +367,14 @@ bool SIAnnotateControlFlow::run(Function &F) {
continue;
}

Changed |= closeControlFlow(BB);
Stack.pop_back();
}

if (isUniform(Term))
// Uniform conditional branch may be in the block that closes the Then
// path of the divergent conditional branch.
Changed |= tryWaveReconverge(BB);
else
Changed |= openIf(Term);
}

Expand Down
Loading
Loading