From 2f11810c70b7e22ce3b753fcba535e52edad4f05 Mon Sep 17 00:00:00 2001 From: anikelal Date: Tue, 25 Nov 2025 15:08:39 +0530 Subject: [PATCH] NFCs for nonSSA Exec Mask manipulation instrs --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 1 + llvm/lib/CodeGen/MachineBasicBlock.cpp | 7 + .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 153 +++++++++++++++--- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 132 ++++++++++++--- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h | 16 +- 5 files changed, 260 insertions(+), 49 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index a1023d4c32ce4..18aad996e07c5 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -263,6 +263,7 @@ class MachineBasicBlock /// Return the name of the corresponding LLVM basic block, or an empty string. LLVM_ABI StringRef getName() const; + LLVM_ABI std::string name() const; /* Label of the form "BB.<number>." followed by the IR basic block's name when this block has one. */ /// Return a formatted string to identify this block and its parent function. LLVM_ABI std::string getFullName() const; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index be94e1e6d25b6..f2e6c8557b741 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -328,6 +328,13 @@ StringRef MachineBasicBlock::getName() const { return StringRef("", 0); } +std::string MachineBasicBlock::name() const { /* Builds "BB.<getNumber()>." and appends getBasicBlock()->getName() only when getBasicBlock() is non-null. */ + std::string Name = ("BB." + Twine(getNumber()) + ".").str(); + if (getBasicBlock()) + Name += getBasicBlock()->getName(); + return Name; +} + /// Return a hopefully unique identifier for this block. 
std::string MachineBasicBlock::getFullName() const { std::string Name; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index ceda928f202f5..90cb044285bae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -153,6 +153,42 @@ struct WaveNode { Out << "'; }); } + + std::string getName() const { /* Returns Block->name() when Block is set, with a trailing "." when FlowNum is also set; returns an empty string when Block is null. Const because it only reads members. */ + std::string str; + if (Block) + str = Block->name(); + if (Block && FlowNum) + str = str + "."; + if (FlowNum) + str = str + ""; /* NOTE(review): this appends an empty string, so FlowNum never appears in the result - confirm the intended suffix. */ + return str; + } + void dump() const { /* Debug helper: writes this node's printable name, OrderIndex, predecessors, successors, lane edges and LatestPostDom to dbgs(). */ + dbgs() << "--------------------------------" << '\n'; + dbgs() << "WaveNode: " << printableName() << '\n'; + dbgs() << " OrderIndex: " << OrderIndex << '\n'; + dbgs() << " Predecessors: "; + for (WaveNode *Pred : Predecessors) + dbgs() << Pred->printableName() << ' '; + dbgs() << '\n'; + dbgs() << " Successors: "; + for (WaveNode *Succ : Successors) + dbgs() << Succ->printableName() << ' '; + dbgs() << '\n'; + dbgs() << " LanePredecessors: "; + for (const LaneEdge &LanePred : LanePredecessors) + dbgs() << "(lane=" << LanePred.Lane->printableName() + << ", wave=" << LanePred.Wave->printableName() << ") "; + dbgs() << '\n'; + dbgs() << " LaneSuccessors: "; + for (const LaneEdge &LaneSucc : LaneSuccessors) + dbgs() << "(lane=" << LaneSucc.Lane->printableName() + << ", wave=" << LaneSucc.Wave->printableName() << ") "; + if(LatestPostDom != nullptr) dbgs() << "\nlatestPostDom:" << LatestPostDom->printableName(); + else dbgs() << "\nlatestPostDom:NULL"; /* leading newline matches the non-null branch so this field does not run onto the LaneSuccessors line */ + dbgs() << "\n--------------------------------\n\n"; + } }; /// \brief Helper class for making a CFG reconverging. 
@@ -1545,6 +1581,16 @@ class ControlFlowRewriter { explicit LaneOriginInfo(WaveNode *Node, Register CondReg = {}, bool InvertCondition = false) : Node(Node), CondReg(CondReg), InvertCondition(InvertCondition) {} + + friend llvm::raw_ostream& operator<<(llvm::raw_ostream& os, const LaneOriginInfo& loi) { /* Stream printer: emits "{WaveNode=<printable name or nullptr>, CondReg=<register id>, InvertCond:<flag>}" and returns os. */ + if(loi.Node == nullptr) + os << "{WaveNode=nullptr"; + else + os << "{WaveNode="<< loi.Node->printableName(); + os << ", CondReg=" << loi.CondReg.id() << ", InvertCond:" << loi.InvertCondition << "}"; + return os; + } + }; struct CFGNodeInfo { @@ -1574,6 +1620,29 @@ class ControlFlowRewriter { Register PrimarySuccessorExec; explicit CFGNodeInfo(WaveNode *Node) : Node(Node) {} + + friend llvm::raw_ostream& operator<<(llvm::raw_ostream& os, const CFGNodeInfo& ni) { /* Stream printer: emits one field per line (OrigExit, OrigCondition id, OrigSuccCond, OrigSuccFinal, PrimarySuccessorExec id), then the OriginBranch pairs and the origins list; null successor pointers print as "nullptr". */ + os << "CFGNodeInfo{\nOrigExit=" << ni.OrigExit << ", \nOrigCondition=" << ni.OrigCondition.id(); + if(ni.OrigSuccCond == nullptr) + os << ", \nOrigSuccCond=nullptr"; + else + os << ", \nOrigSuccCond=" << ni.OrigSuccCond->printableName(); + if(ni.OrigSuccFinal == nullptr) + os << ", \nOrigSuccFinal=nullptr"; + else + os << ", \nOrigSuccFinal=" << ni.OrigSuccFinal->printableName(); + os << ", \nPrimarySuccessorExec=" << ni.PrimarySuccessorExec.id(); + os << ", \nOriginBranch(Ri)={"; + for (const auto &E : ni.OriginBranch) { /* each entry prints as (node name, getInt() value) */ + os << "(" << E.getPointer()->printableName() << "," << E.getInt() << "),"; + } + os << "}, \norigins(Ti):{"; + for (const auto &E : ni.origins) { + os << E << ","; + } + return os << "}\n}\n"; + } + }; /// Information required to synthesize divergent terminators with a common @@ -1784,6 +1853,7 @@ void ControlFlowRewriter::prepareWaveCfg() { /// establishing wave-level control flow and insert instructions for EXEC mask /// manipulation. 
void ControlFlowRewriter::rewrite() { + LLVM_DEBUG(dbgs() << "\nrewrite() begins\n"); GCNLaneMaskAnalysis LMA(Function); const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts(); @@ -1839,7 +1909,10 @@ void ControlFlowRewriter::rewrite() { Opcode = AMDGPU::S_CBRANCH_SCC1; } else { Register CondReg = Info.OrigCondition; - if (!LMA.isSubsetOfExec(CondReg, *Node->Block)) { + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(CondReg, *Node->Block); + LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); + + if (!isCondRegSubsetOfExec) { CondReg = LMU.createLaneMaskReg(); BuildMI(*Node->Block, Node->Block->end(), {}, TII.get(LMC.AndOpc), CondReg) @@ -1867,7 +1940,9 @@ void ControlFlowRewriter::rewrite() { .addMBB(Other->Block); } } - + LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_pre\n"); + LLVM_DEBUG(Function.dump()); + LLVM_DEBUG(dbgs() << "CFG_END:" << Function.getName().str() << "_pre\n"); // Step 2: Insert lane masks and new terminators for divergent nodes. // // RegMap maps (block, register) -> (masked, inverted). @@ -1879,16 +1954,20 @@ void ControlFlowRewriter::rewrite() { Updater.setAccumulating(true); for (WaveNode *LaneTarget : NodeOrder) { + LLVM_DEBUG(dbgs() << "\nPROCESSING NODE:" << LaneTarget->printableName() << "\n\n"); + LaneTarget->dump(); CFGNodeInfo &LaneTargetInfo = NodeInfo.find(LaneTarget)->second; + LLVM_DEBUG(dbgs() << LaneTargetInfo << '\n'); if (!llvm::any_of( LaneTargetInfo.OriginBranch, [](const auto &OriginBranch) { return OriginBranch.getInt(); })) { // No divergent branches towards this node, nothing to be done. 
+ LLVM_DEBUG(dbgs() << "No divergent branches towards this node, nothing to be done.\n"); continue; } - LLVM_DEBUG(dbgs() << "\nDivergent branches for " + LLVM_DEBUG(dbgs() << "Divergent branches for " << LaneTarget->printableName() << '\n'); // Step 2.1: Add conditions branching to LaneTarget to the Lane mask @@ -1896,14 +1975,21 @@ void ControlFlowRewriter::rewrite() { // FIXME: we are creating a register here only to initialize the updater Updater.init(LMU.createLaneMaskReg()); Updater.addReset(*LaneTarget->Block, GCNLaneMaskUpdater::ResetInMiddle); + LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << LaneTarget->printableName() << '\n'); for (const auto &NodeDivergentPair : LaneTargetInfo.OriginBranch) { + LLVM_DEBUG(dbgs() << "Mark ResetAtEnd(Ri): " << NodeDivergentPair.getPointer()->printableName() << '\n'); Updater.addReset(*NodeDivergentPair.getPointer()->Block, GCNLaneMaskUpdater::ResetAtEnd); } - + LLVM_DEBUG(dbgs() << "Iterating over Ti\n\n"); for (const LaneOriginInfo &LaneOrigin : LaneTargetInfo.origins) { Register CondReg; + LLVM_DEBUG(dbgs() << "\nOrigin(Ti): " << LaneOrigin << '\n'); + if(LaneOrigin.CondReg){ + dbgs() << "LaneOrigin.CondReg:" << printReg(LaneOrigin.CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + } + if (!LaneOrigin.CondReg) { assert(!LaneOrigin.InvertCondition); CondReg = getAllOnes(); @@ -1927,29 +2013,30 @@ void ControlFlowRewriter::rewrite() { LaneOrigin.Node->Block->getFirstTerminator(), {}, TII.get(LMC.CSelectOpc), CondReg) .addReg(LMC.ExecReg) - .addImm(0); + .addImm(0)->dump(); } else { BuildMI(*LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator(), {}, TII.get(LMC.CSelectOpc), CondReg) .addImm(0) - .addReg(LMC.ExecReg); + .addReg(LMC.ExecReg)->dump(); } } else { CondReg = LaneOrigin.CondReg; - if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block)) { + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block); + LLVM_DEBUG(dbgs() << "isSubsetOfExec(" 
<< printReg(LaneOrigin.CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << LaneOrigin.Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); + if (!isCondRegSubsetOfExec) { Register Prev = CondReg; CondReg = LMU.createLaneMaskReg(); BuildMI(*LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator(), {}, TII.get(LMC.AndOpc), CondReg) .addReg(LMC.ExecReg) - .addReg(Prev); + .addReg(Prev)->dump(); RegMap[std::make_pair(LaneOrigin.Node->Block, LaneOrigin.CondReg)] .first = CondReg; } - if (LaneOrigin.InvertCondition) { // CondReg = EXEC ^ origCond; // @@ -1965,7 +2052,7 @@ void ControlFlowRewriter::rewrite() { LaneOrigin.Node->Block->getFirstTerminator(), {}, TII.get(LMC.XorOpc), CondReg) .addReg(LaneOrigin.CondReg) - .addImm(-1); + .addImm(-1)->dump(); RegMap[std::make_pair(LaneOrigin.Node->Block, LaneOrigin.CondReg)] .second = CondReg; @@ -1975,20 +2062,23 @@ void ControlFlowRewriter::rewrite() { } LLVM_DEBUG( - dbgs() << " available @ " << LaneOrigin.Node->printableName() << ": " + dbgs() << " Contributions @ " << LaneOrigin.Node->printableName() << ": " << printReg(CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << '\n'); Updater.addAvailable(*LaneOrigin.Node->Block, CondReg); } + LLVM_DEBUG(dbgs() << "Iterating over Ri\n\n"); // Step 2.2: Synthesize EXEC updates and branch instructions. 
for (const auto &NodeDivergentPair : LaneTargetInfo.OriginBranch) { if (!NodeDivergentPair.getInt()) continue; // not a divergent branch - + LLVM_DEBUG(dbgs() << "Synthesize EXEC updates and branch instructions for " << NodeDivergentPair.getPointer()->printableName() << "\n"); WaveNode *OriginNode = NodeDivergentPair.getPointer(); CFGNodeInfo &OriginCFGNodeInfo = NodeInfo.find(OriginNode)->second; + LLVM_DEBUG(dbgs() << OriginCFGNodeInfo << '\n'); + OriginCFGNodeInfo.PrimarySuccessorExec = Updater.getValueAfterMerge(*OriginNode->Block); @@ -2001,37 +2091,46 @@ void ControlFlowRewriter::rewrite() { BuildMI(*OriginNode->Block, OriginNode->Block->end(), {}, TII.get(LMC.MovTermOpc), LMC.ExecReg) - .addReg(OriginCFGNodeInfo.PrimarySuccessorExec); + .addReg(OriginCFGNodeInfo.PrimarySuccessorExec)->dump(); BuildMI(*OriginNode->Block, OriginNode->Block->end(), {}, - TII.get(AMDGPU::SI_WAVE_CF_EDGE)); + TII.get(AMDGPU::SI_WAVE_CF_EDGE))->dump(); BuildMI(*OriginNode->Block, OriginNode->Block->end(), {}, TII.get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(OriginNode->Successors[1]->Block); + .addMBB(OriginNode->Successors[1]->Block)->dump(); BuildMI(*OriginNode->Block, OriginNode->Block->end(), {}, TII.get(AMDGPU::S_BRANCH)) - .addMBB(OriginNode->Successors[0]->Block); + .addMBB(OriginNode->Successors[0]->Block)->dump(); + + LLVM_DEBUG(dbgs() << "\nNodeDivergentPair:" << NodeDivergentPair.getPointer()->printableName() << "," << NodeDivergentPair.getInt() << " complete...\n"); } + LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_" << LaneTarget->getName() << "\n"); LLVM_DEBUG(Function.dump()); + LLVM_DEBUG(dbgs() << "CFG_END:" << Function.getName().str() << "_" << LaneTarget->getName() << "\n"); + } + LLVM_DEBUG(dbgs() << "\nInsert rejoin masks\n"); // Step 3: Insert rejoin masks. 
+ LLVM_DEBUG(dbgs() << "Iterate over secondary nodes\n"); for (WaveNode *Secondary : ReconvergeCfg.nodes()) { if (!Secondary->IsSecondary) continue; LLVM_DEBUG(dbgs() << "\nRejoin @ " << Secondary->printableName() << '\n'); - + Secondary->dump(); // FIXME: we are creating a register here only to initialize the updater Updater.init(LMU.createLaneMaskReg()); Updater.addReset(*Secondary->Block, GCNLaneMaskUpdater::ResetInMiddle); + LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << Secondary->printableName() << '\n'); for (WaveNode *Pred : Secondary->Predecessors) { if (!Pred->IsDivergent || Pred->Successors.size() == 1) continue; CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second; - Register PrimaryExec = PredInfo.PrimarySuccessorExec; + Register PrimaryExec = PredInfo.PrimarySuccessorExec; + LLVM_DEBUG(dbgs() << "Pred:" << Pred->Block->name() << "\nPrimaryExec:" << printReg(PrimaryExec,MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); MachineInstr *PrimaryExecDef; for (;;) { @@ -2041,6 +2140,10 @@ void ControlFlowRewriter::rewrite() { PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); } + LLVM_DEBUG(dbgs() << "PrimaryExecDef:"); + LLVM_DEBUG(PrimaryExecDef->dump()); + LLVM_DEBUG(dbgs() << "\n"); + // Rejoin = EXEC ^ PrimaryExec // // Fold immediately if PrimaryExec was obtained via XOR as well. 
@@ -2072,11 +2175,11 @@ void ControlFlowRewriter::rewrite() { BuildMI(*Pred->Block, Pred->Block->getFirstTerminator(), {}, TII.get(LMC.XorOpc), Rejoin) .addReg(LMC.ExecReg) - .addReg(PrimaryExec); + .addReg(PrimaryExec)->dump(); } LLVM_DEBUG( - dbgs() << " available @ " << Pred->printableName() << ": " + dbgs() << " Rejoin available @ " << Pred->printableName() << ": " << printReg(Rejoin, MRI.getTargetRegisterInfo(), 0, &MRI) << '\n'); @@ -2087,12 +2190,20 @@ void ControlFlowRewriter::rewrite() { BuildMI(*Secondary->Block, Secondary->Block->getFirstNonPHI(), {}, TII.get(LMC.OrOpc), LMC.ExecReg) .addReg(LMC.ExecReg) - .addReg(Rejoin); + .addReg(Rejoin)->dump(); + LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_" << Secondary->Block->name() << ".rejoin\n"); LLVM_DEBUG(Function.dump()); + LLVM_DEBUG(dbgs() << "CFG_END:" << Function.getName().str() << "_" << Secondary->Block->name() << ".rejoin\n"); + + } Updater.cleanup(); + + LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_clean\n"); + LLVM_DEBUG(Function.dump()); + LLVM_DEBUG(dbgs() << "CFG_END:" << Function.getName().str() << "_clean\n"); } namespace { diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index d7b19cbe745a8..8bc1e7a552d4c 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -13,7 +13,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" - +#include "llvm/Support/Debug.h" using namespace llvm; /// Check whether the register could be a lane-mask register. 
@@ -112,20 +112,36 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal); bool CurVal = false; bool CurConstant = isConstantLaneMask(CurReg, CurVal); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Printable destRegPrintable = printReg(DstReg , MRI.getTargetRegisterInfo(), 0, &MRI); + Printable curRegPrintable = printReg(CurReg , MRI.getTargetRegisterInfo(), 0, &MRI); + Printable prevRegPrintable = printReg(PrevReg , MRI.getTargetRegisterInfo(), 0, &MRI); + + dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks(" << MBB.name() << ",...):\n"; + dbgs() << "\t\t DstReg : BlockInfo.Merged : " << destRegPrintable << "\n"; + dbgs() << "\t\t PrevReg : Previous : " << prevRegPrintable << "\n"; + dbgs() << "\t\t CurReg : BlockInfo.Value : " << curRegPrintable << "\n"; + dbgs() << "\t\t Create instr : " << destRegPrintable << " = (" << prevRegPrintable << " & ~EXEC) | (" << curRegPrintable << " & EXEC) : \n"; + dbgs() << "\t\tPrevConstant:" << PrevConstant << " CurConstant:" << CurConstant << "\n"; + dbgs() << "\t\tPrevVal:" << PrevVal << " CurVal:" << CurVal << "\n"; + assert(PrevReg || !accumulating); - if (PrevConstant && CurConstant) { + if (PrevConstant && CurConstant) {// is wave wide constant? if (PrevVal == CurVal) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); + dbgs() << "\t "; + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg)->dump(); } else if (CurVal) { // If PrevReg is undef, prefer to propagate a full constant. + dbgs() << "\t "; BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) - .addReg(PrevReg ? LMC.ExecReg : CurReg); + .addReg(PrevReg ? 
LMC.ExecReg : CurReg)->dump(); } else { + dbgs() << "\t "; BuildMI(MBB, I, DL, TII->get(LMC.XorOpc), DstReg) .addReg(LMC.ExecReg) - .addImm(-1); + .addImm(-1)->dump(); } return; } @@ -139,21 +155,26 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, PrevMaskedReg = PrevReg; } else { PrevMaskedReg = createLaneMaskReg(); + dbgs() << "\t "; PrevMaskedBuilt = BuildMI(MBB, I, DL, TII->get(LMC.AndN2Opc), PrevMaskedReg) .addReg(PrevReg) .addReg(LMC.ExecReg); + PrevMaskedBuilt->dump(); } } if (!CurConstant) { - if ((PrevConstant && PrevVal) || - (LMA && LMA->isSubsetOfExec(CurReg, MBB))) { + bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB); + dbgs() << "isSubsetOfExec(" << printReg(CurReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : " << isCurRegSubsetOfExec << "\n"; + if ((PrevConstant && PrevVal) || isCurRegSubsetOfExec) { CurMaskedReg = CurReg; } else { CurMaskedReg = createLaneMaskReg(); + dbgs() << "\t "; CurMaskedBuilt = BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), CurMaskedReg) .addReg(CurReg) .addReg(LMC.ExecReg); + CurMaskedBuilt->dump(); } } @@ -163,24 +184,33 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, if (PrevConstant && !PrevVal) { if (CurMaskedBuilt) { CurMaskedBuilt->getOperand(0).setReg(DstReg); + dbgs() << "\t "; + CurMaskedBuilt->dump(); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurMaskedReg); + dbgs() << "\t "; + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurMaskedReg)->dump(); } } else if (CurConstant && !CurVal) { if (PrevMaskedBuilt) { PrevMaskedBuilt->getOperand(0).setReg(DstReg); + dbgs() << "\t "; + PrevMaskedBuilt->dump(); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(PrevMaskedReg); + dbgs() << "\t "; + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(PrevMaskedReg)->dump(); } } else if (PrevConstant && PrevVal) { + dbgs() << "\t "; BuildMI(MBB, I, DL, TII->get(LMC.OrN2Opc), DstReg) 
.addReg(CurMaskedReg) - .addReg(LMC.ExecReg); + .addReg(LMC.ExecReg)->dump(); } else { + dbgs() << "\t "; BuildMI(MBB, I, DL, TII->get(LMC.OrOpc), DstReg) .addReg(PrevMaskedReg) - .addReg(CurMaskedReg ? CurMaskedReg : LMC.ExecReg); + .addReg(CurMaskedReg ? CurMaskedReg : LMC.ExecReg)->dump(); } + dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks() ends\n"; } /// Conservatively determine whether the \p Reg is a subset of EXEC for @@ -321,6 +351,7 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block, BlockIt = Blocks.end() - 1; } assert(!BlockIt->Value); + dbgs() << "GCNLaneMaskUpdater::addAvailable(" << Block.name() << "," << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; BlockIt->Value = Value; } @@ -328,9 +359,12 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block, /// Return the value in the middle of the block, i.e. before any change that /// was registered via \ref addAvailable. Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { + dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << ")\n"; if (!Processed) process(); - return SSAUpdater.GetValueInMiddleOfBlock(&Block); + Register reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); + dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + return reg; } /// Return the value at the end of the given block, i.e. after any change that @@ -340,27 +374,39 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { /// reset mode, then this value will be 0. You likely want /// \ref getPreReset instead. 
Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { + dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << ")\n"; if (!Processed) process(); - return SSAUpdater.GetValueAtEndOfBlock(&Block); + Register reg = SSAUpdater.GetValueAtEndOfBlock(&Block); + dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + return reg; } /// Return the value in \p Block after the value merge (if any). Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { + dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << ")\n"; if (!Processed) process(); - + Register reg = {}; auto BlockIt = findBlockInfo(Block); if (BlockIt != Blocks.end()) { - if (BlockIt->Merged) - return BlockIt->Merged; - if (BlockIt->Flags & ResetInMiddle) - return ZeroReg; + if (BlockIt->Merged){ + reg = BlockIt->Merged; + dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning Merged.\n"; + return reg; + } + if (BlockIt->Flags & ResetInMiddle){ + reg = ZeroReg; + dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning ZeroReg.\n"; + return reg; + } } // We didn't merge anything in the block, but the block may still be // ResetAtEnd, in which case we need the pre-reset value. - return SSAUpdater.GetValueInMiddleOfBlock(&Block); + reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); + dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + return reg; } /// Determine whether \p MI defines and/or uses SCC. @@ -409,6 +455,7 @@ getSaluInsertionAtEnd(MachineBasicBlock &MBB) { /// Internal method to insert merge instructions. 
void GCNLaneMaskUpdater::process() { + dbgs() << "\n\tGCNLaneMaskUpdater::process() begins\n"; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); const SIInstrInfo *TII = LMU.function()->getSubtarget().getInstrInfo(); @@ -421,29 +468,50 @@ void GCNLaneMaskUpdater::process() { TII->get(LMU.getLaneMaskConsts().MovOpc), ZeroReg) .addImm(0); } + dbgs() << "\tZeroReg:" << printReg(ZeroReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + dbgs() << "\n\tAdding available values:\n"; // Add available values. for (BlockInfo &Info : Blocks) { + dbgs() << "\tAdd avail value for BlockInfo:" << Info.Block->name() << "\n\t"; assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); - if (Info.Value) + if (Info.Value){ Info.Merged = LMU.createLaneMaskReg(); + dbgs() << "creating Info.Merged:" << printReg(Info.Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n\t"; + } + + Info.dump(MRI); + //Info.Value and not ResetAtEnd, then Info.Merged, else ZeroReg + Register val = (Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg; + dbgs() << "\t\t(Info.Value && !(Info.Flags & ResetAtEnd)) : " << (Info.Value && !(Info.Flags & ResetAtEnd)) << " => "; + if((Info.Value && !(Info.Flags & ResetAtEnd))) + dbgs() << "Info.Merged\n"; + else + dbgs() << "ZeroReg\n"; + SSAUpdater.AddAvailableValue(Info.Block,val); + dbgs() << "\n"; - SSAUpdater.AddAvailableValue( - Info.Block, - (Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg); } - if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry)) + if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry)){ + dbgs() << "\tAdd avail value for Entry block : ZeroReg\n"; SSAUpdater.AddAvailableValue(&Entry, ZeroReg); + } + + dbgs() << "\n\tMachineSSAUpdater ready, begin merging\n"; + // Once the SSA updater is ready, we can fill in all merge code, relying // on the SSA updater to insert required PHIs. 
for (BlockInfo &Info : Blocks) { if (!Info.Value) continue; - + + dbgs() << "\tmerge "; + Info.dump(MRI); + dbgs() << "\n"; // Determine the "previous" value, if any. Register Previous; if (Info.Block != &LMU.function()->front() && @@ -460,8 +528,13 @@ void GCNLaneMaskUpdater::process() { } } } else { - if (Accumulating) + dbgs() << "\tEither one of the following 2 conds are true:\n"; + dbgs() << "\tInfo.Block == &LMU.function()->front():" << (Info.Block == &LMU.function()->front()) << "\n"; + dbgs() << "\tInfo.Flags & ResetInMiddle:" << (Info.Flags & ResetInMiddle) << "\n"; + if (Accumulating){ Previous = ZeroReg; + dbgs() << "\tBlock:" << Info.Block->name() << " Previous is ZeroReg:" << printReg(Previous , MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + } } // Insert merge logic. @@ -471,16 +544,23 @@ void GCNLaneMaskUpdater::process() { if (Info.Flags & ResetAtEnd) { MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); + dbgs() << "\tmergeInstr:"; + mergeInstr->dump(); + dbgs() << "\n"; if (mergeInstr->getOpcode() == AMDGPU::COPY && mergeInstr->getOperand(1).getReg().isVirtual()) { assert(MRI.use_empty(Info.Merged)); Info.Merged = mergeInstr->getOperand(1).getReg(); + dbgs() << "\tset Merged:" << printReg(Info.Merged , MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n"; + dbgs() << "\tErase mergeInstr\n"; mergeInstr->eraseFromParent(); } } } Processed = true; + dbgs() << "GCNLaneMaskUpdater::process() ends\n"; + } /// Find a block in the \ref Blocks structure. 
diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index f4419f139d92c..2903f93fd98e1 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/Support/Debug.h" namespace llvm { @@ -106,7 +107,7 @@ class GCNLaneMaskUpdater { GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; MachineSSAUpdater SSAUpdater; - + MachineRegisterInfo &MRI; bool Accumulating = false; bool Processed = false; @@ -118,6 +119,17 @@ class GCNLaneMaskUpdater { Register Merged; explicit BlockInfo(MachineBasicBlock *Block) : Block(Block) {} + + void dump(const MachineRegisterInfo &MRI) const { /* Debug helper: prints Block's name, the Value and Merged registers (formatted via printReg) and whichever of ResetAtEnd / ResetInMiddle are set in Flags to dbgs(). Const-qualified because it only reads state; MRI is used solely for register printing. */ + dbgs() << "BlockInfo{"; + dbgs() << " Block:" << Block->name() << ","; + dbgs() << " Value:" << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; + dbgs() << " Merged:" << printReg(Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; + dbgs() << " Flags:"; + if(Flags & ResetAtEnd) dbgs() << "ResetAtEnd,"; + if(Flags & ResetInMiddle) dbgs() << "ResetInMiddle,"; + dbgs() << "}\n"; + } }; SmallVector Blocks; @@ -126,7 +138,7 @@ class GCNLaneMaskUpdater { DenseSet PotentiallyDead; public: - GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), SSAUpdater(MF) {} + GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), SSAUpdater(MF), MRI(MF.getRegInfo()) {} void setLaneMaskAnalysis(GCNLaneMaskAnalysis *Analysis) { LMA = Analysis; }