llvm · alex-t · Sep 13, 2024
diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu
@@ -10,7 +10,7 @@
 // GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
 // GFX90A-CAS-LABEL: _Z14atomic_add_casPf
 // GFX90A-CAS:  flat_atomic_cmpswap
-// GFX90A-CAS:  s_cbranch_execnz
+// GFX90A-CAS:  s_cbranch_scc1
 __device__ float atomic_add_cas(float *p) {
   return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
 }
diff --git a/llvm/include/llvm/Analysis/CFGPrinter.h b/llvm/include/llvm/Analysis/CFGPrinter.h
@@ -272,9 +272,11 @@ struct DOTGraphTraits<DOTFuncInfo *> : public DefaultDOTGraphTraits {
     unsigned OpNo = I.getSuccessorIndex();
     const Instruction *TI = Node->getTerminator();
     BasicBlock *SuccBB = TI->getSuccessor(OpNo);
-    auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB);
-    double WeightPercent = ((double)BranchProb.getNumerator()) /
-                           ((double)BranchProb.getDenominator());
+    // auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB);
+    // double WeightPercent = ((double)BranchProb.getNumerator()) /
+    //                        ((double)BranchProb.getDenominator());
+    double WeightPercent = 0.5;
+
     std::string TTAttr =
         formatv("tooltip=\"{0} -> {1}\\nProbability {2:P}\" ", getBBName(Node),
                 getBBName(SuccBB), WeightPercent);

diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -879,8 +879,7 @@ class MachineBasicBlock
   /// debug.  This is the correct point to insert copies at the beginning of a
   /// basic block. \p Reg is the register being used by a spill or defined for a
   /// restore/split during register allocation.
-  iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg = Register(),
-                                  bool SkipPseudoOp = true);
+  iterator SkipPHIsLabelsAndDebug(iterator I, bool SkipPseudoOp = true);
 
   /// Returns an iterator to the first terminator instruction of this basic
   /// block. If a terminator does not exist, it returns end().

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2058,8 +2058,7 @@ class TargetInstrInfo : public MCInstrInfo {
   /// other instructions shall be inserted before it. This can be implemented
   /// to prevent register allocator to insert spills for \p Reg before such
   /// instructions.
-  virtual bool isBasicBlockPrologue(const MachineInstr &MI,
-                                    Register Reg = Register()) const {
+  virtual bool isBasicBlockPrologue(const MachineInstr &MI) const {
     return false;
   }
 

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3098,7 +3098,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
   [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
-def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
+def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
   [IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // Represent unreachable in a divergent region.

diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -461,8 +461,7 @@ class StatepointState {
 
       if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) {
         RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad);
-        auto EHPadInsertPoint =
-            EHPad->SkipPHIsLabelsAndDebug(EHPad->begin(), Reg);
+        auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin());
         insertReloadBefore(Reg, EHPadInsertPoint, EHPad);
         LLVM_DEBUG(dbgs() << "...also reload at EHPad "
                           << printMBBReference(*EHPad) << "\n");

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -463,7 +463,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
   MachineBasicBlock::iterator MII;
   if (SrcVNI->isPHIDef())
-    MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin(), SrcReg);
+    MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin());
   else {
     MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
     assert(DefMI && "Defining instruction disappeared");

diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -223,13 +223,13 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
 
 MachineBasicBlock::iterator
 MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I,
-                                          Register Reg, bool SkipPseudoOp) {
+                                          bool SkipPseudoOp) {
   const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
 
   iterator E = end();
   while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
                     (SkipPseudoOp && I->isPseudoProbe()) ||
-                    TII->isBasicBlockPrologue(*I, Reg)))
+                    TII->isBasicBlockPrologue(*I)))
     ++I;
   // FIXME: This needs to change if we wish to bundle labels / dbg_values
   // inside the bundle.

diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
@@ -806,10 +806,8 @@ SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
     return Start;
   }
 
-  unsigned RegIdx = 0;
-  Register Reg = LIS.getInterval(Edit->get(RegIdx)).reg();
-  VNInfo *VNI = defFromParent(RegIdx, ParentVNI, Start, MBB,
-                              MBB.SkipPHIsLabelsAndDebug(MBB.begin(), Reg));
+  VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
+                              MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
   RegAssign.insert(Start, VNI->def, OpenIdx);
   LLVM_DEBUG(dump());
   return VNI->def;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1551,11 +1551,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
+bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
+    MachineInstr &MI) const {
   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
   // SelectionDAG uses for wave32 vs wave64.
   MachineBasicBlock *BB = MI.getParent();
-  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
+  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
       .add(MI.getOperand(1));
 
   Register Reg = MI.getOperand(1).getReg();
@@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     MachineInstr &I) const {
   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
   switch (IntrinsicID) {
-  case Intrinsic::amdgcn_end_cf:
-    return selectEndCfIntrinsic(I);
+  case Intrinsic::amdgcn_wave_reconverge:
+    return selectWaveReconvergeIntrinsic(I);
   case Intrinsic::amdgcn_ds_ordered_add:
   case Intrinsic::amdgcn_ds_ordered_swap:
     return selectDSOrderedIntrinsic(I, IntrinsicID);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -116,7 +116,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectReturnAddress(MachineInstr &I) const;
   bool selectG_INTRINSIC(MachineInstr &I) const;
 
-  bool selectEndCfIntrinsic(MachineInstr &MI) const;
+  bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
   const unsigned MovExecOpc =
       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-  const unsigned MovExecTermOpc =
-      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
 
   const unsigned XorTermOpc = Subtarget.isWave32() ?
     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
@@ -949,27 +947,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
 
   B.setInsertPt(*BodyBB, BodyBB->end());
 
+  Register LoopMask = MRI.createVirtualRegister(
+      TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   B.buildInstr(XorTermOpc)
-    .addDef(ExecReg)
-    .addReg(ExecReg)
-    .addReg(NewExec);
+    .addDef(LoopMask)
+	.addReg(ExecReg)
+	.addReg(NewExec);
 
   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
   // s_cbranch_scc0?
 
   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
-  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
+      .addReg(LoopMask)
+      .addReg(NewExec)
+      .addMBB(LoopBB);
 
   // Save the EXEC mask before the loop.
   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
     .addReg(ExecReg);
 
-  // Restore the EXEC mask after the loop.
-  B.setMBB(*RestoreExecBB);
-  B.buildInstr(MovExecTermOpc)
-    .addDef(ExecReg)
-    .addReg(SaveExecReg);
 
   // Set the insert point after the original instruction, so any new
   // instructions will be in the remainder.
@@ -4967,7 +4965,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
-    case Intrinsic::amdgcn_end_cf: {
+    case Intrinsic::amdgcn_wave_reconverge: {
       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;

diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -55,7 +56,7 @@ class SIAnnotateControlFlow {
   Function *Else;
   Function *IfBreak;
   Function *Loop;
-  Function *EndCf;
+  Function *WaveReconverge;
 
   DominatorTree *DT;
   StackVector Stack;
@@ -88,7 +89,7 @@ class SIAnnotateControlFlow {
 
   bool handleLoop(BranchInst *Term);
 
-  bool closeControlFlow(BasicBlock *BB);
+  bool tryWaveReconverge(BasicBlock *BB);
 
 public:
   SIAnnotateControlFlow(Module &M, const GCNSubtarget &ST, DominatorTree &DT,
@@ -123,7 +124,8 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
   IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
                                       { IntMask });
   Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
-  EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
+  WaveReconverge = Intrinsic::getDeclaration(
+      &M, Intrinsic::amdgcn_wave_reconverge, {IntMask});
 }
 
 /// Is the branch condition uniform or did the StructurizeCFG pass
@@ -185,8 +187,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
 
 /// Open a new "If" block
 bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
-  if (isUniform(Term))
-    return false;
 
   IRBuilder<> IRB(Term);
   Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -287,43 +287,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 }
 
 /// Close the last opened control flow
-bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
-  llvm::Loop *L = LI->getLoopFor(BB);
-
-  assert(Stack.back().first == BB);
+bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {
 
-  if (L && L->getHeader() == BB) {
-    // We can't insert an EndCF call into a loop header, because it will
-    // get executed on every iteration of the loop, when it should be
-    // executed only once before the loop.
-    SmallVector <BasicBlock *, 8> Latches;
-    L->getLoopLatches(Latches);
+  if (succ_empty(BB))
+    return false;
 
+  BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
+  if (Term->getNumSuccessors() == 1) {
+    // The current BBs single successor is a top of the stack. We need to
+    // reconverge over thaqt path.
+    BasicBlock *SingleSucc = *succ_begin(BB);
+    BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
+    if (isTopOfStack(SingleSucc)) {
+      Value *Exec = Stack.back().second;
+      IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
+    }
+  } else {
+    // We have a uniform conditional branch terminating the block.
+    // THis block may be the last in the Then path of the enclosing divergent
+    // IF.
+    if (!isUniform(Term))
+      // Divergent loop is going to be further processed in another place
+      return false;
+
+    for (auto Succ : Term->successors()) {
+      if (isTopOfStack(Succ)) {
+        // Just split to make a room for further WAVE_RECONVERGE insertion
     SmallVector<BasicBlock *, 2> Preds;
-    for (BasicBlock *Pred : predecessors(BB)) {
-      if (!is_contained(Latches, Pred))
-        Preds.push_back(Pred);
+        for (auto P : predecessors(Succ)) {
+          if (DT->dominates(BB, P))
+            Preds.push_back(P);
     }
-
-    BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
+        DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+        SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr,
                                 false);
   }
 
-  Value *Exec = popSaved();
-  BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
-  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
-    Instruction *ExecDef = cast<Instruction>(Exec);
-    BasicBlock *DefBB = ExecDef->getParent();
-    if (!DT->dominates(DefBB, BB)) {
-      // Split edge to make Def dominate Use
-      FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
     }
-    IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
-    // TODO: StructurizeCFG 'Flow' blocks have debug locations from the
-    // condition, for now just avoid copying these DebugLocs so that stepping
-    // out of the then/else block in a debugger doesn't step to the condition.
-    IRB.SetCurrentDebugLocation(DebugLoc());
-    IRB.CreateCall(EndCf, {Exec});
   }
 
   return true;
@@ -341,14 +341,18 @@ bool SIAnnotateControlFlow::run(Function &F) {
 
     if (!Term || Term->isUnconditional()) {
       if (isTopOfStack(BB))
-        Changed |= closeControlFlow(BB);
+        Stack.pop_back();
+      Changed |= tryWaveReconverge(BB);
 
       continue;
     }
 
     if (I.nodeVisited(Term->getSuccessor(1))) {
       if (isTopOfStack(BB))
-        Changed |= closeControlFlow(BB);
+        Stack.pop_back();
+      // Let's take care of uniform loop latch that may be closing the Then
+      // path of the enclosing divergent branch.
+      Changed |= tryWaveReconverge(BB);
 
       if (DT->dominates(Term->getSuccessor(1), BB))
         Changed |= handleLoop(Term);
@@ -363,9 +367,14 @@ bool SIAnnotateControlFlow::run(Function &F) {
         continue;
       }
 
-      Changed |= closeControlFlow(BB);
+      Stack.pop_back();
     }
 
+    if (isUniform(Term))
+      // Uniform conditional branch may be in the block that closes the Then
+      // path of the divergent conditional branch.
+      Changed |= tryWaveReconverge(BB);
+    else
     Changed |= openIf(Term);
   }