Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 110 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
///
//===----------------------------------------------------------------------===//

#include "AMDGPURewriteAGPRCopyMFMA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
Expand All @@ -30,6 +31,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
Expand All @@ -39,6 +41,72 @@ using namespace llvm;

#define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma"

namespace llvm {
namespace AMDGPU {
bool checkAGPRCopyMFMAJointDominance(
const MachineFunction &MF, const MachineDominatorTree &MDT,
const SmallVectorImpl<MachineInstr *> &Stores,
const SmallVectorImpl<MachineInstr *> &Loads, int Slot) {
SmallPtrSet<MachineBasicBlock *, 4> StoreBlocks;
for (MachineInstr *S : Stores)
if (MDT.isReachableFromEntry(S->getParent()))
StoreBlocks.insert(S->getParent());

if (StoreBlocks.empty())
return false;

// Compute blocks reachable from entry without passing through a store
// block.
SmallPtrSet<MachineBasicBlock *, 16> StoreFreeReachable;
SmallVector<MachineBasicBlock *, 16> Worklist;

MachineBasicBlock &EntryMBB = const_cast<MachineBasicBlock &>(MF.front());
Worklist.push_back(&EntryMBB);
StoreFreeReachable.insert(&EntryMBB);

while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
if (StoreBlocks.contains(MBB))
continue;

for (MachineBasicBlock *Succ : MBB->successors()) {
if (StoreFreeReachable.insert(Succ).second)
Worklist.push_back(Succ);
}
}

auto IsLoadJointlyDominatedByStores = [&](MachineInstr *LoadMI) -> bool {
MachineBasicBlock *LoadMBB = LoadMI->getParent();
if (!MDT.isReachableFromEntry(LoadMBB))
return true;

// Check if every path passed through a store block.
if (!StoreFreeReachable.contains(LoadMBB))
return true;

// Otherwise, there exists a path to this block that has not seen any
// store yet. We must ensure that within this block there is a store to
// this slot before the load.
for (MachineInstr &MI : *LoadMBB) {
if (&MI == LoadMI)
break;
if (MI.mayStore()) {
for (MachineOperand &MO : MI.operands()) {
if (MO.isFI() && MO.getIndex() == Slot)
return true;
}
}
}

return false;
};

return llvm::all_of(Loads, IsLoadJointlyDominatedByStores);
}

} // namespace AMDGPU
} // namespace llvm

namespace {

STATISTIC(NumMFMAsRewrittenToAGPR,
Expand All @@ -58,6 +126,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
LiveIntervals &LIS;
LiveStacks &LSS;
const RegisterClassInfo &RegClassInfo;
MachineDominatorTree &MDT;

bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs,
MCPhysReg PrefPhysReg) const;
Expand All @@ -66,10 +135,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
LiveStacks &LSS,
const RegisterClassInfo &RegClassInfo)
const RegisterClassInfo &RegClassInfo,
MachineDominatorTree &MDT)
: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo), MDT(MDT) {}

bool isRewriteCandidate(const MachineInstr &MI) const {
return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
Expand Down Expand Up @@ -515,6 +585,37 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
if (SpillReferences == SpillSlotReferences.end())
continue;

// For each spill reload, every path from entry to the reload must pass
// through at least one spill store to the same stack slot.
SmallVector<MachineInstr *, 4> Stores, Loads;
Stores.reserve(SpillReferences->second.size());
Loads.reserve(SpillReferences->second.size());
for (MachineInstr *MI : SpillReferences->second) {
if (MI->mayStore())
Stores.push_back(MI);
else if (MI->mayLoad())
Loads.push_back(MI);
}

SmallPtrSet<MachineBasicBlock *, 4> StoreBlocks;
for (MachineInstr *S : Stores)
if (MDT.isReachableFromEntry(S->getParent()))
StoreBlocks.insert(S->getParent());

if (StoreBlocks.empty()) {
LLVM_DEBUG(dbgs() << "Skipping " << printReg(Slot, &TRI)
<< ": no reachable stores\n");
continue;
}

if (!AMDGPU::checkAGPRCopyMFMAJointDominance(MF, MDT, Stores, Loads,
Slot)) {
LLVM_DEBUG(
dbgs() << "Skipping " << printReg(Slot, &TRI)
<< ": some reachable load not jointly dominated by stores\n");
continue;
}

const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);

LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
Expand Down Expand Up @@ -603,11 +704,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
AU.addRequired<VirtRegMapWrapperLegacy>();
AU.addRequired<LiveRegMatrixWrapperLegacy>();
AU.addRequired<LiveStacksWrapperLegacy>();
AU.addRequired<MachineDominatorTreeWrapperPass>();

AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<VirtRegMapWrapperLegacy>();
AU.addPreserved<LiveRegMatrixWrapperLegacy>();
AU.addPreserved<LiveStacksWrapperLegacy>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();

AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
Expand All @@ -622,6 +725,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
"AMDGPU Rewrite AGPR-Copy-MFMA", false, false)

Expand All @@ -641,7 +745,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
return Impl.run(MF);
}

Expand All @@ -652,10 +757,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
RegisterClassInfo RegClassInfo;
RegClassInfo.runOnMachineFunction(MF);

AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
if (!Impl.run(MF))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
Expand Down
32 changes: 32 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//===- AMDGPURewriteAGPRCopyMFMA.h ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Helper declarations for the AMDGPU AGPR-copy-MFMA rewrite pass.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H

#include "llvm/ADT/SmallVector.h"

namespace llvm {

class MachineFunction;
class MachineDominatorTree;
class MachineInstr;

namespace AMDGPU {

/// Returns true if every reload in \p Loads is jointly dominated by at least
/// one store in \p Stores to the same stack slot \p Slot, i.e. every path
/// from the function entry to a reload passes through a store to \p Slot, or
/// the reload's own block contains such a store before the reload.
/// Reloads in blocks unreachable from entry are vacuously considered
/// dominated; returns false if no store is reachable from entry at all.
bool checkAGPRCopyMFMAJointDominance(
    const MachineFunction &MF, const MachineDominatorTree &MDT,
    const SmallVectorImpl<MachineInstr *> &Stores,
    const SmallVectorImpl<MachineInstr *> &Loads, int Slot);

} // end namespace AMDGPU
} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
Loading