diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index e13147e9461eb..29699962d324c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -16,6 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUIGroupLP.h"
+#include "GCNSchedStrategy.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -833,6 +834,8 @@ class IGLPStrategy {
 
   const SIInstrInfo *TII;
 
+  const MachineInstr *IGLPOptMI;
+
 public:
   /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
   virtual bool applyIGLPStrategy(
@@ -846,8 +849,9 @@ class IGLPStrategy {
 
   bool IsBottomUp = true;
 
-  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : DAG(DAG), TII(TII) {}
+  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+               const MachineInstr *IGLPOptMI)
+      : DAG(DAG), TII(TII), IGLPOptMI(IGLPOptMI) {}
 
   virtual ~IGLPStrategy() = default;
 };
@@ -865,8 +869,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                   const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = true;
   }
 };
@@ -899,31 +904,36 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
 class MFMAExpInterleaveOpt final : public IGLPStrategy {
 private:
   // The count of TRANS SUs involved in the interleaved pipeline
-  static unsigned TransPipeCount;
+  unsigned TransPipeCount = 0;
   // The count of MFMA SUs involved in the interleaved pipeline
-  static unsigned MFMAPipeCount;
+  unsigned MFMAPipeCount = 0;
   // The count of Add SUs involved in the interleaved pipeline
-  static unsigned AddPipeCount;
+  unsigned AddPipeCount = 0;
   // The number of transitive MFMA successors for each TRANS SU
-  static unsigned MFMAEnablement;
+  unsigned MFMAEnablement = 0;
   // The number of transitive TRANS predecessors for each MFMA SU
-  static unsigned ExpRequirement;
+  unsigned ExpRequirement = 0;
   // The count of independent "chains" of MFMA instructions in the pipeline
-  static unsigned MFMAChains;
+  unsigned MFMAChains = 0;
   // The length of each independent "chain" of MFMA instructions
-  static unsigned MFMAChainLength;
+  unsigned MFMAChainLength = 0;
   // Whether or not the pipeline has V_CVT instructions
-  static bool HasCvt;
+  bool HasCvt = false;
   // Whether or not there are instructions between the TRANS instruction and
   // V_CVT
-  static bool HasChainBetweenCvt;
+  bool HasChainBetweenCvt = false;
   // The first occuring DS_READ which feeds an MFMA chain
-  static std::optional<unsigned> FirstPipeDSR;
+  std::optional<unsigned> FirstPipeDSR = std::nullopt;
   // The MFMAPipe SUs with no MFMA predecessors
   SmallVector<SUnit *, 4> MFMAChainSeeds;
-  // Compute the heuristics for the pipeline, returning whether or not the DAG
-  // is well formatted for the mutation
-  bool analyzeDAG(const SIInstrInfo *TII);
+  bool analyzeDAG(const SIInstrInfo *TII, AMDGPU::SchedulingPhase Phase);
+  bool computeDAGAnalysis(const SIInstrInfo *TII,
+                          SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+  void initializeScalarsFromCache(
+      const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+  void initializePointersFromCache(
+      const GCNScheduleDAGMILive::IGLPExpInterleavePointerCache &PtrCache);
+  bool AnalysisResult = false; // Outcome of the most recent analyzeDAG().
 
   /// Whether or not the instruction is a transitive predecessor of an MFMA
   /// instruction
@@ -1328,29 +1338,22 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                            AMDGPU::SchedulingPhase Phase) override;
 
-  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                       const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = false;
   }
 };
 
-unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
-unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
-bool MFMAExpInterleaveOpt::HasCvt = false;
-bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
-std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
-
-bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
+bool MFMAExpInterleaveOpt::computeDAGAnalysis(
+    const SIInstrInfo *TII,
+    SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
   SmallVector<SUnit *, 10> ExpPipeCands;
   SmallVector<SUnit *, 10> MFMAPipeCands;
   SmallVector<SUnit *, 10> MFMAPipeSUs;
   SmallVector<SUnit *, 10> PackSUs;
   SmallVector<SUnit *, 10> CvtSUs;
+  const MachineInstr *FirstPipeDSRInstr = nullptr;
 
   auto isBitPack = [](unsigned Opc) {
     return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
@@ -1367,12 +1370,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
     auto Opc = SU.getInstr()->getOpcode();
     if (TII->isTRANS(Opc)) {
       // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+      // FIXME: This heuristic needs improvement/clarification!
+      // In general, the pipeline seems to look like this:
+      //   fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
+      //   (with potential arithmetic between exp and cvt)
+      //   see
+      //   https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
       if (SU.Succs.size() >= 7)
         continue;
-      for (auto &Succ : SU.Succs) {
-        if (Succ.getSUnit()->Succs.size() >= 7)
-          continue;
-      }
       ExpPipeCands.push_back(&SU);
     }
 
@@ -1457,6 +1462,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
     }
   }
 
+  MFMAChainSeeds.clear();
   MFMAChains = 0;
   for (auto &MFMAPipeSU : MFMAPipeSUs) {
     if (is_contained(MFMAChainSeeds, MFMAPipeSU))
@@ -1474,8 +1480,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
 
   for (auto Pred : MFMAChainSeeds[0]->Preds) {
     if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
-        Pred.getSUnit()->getInstr()->mayLoad())
+        Pred.getSUnit()->getInstr()->mayLoad()) {
       FirstPipeDSR = Pred.getSUnit()->NodeNum;
+      FirstPipeDSRInstr = Pred.getSUnit()->getInstr();
+    }
   }
 
   MFMAChainLength = MFMAPipeCount / MFMAChains;
@@ -1519,20 +1527,93 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
       });
 
   ExpRequirement *= PackPredCount;
+
+  Cache.TransPipeCount = TransPipeCount;
+  Cache.MFMAPipeCount = MFMAPipeCount;
+  Cache.AddPipeCount = AddPipeCount;
+  Cache.MFMAEnablement = MFMAEnablement;
+  Cache.ExpRequirement = ExpRequirement;
+  Cache.MFMAChains = MFMAChains;
+  Cache.MFMAChainLength = MFMAChainLength;
+  Cache.HasCvt = HasCvt;
+  Cache.HasChainBetweenCvt = HasChainBetweenCvt;
+  Cache.AnalysisResult = true;
+
+  GCNScheduleDAGMILive *GCNDAG = static_cast<GCNScheduleDAGMILive *>(DAG);
+  GCNScheduleDAGMILive::IGLPExpInterleavePointerCache PtrCache;
+  PtrCache.FirstPipeDSRInstr = FirstPipeDSRInstr;
+  PtrCache.MFMAChainSeedInstrs.reserve(MFMAChainSeeds.size());
+  for (SUnit *Seed : MFMAChainSeeds)
+    PtrCache.MFMAChainSeedInstrs.push_back(Seed->getInstr());
+  GCNDAG->setIGLPExpInterleavePointerCache(IGLPOptMI, PtrCache);
+
   return true;
 }
 
+void MFMAExpInterleaveOpt::initializeScalarsFromCache(
+    const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
+  TransPipeCount = Cache.TransPipeCount;
+  MFMAPipeCount = Cache.MFMAPipeCount;
+  AddPipeCount = Cache.AddPipeCount;
+  MFMAEnablement = Cache.MFMAEnablement;
+  ExpRequirement = Cache.ExpRequirement;
+  MFMAChains = Cache.MFMAChains;
+  MFMAChainLength = Cache.MFMAChainLength;
+  HasCvt = Cache.HasCvt;
+  HasChainBetweenCvt = Cache.HasChainBetweenCvt;
+  AnalysisResult = Cache.AnalysisResult;
+}
+
+void MFMAExpInterleaveOpt::initializePointersFromCache(
+    const GCNScheduleDAGMILive::IGLPExpInterleavePointerCache &PtrCache) {
+  if (PtrCache.FirstPipeDSRInstr) {
+    SUnit *SU =
+        DAG->getSUnit(const_cast<MachineInstr *>(PtrCache.FirstPipeDSRInstr));
+    assert(SU && "FirstPipeDSRInstr instruction not found in DAG");
+    FirstPipeDSR = SU->NodeNum;
+  }
+  MFMAChainSeeds.clear();
+  for (const MachineInstr *MI : PtrCache.MFMAChainSeedInstrs) {
+    SUnit *SeedSU = DAG->getSUnit(const_cast<MachineInstr *>(MI));
+    assert(SeedSU && "MFMAChainSeed instruction not found in DAG");
+    MFMAChainSeeds.push_back(SeedSU);
+  }
+}
+
+// Load the cached analysis for IGLPOptMI, or compute and cache it (pre-RA).
+bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII,
+                                      AMDGPU::SchedulingPhase Phase) {
+  SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
+
+  if (const auto *Cache = MFI.getMFMAExpInterleaveCache(IGLPOptMI)) {
+    initializeScalarsFromCache(*Cache);
+    // SUnit data exists only for a successful analysis and is pre-RA only.
+    if (!IsPostRA && AnalysisResult) {
+      auto *GCNDAG = static_cast<GCNScheduleDAGMILive *>(DAG);
+      const auto *PtrCache =
+          GCNDAG->getIGLPExpInterleavePointerCache(IGLPOptMI);
+      assert(PtrCache &&
+             "Pre-RA phase expected pointer cache in GCNScheduleDAGMILive");
+      initializePointersFromCache(*PtrCache);
+    }
+    return AnalysisResult;
+  }
+
+  assert(!IsPostRA && "PostRA phase not expected to require analyzing DAG");
+  SIMachineFunctionInfo::MFMAExpInterleaveCache Cache;
+  AnalysisResult = computeDAGAnalysis(TII, Cache);
+  // Cache failures too (Cache.AnalysisResult is still false for them).
+  MFI.setMFMAExpInterleaveCache(IGLPOptMI, Cache);
+  return AnalysisResult;
+}
+
 bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                                AMDGPU::SchedulingPhase Phase) {
   const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  if (Phase != AMDGPU::SchedulingPhase::PostRA)
-    MFMAChainSeeds.clear();
-  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
-    return false;
-
-  return true;
+  return analyzeDAG(TII, Phase);
 }
 
 bool MFMAExpInterleaveOpt::applyIGLPStrategy(
@@ -1540,6 +1621,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
     AMDGPU::SchedulingPhase Phase) {
 
+  assert(AnalysisResult && "no or failed DAG analysis");
+
   bool IsSmallKernelType =
       MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
   bool IsLargeKernelType =
@@ -1559,18 +1642,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
   unsigned CurrMFMAForTransPosition = 0;
 
   auto incrementTransPosition = [&MFMAChain, &PositionInChain,
-                                 &CurrMFMAForTransPosition]() {
+                                 &CurrMFMAForTransPosition, this]() {
     CurrMFMAForTransPosition += MFMAEnablement;
     PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
     MFMAChain = CurrMFMAForTransPosition % MFMAChains;
   };
 
-  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
     return (TempMFMAForTrans / MFMAChains);
   };
 
-  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
     return TempMFMAForTrans % MFMAChains;
   };
@@ -1580,7 +1663,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
   unsigned PositionInChainForMFMA = 0;
 
   auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
-                                &PositionInChainForMFMA]() {
+                                &PositionInChainForMFMA, this]() {
     ++CurrMFMAPosition;
     MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
     PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
@@ -1838,8 +1921,9 @@ class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                             const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = true;
   }
 };
@@ -2065,25 +2149,36 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                             const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = false;
   }
 };
 
-static unsigned DSWCount = 0;
-static unsigned DSWWithPermCount = 0;
-static unsigned DSWWithSharedVMEMCount = 0;
-
 bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
     AMDGPU::SchedulingPhase Phase) {
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
+  unsigned DSWCount = 0;
+  unsigned DSWWithPermCount = 0;
+  unsigned DSWWithSharedVMEMCount = 0;
 
   bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
 
+  if (!IsInitial) {
+    const SIMachineFunctionInfo &MFI =
+        *DAG->MF.getInfo<SIMachineFunctionInfo>();
+    const SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache *Cache =
+        MFI.getMFMASmallGemmSingleWaveCache(IGLPOptMI);
+    assert(Cache && "no cache found");
+    DSWCount = Cache->DSWCount;
+    DSWWithPermCount = Cache->DSWWithPermCount;
+    DSWWithSharedVMEMCount = Cache->DSWWithSharedVMEMCount;
+  }
+
   assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                          DSWWithSharedVMEMCount == 0)) &&
          "DSWCounters should be zero in pre-RA scheduling!");
@@ -2110,8 +2205,6 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
   if (IsInitial) {
     DSWWithPermCount = DSWithPerms.size();
-    auto *I = DSWithPerms.begin();
-    auto *E = DSWithPerms.end();
 
     // Get the count of DS_WRITES with V_PERM predecessors which
     // have loop carried dependencies (WAR) on the same VMEM_READs.
@@ -2121,10 +2214,10 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     // for every V_PERM pred of this DS_W.
     DenseMap<MachineInstr *, SUnit *> VMEMLookup;
     SmallVector<SUnit *, 6> Counted;
-    for (; I != E; I++) {
+    for (SUnit *DSWrite : DSWithPerms) {
       SUnit *Cand = nullptr;
       bool MissedAny = false;
-      for (auto &Pred : (*I)->Preds) {
+      for (auto &Pred : DSWrite->Preds) {
         if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
           continue;
 
@@ -2138,11 +2231,11 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
           if (MissedAny || !VMEMLookup.size()) {
             MissedAny = true;
-            VMEMLookup[MI] = *I;
+            VMEMLookup[MI] = DSWrite;
             continue;
           }
 
-          auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+          auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite);
           if (Inserted) {
             MissedAny = true;
             continue;
@@ -2158,9 +2251,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
       if (!MissedAny && Cand) {
         DSWWithSharedVMEMCount += 2;
         Counted.push_back(Cand);
-        Counted.push_back(*I);
+        Counted.push_back(DSWrite);
       }
     }
+
+    SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache Cache = {
+        DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount};
+    SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+    MFI.setMFMASmallGemmSingleWaveCache(IGLPOptMI, Cache);
   }
 
   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
@@ -2322,16 +2420,16 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
 static std::unique_ptr<IGLPStrategy>
 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
-                   const SIInstrInfo *TII) {
+                   const SIInstrInfo *TII, const MachineInstr *IGLPOptMI) {
   switch (ID) {
   case MFMASmallGemmOptID:
-    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+    return std::make_unique<MFMASmallGemmOpt>(DAG, TII, IGLPOptMI);
   case MFMASmallGemmSingleWaveOptID:
-    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII, IGLPOptMI);
   case MFMAExpInterleaveID:
-    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
+    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII, IGLPOptMI);
   case MFMAExpSimpleInterleaveID:
-    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII, IGLPOptMI);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -2709,7 +2807,7 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
   IGLPStrategyID StrategyID =
       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
-  auto S = createIGLPStrategy(StrategyID, DAG, TII);
+  auto S = createIGLPStrategy(StrategyID, DAG, TII, SU.getInstr());
   if (!S->shouldApplyStrategy(DAG, Phase))
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 99fd55db33285..cb07f2da679c1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -249,6 +249,24 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class ILPInitialScheduleStage;
   friend class RegionPressureMap;
 
+public:
+  struct IGLPExpInterleavePointerCache {
+    /// MFMA instructions in the interleave pipeline that have no MFMA
+    /// predecessors, i.e. the roots of independent MFMA chains.
+    SmallVector<const MachineInstr *, 4> MFMAChainSeedInstrs;
+    /// The first DS_READ that feeds into an MFMA chain seed.
+    const MachineInstr *FirstPipeDSRInstr = nullptr;
+  };
+
+private:
+  /// Pass-local cache of pointer-based IGLP analysis data, keyed by the
+  /// IGLP_OPT MachineInstr. This is safe because the DAG object lives for
+  /// the duration of the pre-RA scheduling pass and the MachineInstrs are
+  /// stable within that lifetime. Unlike SIMachineFunctionInfo caches, this
+  /// is never serialized.
+  DenseMap<const MachineInstr *, IGLPExpInterleavePointerCache>
+      IGLPExpInterleavePtrCaches;
+
   const GCNSubtarget &ST;
 
   SIMachineFunctionInfo &MFI;
@@ -317,6 +335,18 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
 
+  const IGLPExpInterleavePointerCache *
+  getIGLPExpInterleavePointerCache(const MachineInstr *MI) const {
+    auto It = IGLPExpInterleavePtrCaches.find(MI);
+    return It == IGLPExpInterleavePtrCaches.end() ? nullptr : &It->second;
+  }
+
+  void
+  setIGLPExpInterleavePointerCache(const MachineInstr *MI,
+                                   const IGLPExpInterleavePointerCache &Cache) {
+    IGLPExpInterleavePtrCaches[MI] = Cache;
+  }
+
   void schedule() override;
 
   void finalizeSchedule() override;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 32efae69b20c8..95867d442d278 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -772,6 +772,41 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
   auto SFI = MFI.getOptionalScavengeFI();
   if (SFI)
     ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
+
+  const DenseMap<const MachineInstr *,
+                 llvm::SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache>
+      &SmallGemmCaches = MFI.getMFMASmallGemmSingleWaveCaches();
+  for (DenseMap<const MachineInstr *,
+                llvm::SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache>::
+           const_iterator It = SmallGemmCaches.begin(),
+                          E = SmallGemmCaches.end();
+       It != E; ++It) {
+    const MachineInstr *MI = It->first;
+    IGLPSmallGemmSingleWaveCaches.push_back(
+        {static_cast<unsigned>(MI->getParent()->getNumber()),
+         static_cast<unsigned>(MI->getOperand(0).getImm()), It->second.DSWCount,
+         It->second.DSWWithPermCount, It->second.DSWWithSharedVMEMCount});
+  }
+
+  const DenseMap<const MachineInstr *,
+                 llvm::SIMachineFunctionInfo::MFMAExpInterleaveCache>
+      &ExpInterleaveCaches = MFI.getMFMAExpInterleaveCaches();
+  for (DenseMap<
+           const MachineInstr *,
+           llvm::SIMachineFunctionInfo::MFMAExpInterleaveCache>::const_iterator
+           It = ExpInterleaveCaches.begin(),
+           E = ExpInterleaveCaches.end();
+       It != E; ++It) {
+    const MachineInstr *MI = It->first;
+    IGLPExpInterleaveCaches.push_back(
+        {static_cast<unsigned>(MI->getParent()->getNumber()),
+         static_cast<unsigned>(MI->getOperand(0).getImm()),
+         It->second.TransPipeCount, It->second.MFMAPipeCount,
+         It->second.AddPipeCount, It->second.MFMAEnablement,
+         It->second.ExpRequirement, It->second.MFMAChains,
+         It->second.MFMAChainLength, It->second.HasCvt,
+         It->second.HasChainBetweenCvt, It->second.AnalysisResult});
+  }
 }
 
 void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
@@ -821,6 +856,48 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
   } else {
     ScavengeFI = std::nullopt;
   }
+
+  if (!YamlMFI.IGLPSmallGemmSingleWaveCaches.empty() ||
+      !YamlMFI.IGLPExpInterleaveCaches.empty()) {
+    for (const MachineBasicBlock &MBB : MF) {
+      for (const MachineInstr &MI : MBB) {
+        if (MI.getOpcode() != AMDGPU::IGLP_OPT)
+          continue;
+        unsigned MBBNum = MBB.getNumber();
+        unsigned StrategyId = static_cast<unsigned>(MI.getOperand(0).getImm());
+
+        for (const yaml::IGLPSmallGemmSingleWaveCacheEntry &E :
+             YamlMFI.IGLPSmallGemmSingleWaveCaches) {
+          if (E.MBBNum == MBBNum && E.StrategyId == StrategyId) {
+            MFMASmallGemmSingleWaveCache Cache = {
+                E.DSWCount, E.DSWWithPermCount, E.DSWWithSharedVMEMCount};
+            MFMASmallGemmSingleWaveCaches[&MI] = Cache;
+            break;
+          }
+        }
+
+        for (const yaml::IGLPExpInterleaveCacheEntry &E :
+             YamlMFI.IGLPExpInterleaveCaches) {
+          if (E.MBBNum == MBBNum && E.StrategyId == StrategyId) {
+            MFMAExpInterleaveCache Cache;
+            Cache.TransPipeCount = E.TransPipeCount;
+            Cache.MFMAPipeCount = E.MFMAPipeCount;
+            Cache.AddPipeCount = E.AddPipeCount;
+            Cache.MFMAEnablement = E.MFMAEnablement;
+            Cache.ExpRequirement = E.ExpRequirement;
+            Cache.MFMAChains = E.MFMAChains;
+            Cache.MFMAChainLength = E.MFMAChainLength;
+            Cache.HasCvt = E.HasCvt;
+            Cache.HasChainBetweenCvt = E.HasChainBetweenCvt;
+            Cache.AnalysisResult = E.AnalysisResult;
+            MFMAExpInterleaveCaches[&MI] = Cache;
+            break;
+          }
+        }
+      }
+    }
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 617862db8f506..7a2ac80c002d1 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -20,6 +20,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MIRYamlMapping.h"
@@ -29,6 +30,8 @@
 
 namespace llvm {
 
+class MachineInstr;
+
 class MachineFrameInfo;
 class MachineFunction;
 class SIMachineFunctionInfo;
@@ -259,6 +262,56 @@ template <> struct MappingTraits<SIMode> {
   }
 };
 
+struct IGLPSmallGemmSingleWaveCacheEntry {
+  unsigned MBBNum = 0;
+  unsigned StrategyId = 0;
+  unsigned DSWCount = 0;
+  unsigned DSWWithPermCount = 0;
+  unsigned DSWWithSharedVMEMCount = 0;
+};
+
+template <> struct MappingTraits<IGLPSmallGemmSingleWaveCacheEntry> {
+  static void mapping(IO &YamlIO, IGLPSmallGemmSingleWaveCacheEntry &E) {
+    YamlIO.mapRequired("mbb", E.MBBNum);
+    YamlIO.mapRequired("strategyId", E.StrategyId);
+    YamlIO.mapRequired("dswCount", E.DSWCount);
+    YamlIO.mapRequired("dswWithPermCount", E.DSWWithPermCount);
+    YamlIO.mapRequired("dswWithSharedVMEMCount", E.DSWWithSharedVMEMCount);
+  }
+};
+
+struct IGLPExpInterleaveCacheEntry {
+  unsigned MBBNum = 0;
+  unsigned StrategyId = 0;
+  unsigned TransPipeCount = 0;
+  unsigned MFMAPipeCount = 0;
+  unsigned AddPipeCount = 0;
+  unsigned MFMAEnablement = 0;
+  unsigned ExpRequirement = 0;
+  unsigned MFMAChains = 0;
+  unsigned MFMAChainLength = 0;
+  bool HasCvt = false;
+  bool HasChainBetweenCvt = false;
+  bool AnalysisResult = false;
+};
+
+template <> struct MappingTraits<IGLPExpInterleaveCacheEntry> {
+  static void mapping(IO &YamlIO, IGLPExpInterleaveCacheEntry &E) {
+    YamlIO.mapRequired("mbb", E.MBBNum);
+    YamlIO.mapRequired("strategyId", E.StrategyId);
+    YamlIO.mapRequired("transPipeCount", E.TransPipeCount);
+    YamlIO.mapRequired("mfmaPipeCount", E.MFMAPipeCount);
+    YamlIO.mapRequired("addPipeCount", E.AddPipeCount);
+    YamlIO.mapRequired("mfmaEnablement", E.MFMAEnablement);
+    YamlIO.mapRequired("expRequirement", E.ExpRequirement);
+    YamlIO.mapRequired("mfmaChains", E.MFMAChains);
+    YamlIO.mapRequired("mfmaChainLength", E.MFMAChainLength);
+    YamlIO.mapRequired("hasCvt", E.HasCvt);
+    YamlIO.mapRequired("hasChainBetweenCvt", E.HasChainBetweenCvt);
+    YamlIO.mapRequired("analysisResult", E.AnalysisResult);
+  }
+};
+
 struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   uint64_t ExplicitKernArgSize = 0;
   Align MaxKernArgAlign;
@@ -303,6 +356,9 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   bool HasInitWholeWave = false;
   bool IsWholeWaveFunction = false;
 
+  SmallVector<IGLPSmallGemmSingleWaveCacheEntry> IGLPSmallGemmSingleWaveCaches;
+  SmallVector<IGLPExpInterleaveCacheEntry> IGLPExpInterleaveCaches;
+
   unsigned DynamicVGPRBlockSize = 0;
   unsigned ScratchReservedForDynamicVGPRs = 0;
 
@@ -360,6 +416,9 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
     YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
+    YamlIO.mapOptional("iglpSmallGemmSingleWaveCaches",
+                       MFI.IGLPSmallGemmSingleWaveCaches);
+    YamlIO.mapOptional("iglpExpInterleaveCaches", MFI.IGLPExpInterleaveCaches);
     YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
     YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
                        MFI.ScratchReservedForDynamicVGPRs, 0);
@@ -550,6 +609,25 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     bool IsDead = false;
   };
 
+  struct MFMASmallGemmSingleWaveCache {
+    unsigned DSWCount = 0;
+    unsigned DSWWithPermCount = 0;
+    unsigned DSWWithSharedVMEMCount = 0;
+  };
+
+  struct MFMAExpInterleaveCache {
+    unsigned TransPipeCount = 0;
+    unsigned MFMAPipeCount = 0;
+    unsigned AddPipeCount = 0;
+    unsigned MFMAEnablement = 0;
+    unsigned ExpRequirement = 0;
+    unsigned MFMAChains = 0;
+    unsigned MFMAChainLength = 0;
+    bool HasCvt = false;
+    bool HasChainBetweenCvt = false;
+    bool AnalysisResult = false;
+  };
+
 private:
   // To track virtual VGPR + lane index for each subregister of the SGPR spilled
   // to frameindex key during SILowerSGPRSpills pass.
@@ -614,7 +692,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // load/store is enabled.
   IndexedMap<uint32_t, VGPRBlock2IndexFunctor> MaskForVGPRBlockOps;
 
-private:
+  DenseMap<const MachineInstr *, MFMASmallGemmSingleWaveCache>
+      MFMASmallGemmSingleWaveCaches;
+  DenseMap<const MachineInstr *, MFMAExpInterleaveCache>
+      MFMAExpInterleaveCaches;
+
   Register VGPRForAGPRCopy;
 
   bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
@@ -632,6 +714,41 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     VGPRForAGPRCopy = NewVGPRForAGPRCopy;
   }
 
+  /// \returns the cache recorded for \p MI, or nullptr if there is none.
+  const MFMASmallGemmSingleWaveCache *
+  getMFMASmallGemmSingleWaveCache(const MachineInstr *MI) const {
+    auto It = MFMASmallGemmSingleWaveCaches.find(MI);
+    return It != MFMASmallGemmSingleWaveCaches.end() ? &It->second : nullptr;
+  }
+
+  void
+  setMFMASmallGemmSingleWaveCache(const MachineInstr *MI,
+                                  const MFMASmallGemmSingleWaveCache &Cache) {
+    MFMASmallGemmSingleWaveCaches[MI] = Cache;
+  }
+
+  /// \returns the cache recorded for \p MI, or nullptr if there is none.
+  const MFMAExpInterleaveCache *
+  getMFMAExpInterleaveCache(const MachineInstr *MI) const {
+    auto It = MFMAExpInterleaveCaches.find(MI);
+    return It != MFMAExpInterleaveCaches.end() ? &It->second : nullptr;
+  }
+
+  void setMFMAExpInterleaveCache(const MachineInstr *MI,
+                                 const MFMAExpInterleaveCache &Cache) {
+    MFMAExpInterleaveCaches[MI] = Cache;
+  }
+
+  const DenseMap<const MachineInstr *, MFMASmallGemmSingleWaveCache> &
+  getMFMASmallGemmSingleWaveCaches() const {
+    return MFMASmallGemmSingleWaveCaches;
+  }
+
+  const DenseMap<const MachineInstr *, MFMAExpInterleaveCache> &
+  getMFMAExpInterleaveCaches() const {
+    return MFMAExpInterleaveCaches;
+  }
+
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const;
 
   void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) {
@@ -647,7 +764,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return MaskForVGPRBlockOps.inBounds(RegisterBlock);
   }
 
-public:
   SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default;
   SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI);
 
@@ -1229,4 +1345,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
 } // end namespace llvm
 
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::IGLPSmallGemmSingleWaveCacheEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::IGLPExpInterleaveCacheEntry)
+
 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir b/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir
new file mode 100644
index 0000000000000..501eb308ffc48
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir
@@ -0,0 +1,85 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=none -o - %s | FileCheck -check-prefix=ROUNDTRIP %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=postmisched -o - %s | FileCheck -check-prefix=POSTRA %s
+
+# Test that IGLP cache data serializes/deserializes through MIR and is
+# available for PostRA scheduling.
+
+# Verify round-trip of the serialized cache fields.
+# ROUNDTRIP: iglpSmallGemmSingleWaveCaches:
+# ROUNDTRIP-NEXT: - mbb:             1
+# ROUNDTRIP-NEXT:   strategyId:      1
+# ROUNDTRIP-NEXT:   dswCount:        6
+# ROUNDTRIP-NEXT:   dswWithPermCount: 2
+# ROUNDTRIP-NEXT:   dswWithSharedVMEMCount: 0
+# ROUNDTRIP: iglpExpInterleaveCaches:
+# ROUNDTRIP-NEXT: - mbb:             0
+# ROUNDTRIP-NEXT:   strategyId:      2
+# ROUNDTRIP-NEXT:   transPipeCount:  32
+# ROUNDTRIP-NEXT:   mfmaPipeCount:   40
+# ROUNDTRIP-NEXT:   addPipeCount:    33
+# ROUNDTRIP-NEXT:   mfmaEnablement:  2
+# ROUNDTRIP-NEXT:   expRequirement:  4
+# ROUNDTRIP-NEXT:   mfmaChains:      2
+# ROUNDTRIP-NEXT:   mfmaChainLength: 20
+# ROUNDTRIP-NEXT:   hasCvt:          true
+# ROUNDTRIP-NEXT:   hasChainBetweenCvt: false
+# ROUNDTRIP-NEXT:   analysisResult:  true
+
+# Verify PostRA scheduling doesn't crash and passes through the IGLP_OPTs.
+# POSTRA: IGLP_OPT 2
+# POSTRA: V_MFMA
+# POSTRA: IGLP_OPT 1
+--- |
+  define amdgpu_kernel void @iglp_cache_test() #0 { ret void }
+
+  attributes #0 = { "amdgpu-flat-work-group-size"="256,256" }
+...
+---
+name:            iglp_cache_test
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+machineFunctionInfo:
+  isEntryFunction: false
+  occupancy:     5
+  iglpExpInterleaveCaches:
+    - mbb:             0
+      strategyId:      2
+      transPipeCount:  32
+      mfmaPipeCount:   40
+      addPipeCount:    33
+      mfmaEnablement:  2
+      expRequirement:  4
+      mfmaChains:      2
+      mfmaChainLength: 20
+      hasCvt:          true
+      hasChainBetweenCvt: false
+      analysisResult:  true
+  iglpSmallGemmSingleWaveCaches:
+    - mbb:             1
+      strategyId:      1
+      dswCount:        6
+      dswWithPermCount: 2
+      dswWithSharedVMEMCount: 0
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr36_vgpr37, $vgpr40_vgpr41, $vgpr44_vgpr45, $vgpr38_vgpr39, $vgpr42_vgpr43
+
+    IGLP_OPT 2
+    $vgpr50 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr51 = V_EXP_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    $vgpr52 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr36_vgpr37, $vgpr40_vgpr41, 0, 0, 0, 0, implicit $mode, implicit $exec
+    early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr44_vgpr45, $vgpr40_vgpr41, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    liveins: $vgpr0, $vgpr1
+
+    IGLP_OPT 1
+    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index a10c99070d8e1..64265f6f9a505 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -285,6 +285,34 @@ entry:
   ret void
 }
 
+; Regression test: compiling this function after test_iglp_opt_rev_mfma_gemm
+; used to fail with:
+; > Assertion `(!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+; > DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA
+; > scheduling!"' failed.
+; The counters were previously global static variables which weren't reset.
+define amdgpu_kernel void @test_after_test_iglp_opt_rev_mfma_gemm(ptr %src, ptr addrspace(3) %dst) {
+; GCN-LABEL: test_after_test_iglp_opt_rev_mfma_gemm:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    ; iglp_opt mask(0x00000001)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    ds_write_b8 v1, v0
+; GCN-NEXT:    s_endpgm
+entry:
+  %a = load i1, ptr %src, align 1
+  call void @llvm.amdgcn.iglp.opt(i32 1)
+  store i1 %a, ptr addrspace(3) %dst, align 1
+  ret void
+}
+
 define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_asm_sideeffect:
 ; GCN:       ; %bb.0: ; %entry