diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index e13147e9461eb..29699962d324c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -16,6 +16,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUIGroupLP.h" +#include "GCNSchedStrategy.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -833,6 +834,8 @@ class IGLPStrategy { const SIInstrInfo *TII; + const MachineInstr *IGLPOptMI; + public: /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. virtual bool applyIGLPStrategy( @@ -846,8 +849,9 @@ class IGLPStrategy { bool IsBottomUp = true; - IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : DAG(DAG), TII(TII) {} + IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII, + const MachineInstr *IGLPOptMI) + : DAG(DAG), TII(TII), IGLPOptMI(IGLPOptMI) {} virtual ~IGLPStrategy() = default; }; @@ -865,8 +869,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy { return true; } - MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) { + MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII, + const MachineInstr *IGLPOptMI) + : IGLPStrategy(DAG, TII, IGLPOptMI) { IsBottomUp = true; } }; @@ -899,31 +904,36 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( class MFMAExpInterleaveOpt final : public IGLPStrategy { private: // The count of TRANS SUs involved in the interleaved pipeline - static unsigned TransPipeCount; + unsigned TransPipeCount = 0; // The count of MFMA SUs involved in the interleaved pipeline - static unsigned MFMAPipeCount; + unsigned MFMAPipeCount = 0; // The count of Add SUs involved in the interleaved pipeline - static unsigned AddPipeCount; + unsigned AddPipeCount = 0; // The number of transitive MFMA successors for each TRANS SU - static unsigned MFMAEnablement; + 
unsigned MFMAEnablement = 0; // The number of transitive TRANS predecessors for each MFMA SU - static unsigned ExpRequirement; + unsigned ExpRequirement = 0; // The count of independent "chains" of MFMA instructions in the pipeline - static unsigned MFMAChains; + unsigned MFMAChains = 0; // The length of each independent "chain" of MFMA instructions - static unsigned MFMAChainLength; + unsigned MFMAChainLength = 0; // Whether or not the pipeline has V_CVT instructions - static bool HasCvt; + bool HasCvt = false; // Whether or not there are instructions between the TRANS instruction and // V_CVT - static bool HasChainBetweenCvt; + bool HasChainBetweenCvt = false; // The first occuring DS_READ which feeds an MFMA chain - static std::optional FirstPipeDSR; + std::optional FirstPipeDSR = std::nullopt; // The MFMAPipe SUs with no MFMA predecessors SmallVector MFMAChainSeeds; - // Compute the heuristics for the pipeline, returning whether or not the DAG - // is well formatted for the mutation - bool analyzeDAG(const SIInstrInfo *TII); + bool analyzeDAG(const SIInstrInfo *TII, AMDGPU::SchedulingPhase Phase); + bool computeDAGAnalysis(const SIInstrInfo *TII, + SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache); + void initializeScalarsFromCache( + const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache); + void initializePointersFromCache( + const GCNScheduleDAGMILive::IGLPExpInterleavePointerCache &PtrCache); + bool AnalysisResult = false; /// Whether or not the instruction is a transitive predecessor of an MFMA /// instruction @@ -1328,29 +1338,22 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy { bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, AMDGPU::SchedulingPhase Phase) override; - MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) { + MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII, + const MachineInstr *IGLPOptMI) + : IGLPStrategy(DAG, TII, IGLPOptMI) { IsBottomUp = false; } }; -unsigned 
MFMAExpInterleaveOpt::TransPipeCount = 0; -unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; -unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; -unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; -unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; -unsigned MFMAExpInterleaveOpt::MFMAChains = 0; -unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; -bool MFMAExpInterleaveOpt::HasCvt = false; -bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; -std::optional MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; - -bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { +bool MFMAExpInterleaveOpt::computeDAGAnalysis( + const SIInstrInfo *TII, + SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) { SmallVector ExpPipeCands; SmallVector MFMAPipeCands; SmallVector MFMAPipeSUs; SmallVector PackSUs; SmallVector CvtSUs; + const MachineInstr *FirstPipeDSRInstr = nullptr; auto isBitPack = [](unsigned Opc) { return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; @@ -1367,12 +1370,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { auto Opc = SU.getInstr()->getOpcode(); if (TII->isTRANS(Opc)) { // Avoid counting a potential bonus V_EXP which all the MFMA depend on + // FIXME: This heuristic needs improvement/clarification! 
+ // In general, the pipeline seems to look like this: + // fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16 + // (with potential arithmetic between exp and cvt) + // see + // https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378 if (SU.Succs.size() >= 7) continue; - for (auto &Succ : SU.Succs) { - if (Succ.getSUnit()->Succs.size() >= 7) - continue; - } ExpPipeCands.push_back(&SU); } @@ -1457,6 +1462,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { } } + MFMAChainSeeds.clear(); MFMAChains = 0; for (auto &MFMAPipeSU : MFMAPipeSUs) { if (is_contained(MFMAChainSeeds, MFMAPipeSU)) @@ -1474,8 +1480,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { for (auto Pred : MFMAChainSeeds[0]->Preds) { if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && - Pred.getSUnit()->getInstr()->mayLoad()) + Pred.getSUnit()->getInstr()->mayLoad()) { FirstPipeDSR = Pred.getSUnit()->NodeNum; + FirstPipeDSRInstr = Pred.getSUnit()->getInstr(); + } } MFMAChainLength = MFMAPipeCount / MFMAChains; @@ -1519,20 +1527,93 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { }); ExpRequirement *= PackPredCount; + + Cache.TransPipeCount = TransPipeCount; + Cache.MFMAPipeCount = MFMAPipeCount; + Cache.AddPipeCount = AddPipeCount; + Cache.MFMAEnablement = MFMAEnablement; + Cache.ExpRequirement = ExpRequirement; + Cache.MFMAChains = MFMAChains; + Cache.MFMAChainLength = MFMAChainLength; + Cache.HasCvt = HasCvt; + Cache.HasChainBetweenCvt = HasChainBetweenCvt; + Cache.AnalysisResult = true; + + GCNScheduleDAGMILive *GCNDAG = static_cast(DAG); + GCNScheduleDAGMILive::IGLPExpInterleavePointerCache PtrCache; + PtrCache.FirstPipeDSRInstr = FirstPipeDSRInstr; + PtrCache.MFMAChainSeedInstrs.reserve(MFMAChainSeeds.size()); + for (SUnit *Seed : MFMAChainSeeds) + PtrCache.MFMAChainSeedInstrs.push_back(Seed->getInstr()); + GCNDAG->setIGLPExpInterleavePointerCache(IGLPOptMI, PtrCache); + return true; } +void 
MFMAExpInterleaveOpt::initializeScalarsFromCache( + const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) { + TransPipeCount = Cache.TransPipeCount; + MFMAPipeCount = Cache.MFMAPipeCount; + AddPipeCount = Cache.AddPipeCount; + MFMAEnablement = Cache.MFMAEnablement; + ExpRequirement = Cache.ExpRequirement; + MFMAChains = Cache.MFMAChains; + MFMAChainLength = Cache.MFMAChainLength; + HasCvt = Cache.HasCvt; + HasChainBetweenCvt = Cache.HasChainBetweenCvt; + AnalysisResult = Cache.AnalysisResult; +} + +void MFMAExpInterleaveOpt::initializePointersFromCache( + const GCNScheduleDAGMILive::IGLPExpInterleavePointerCache &PtrCache) { + if (PtrCache.FirstPipeDSRInstr) { + SUnit *SU = + DAG->getSUnit(const_cast(PtrCache.FirstPipeDSRInstr)); + assert(SU && "FirstPipeDSRInstr instruction not found in DAG"); + FirstPipeDSR = SU->NodeNum; + } + MFMAChainSeeds.clear(); + for (const MachineInstr *MI : PtrCache.MFMAChainSeedInstrs) { + SUnit *SeedSU = DAG->getSUnit(const_cast(MI)); + assert(SeedSU && "MFMAChainSeed instruction not found in DAG"); + MFMAChainSeeds.push_back(SeedSU); + } +} + +bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII, + AMDGPU::SchedulingPhase Phase) { + SIMachineFunctionInfo &MFI = *DAG->MF.getInfo(); + bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; + + if (const SIMachineFunctionInfo::MFMAExpInterleaveCache *Cache = + MFI.getMFMAExpInterleaveCache(IGLPOptMI)) { + initializeScalarsFromCache(*Cache); + // A cached failure has no pointer cache (only stored on success); skip it. + if (!IsPostRA && AnalysisResult) { + GCNScheduleDAGMILive *GCNDAG = static_cast(DAG); + const GCNScheduleDAGMILive::IGLPExpInterleavePointerCache *PtrCache = + GCNDAG->getIGLPExpInterleavePointerCache(IGLPOptMI); + assert(PtrCache && + "Pre-RA phase expected pointer cache in GCNScheduleDAGMILive"); + initializePointersFromCache(*PtrCache); + } + return AnalysisResult; + } + + assert(!IsPostRA && "PostRA phase not expected to require analyzing DAG"); + SIMachineFunctionInfo::MFMAExpInterleaveCache Cache; + AnalysisResult = computeDAGAnalysis(TII, 
Cache); + // Cache failures too (Cache.AnalysisResult stays false) so PostRA gets a hit. + MFI.setMFMAExpInterleaveCache(IGLPOptMI, Cache); + return AnalysisResult; +} + bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, AMDGPU::SchedulingPhase Phase) { const GCNSubtarget &ST = DAG->MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - if (Phase != AMDGPU::SchedulingPhase::PostRA) - MFMAChainSeeds.clear(); - if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) - return false; - - return true; + return analyzeDAG(TII, Phase); } bool MFMAExpInterleaveOpt::applyIGLPStrategy( @@ -1540,6 +1621,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( DenseMap> &SyncedSchedGroups, AMDGPU::SchedulingPhase Phase) { + assert(AnalysisResult && "no or failed DAG analysis"); + bool IsSmallKernelType = MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; bool IsLargeKernelType = @@ -1559,18 +1642,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( unsigned CurrMFMAForTransPosition = 0; auto incrementTransPosition = [&MFMAChain, &PositionInChain, - &CurrMFMAForTransPosition]() { + &CurrMFMAForTransPosition, this]() { CurrMFMAForTransPosition += MFMAEnablement; PositionInChain = (CurrMFMAForTransPosition / MFMAChains); MFMAChain = CurrMFMAForTransPosition % MFMAChains; }; - auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { + auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() { auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; return (TempMFMAForTrans / MFMAChains); }; - auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { + auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() { auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; return TempMFMAForTrans % MFMAChains; }; @@ -1580,7 +1663,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( unsigned PositionInChainForMFMA = 0; auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, - &PositionInChainForMFMA]() { + &PositionInChainForMFMA, 
this]() { ++CurrMFMAPosition; MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; @@ -1838,8 +1921,9 @@ class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy { return true; } - MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) { + MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII, + const MachineInstr *IGLPOptMI) + : IGLPStrategy(DAG, TII, IGLPOptMI) { IsBottomUp = true; } }; @@ -2065,25 +2149,36 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { return true; } - MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) { + MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII, + const MachineInstr *IGLPOptMI) + : IGLPStrategy(DAG, TII, IGLPOptMI) { IsBottomUp = false; } }; -static unsigned DSWCount = 0; -static unsigned DSWWithPermCount = 0; -static unsigned DSWWithSharedVMEMCount = 0; - bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, AMDGPU::SchedulingPhase Phase) { unsigned MFMACount = 0; unsigned DSRCount = 0; + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; + if (!IsInitial) { + const SIMachineFunctionInfo &MFI = + *DAG->MF.getInfo(); + const SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache *Cache = + MFI.getMFMASmallGemmSingleWaveCache(IGLPOptMI); + assert(Cache && "no cache found"); + DSWCount = Cache->DSWCount; + DSWWithPermCount = Cache->DSWWithPermCount; + DSWWithSharedVMEMCount = Cache->DSWWithSharedVMEMCount; + } + assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA scheduling!"); @@ -2110,8 +2205,6 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( if (IsInitial) { DSWWithPermCount = 
DSWithPerms.size(); - auto *I = DSWithPerms.begin(); - auto *E = DSWithPerms.end(); // Get the count of DS_WRITES with V_PERM predecessors which // have loop carried dependencies (WAR) on the same VMEM_READs. @@ -2121,10 +2214,10 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // for every V_PERM pred of this DS_W. DenseMap VMEMLookup; SmallVector Counted; - for (; I != E; I++) { + for (SUnit *DSWrite : DSWithPerms) { SUnit *Cand = nullptr; bool MissedAny = false; - for (auto &Pred : (*I)->Preds) { + for (auto &Pred : DSWrite->Preds) { if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) continue; @@ -2138,11 +2231,11 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( if (MissedAny || !VMEMLookup.size()) { MissedAny = true; - VMEMLookup[MI] = *I; + VMEMLookup[MI] = DSWrite; continue; } - auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I); + auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite); if (Inserted) { MissedAny = true; continue; @@ -2158,9 +2251,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( if (!MissedAny && Cand) { DSWWithSharedVMEMCount += 2; Counted.push_back(Cand); - Counted.push_back(*I); + Counted.push_back(DSWrite); } } + + SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache Cache = { + DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount}; + SIMachineFunctionInfo &MFI = *DAG->MF.getInfo(); + MFI.setMFMASmallGemmSingleWaveCache(IGLPOptMI, Cache); } assert(DSWWithSharedVMEMCount <= DSWWithPermCount); @@ -2322,16 +2420,16 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( static std::unique_ptr createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, - const SIInstrInfo *TII) { + const SIInstrInfo *TII, const MachineInstr *IGLPOptMI) { switch (ID) { case MFMASmallGemmOptID: - return std::make_unique(DAG, TII); + return std::make_unique(DAG, TII, IGLPOptMI); case MFMASmallGemmSingleWaveOptID: - return std::make_unique(DAG, TII); + return std::make_unique(DAG, TII, IGLPOptMI); case 
MFMAExpInterleaveID: - return std::make_unique(DAG, TII); + return std::make_unique(DAG, TII, IGLPOptMI); case MFMAExpSimpleInterleaveID: - return std::make_unique(DAG, TII); + return std::make_unique(DAG, TII, IGLPOptMI); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -2709,7 +2807,7 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); - auto S = createIGLPStrategy(StrategyID, DAG, TII); + auto S = createIGLPStrategy(StrategyID, DAG, TII, SU.getInstr()); if (!S->shouldApplyStrategy(DAG, Phase)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 99fd55db33285..cb07f2da679c1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -249,6 +249,24 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class ILPInitialScheduleStage; friend class RegionPressureMap; +public: + struct IGLPExpInterleavePointerCache { + /// MFMA instructions in the interleave pipeline that have no MFMA + /// predecessors, i.e. the roots of independent MFMA chains. + SmallVector MFMAChainSeedInstrs; + /// The first DS_READ that feeds into an MFMA chain seed. + const MachineInstr *FirstPipeDSRInstr = nullptr; + }; + +private: + /// Pass-local cache of pointer-based IGLP analysis data, keyed by the + /// IGLP_OPT MachineInstr. This is safe because the DAG object lives for + /// the duration of the pre-RA scheduling pass and the MachineInstrs are + /// stable within that lifetime. Unlike SIMachineFunctionInfo caches, this + /// is never serialized. 
+ DenseMap + IGLPExpInterleavePtrCaches; + const GCNSubtarget &ST; SIMachineFunctionInfo &MFI; @@ -317,6 +335,18 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr S); + const IGLPExpInterleavePointerCache * + getIGLPExpInterleavePointerCache(const MachineInstr *MI) const { + auto It = IGLPExpInterleavePtrCaches.find(MI); + return It == IGLPExpInterleavePtrCaches.end() ? nullptr : &It->second; + } + + void + setIGLPExpInterleavePointerCache(const MachineInstr *MI, + const IGLPExpInterleavePointerCache &Cache) { + IGLPExpInterleavePtrCaches[MI] = Cache; + } + void schedule() override; void finalizeSchedule() override; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 32efae69b20c8..95867d442d278 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -772,6 +772,41 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); + + const DenseMap + &SmallGemmCaches = MFI.getMFMASmallGemmSingleWaveCaches(); + for (DenseMap:: + const_iterator It = SmallGemmCaches.begin(), + E = SmallGemmCaches.end(); + It != E; ++It) { + const MachineInstr *MI = It->first; + IGLPSmallGemmSingleWaveCaches.push_back( + {static_cast(MI->getParent()->getNumber()), + static_cast(MI->getOperand(0).getImm()), It->second.DSWCount, + It->second.DSWWithPermCount, It->second.DSWWithSharedVMEMCount}); + } + + const DenseMap + &ExpInterleaveCaches = MFI.getMFMAExpInterleaveCaches(); + for (DenseMap< + const MachineInstr *, + llvm::SIMachineFunctionInfo::MFMAExpInterleaveCache>::const_iterator + It = ExpInterleaveCaches.begin(), + E = ExpInterleaveCaches.end(); + It != E; ++It) { + const MachineInstr *MI = It->first; + IGLPExpInterleaveCaches.push_back( + {static_cast(MI->getParent()->getNumber()), + 
static_cast(MI->getOperand(0).getImm()), + It->second.TransPipeCount, It->second.MFMAPipeCount, + It->second.AddPipeCount, It->second.MFMAEnablement, + It->second.ExpRequirement, It->second.MFMAChains, + It->second.MFMAChainLength, It->second.HasCvt, + It->second.HasChainBetweenCvt, It->second.AnalysisResult}); + } } void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { @@ -821,6 +856,48 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( } else { ScavengeFI = std::nullopt; } + + if (!YamlMFI.IGLPSmallGemmSingleWaveCaches.empty() || + !YamlMFI.IGLPExpInterleaveCaches.empty()) { + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::IGLP_OPT) + continue; + unsigned MBBNum = MBB.getNumber(); + unsigned StrategyId = static_cast(MI.getOperand(0).getImm()); + + for (const yaml::IGLPSmallGemmSingleWaveCacheEntry &E : + YamlMFI.IGLPSmallGemmSingleWaveCaches) { + if (E.MBBNum == MBBNum && E.StrategyId == StrategyId) { + MFMASmallGemmSingleWaveCache Cache = { + E.DSWCount, E.DSWWithPermCount, E.DSWWithSharedVMEMCount}; + MFMASmallGemmSingleWaveCaches[&MI] = Cache; + break; + } + } + + for (const yaml::IGLPExpInterleaveCacheEntry &E : + YamlMFI.IGLPExpInterleaveCaches) { + if (E.MBBNum == MBBNum && E.StrategyId == StrategyId) { + MFMAExpInterleaveCache Cache; + Cache.TransPipeCount = E.TransPipeCount; + Cache.MFMAPipeCount = E.MFMAPipeCount; + Cache.AddPipeCount = E.AddPipeCount; + Cache.MFMAEnablement = E.MFMAEnablement; + Cache.ExpRequirement = E.ExpRequirement; + Cache.MFMAChains = E.MFMAChains; + Cache.MFMAChainLength = E.MFMAChainLength; + Cache.HasCvt = E.HasCvt; + Cache.HasChainBetweenCvt = E.HasChainBetweenCvt; + Cache.AnalysisResult = E.AnalysisResult; + MFMAExpInterleaveCaches[&MI] = Cache; + break; + } + } + } + } + } + return false; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 617862db8f506..7a2ac80c002d1 100644 --- 
a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" @@ -29,6 +30,8 @@ namespace llvm { +class MachineInstr; + class MachineFrameInfo; class MachineFunction; class SIMachineFunctionInfo; @@ -259,6 +262,56 @@ template <> struct MappingTraits { } }; +struct IGLPSmallGemmSingleWaveCacheEntry { + unsigned MBBNum = 0; + unsigned StrategyId = 0; + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, IGLPSmallGemmSingleWaveCacheEntry &E) { + YamlIO.mapRequired("mbb", E.MBBNum); + YamlIO.mapRequired("strategyId", E.StrategyId); + YamlIO.mapRequired("dswCount", E.DSWCount); + YamlIO.mapRequired("dswWithPermCount", E.DSWWithPermCount); + YamlIO.mapRequired("dswWithSharedVMEMCount", E.DSWWithSharedVMEMCount); + } +}; + +struct IGLPExpInterleaveCacheEntry { + unsigned MBBNum = 0; + unsigned StrategyId = 0; + unsigned TransPipeCount = 0; + unsigned MFMAPipeCount = 0; + unsigned AddPipeCount = 0; + unsigned MFMAEnablement = 0; + unsigned ExpRequirement = 0; + unsigned MFMAChains = 0; + unsigned MFMAChainLength = 0; + bool HasCvt = false; + bool HasChainBetweenCvt = false; + bool AnalysisResult = false; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, IGLPExpInterleaveCacheEntry &E) { + YamlIO.mapRequired("mbb", E.MBBNum); + YamlIO.mapRequired("strategyId", E.StrategyId); + YamlIO.mapRequired("transPipeCount", E.TransPipeCount); + YamlIO.mapRequired("mfmaPipeCount", E.MFMAPipeCount); + YamlIO.mapRequired("addPipeCount", E.AddPipeCount); + YamlIO.mapRequired("mfmaEnablement", E.MFMAEnablement); + YamlIO.mapRequired("expRequirement", 
E.ExpRequirement); + YamlIO.mapRequired("mfmaChains", E.MFMAChains); + YamlIO.mapRequired("mfmaChainLength", E.MFMAChainLength); + YamlIO.mapRequired("hasCvt", E.HasCvt); + YamlIO.mapRequired("hasChainBetweenCvt", E.HasChainBetweenCvt); + YamlIO.mapRequired("analysisResult", E.AnalysisResult); + } +}; + struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint64_t ExplicitKernArgSize = 0; Align MaxKernArgAlign; @@ -303,6 +356,9 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool HasInitWholeWave = false; bool IsWholeWaveFunction = false; + SmallVector IGLPSmallGemmSingleWaveCaches; + SmallVector IGLPExpInterleaveCaches; + unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; @@ -360,6 +416,9 @@ template <> struct MappingTraits { YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg, StringValue()); YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false); + YamlIO.mapOptional("iglpSmallGemmSingleWaveCaches", + MFI.IGLPSmallGemmSingleWaveCaches); + YamlIO.mapOptional("iglpExpInterleaveCaches", MFI.IGLPExpInterleaveCaches); YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); @@ -550,6 +609,25 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, bool IsDead = false; }; + struct MFMASmallGemmSingleWaveCache { + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; + }; + + struct MFMAExpInterleaveCache { + unsigned TransPipeCount = 0; + unsigned MFMAPipeCount = 0; + unsigned AddPipeCount = 0; + unsigned MFMAEnablement = 0; + unsigned ExpRequirement = 0; + unsigned MFMAChains = 0; + unsigned MFMAChainLength = 0; + bool HasCvt = false; + bool HasChainBetweenCvt = false; + bool AnalysisResult = false; + }; + private: // To track virtual VGPR + lane index for each subregister of the SGPR spilled // to 
frameindex key during SILowerSGPRSpills pass. @@ -614,7 +692,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // load/store is enabled. IndexedMap MaskForVGPRBlockOps; -private: + DenseMap + MFMASmallGemmSingleWaveCaches; + DenseMap + MFMAExpInterleaveCaches; + Register VGPRForAGPRCopy; bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, @@ -632,6 +714,41 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, VGPRForAGPRCopy = NewVGPRForAGPRCopy; } + const MFMASmallGemmSingleWaveCache * + getMFMASmallGemmSingleWaveCache(const MachineInstr *MI) const { + DenseMap::const_iterator + It = MFMASmallGemmSingleWaveCaches.find(MI); + return It == MFMASmallGemmSingleWaveCaches.end() ? nullptr : &It->second; + } + + void + setMFMASmallGemmSingleWaveCache(const MachineInstr *MI, + const MFMASmallGemmSingleWaveCache &Cache) { + MFMASmallGemmSingleWaveCaches[MI] = Cache; + } + + const MFMAExpInterleaveCache * + getMFMAExpInterleaveCache(const MachineInstr *MI) const { + DenseMap::const_iterator It = + MFMAExpInterleaveCaches.find(MI); + return It == MFMAExpInterleaveCaches.end() ? 
nullptr : &It->second; + } + + void setMFMAExpInterleaveCache(const MachineInstr *MI, + const MFMAExpInterleaveCache &Cache) { + MFMAExpInterleaveCaches[MI] = Cache; + } + + const DenseMap & + getMFMASmallGemmSingleWaveCaches() const { + return MFMASmallGemmSingleWaveCaches; + } + + const DenseMap & + getMFMAExpInterleaveCaches() const { + return MFMAExpInterleaveCaches; + } + bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const; void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) { @@ -647,7 +764,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return MaskForVGPRBlockOps.inBounds(RegisterBlock); } -public: SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default; SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI); @@ -1229,4 +1345,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::IGLPSmallGemmSingleWaveCacheEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::IGLPExpInterleaveCacheEntry) + #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H diff --git a/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir b/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir new file mode 100644 index 0000000000000..501eb308ffc48 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-cache-serialization.mir @@ -0,0 +1,85 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=none -o - %s | FileCheck -check-prefix=ROUNDTRIP %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=postmisched -o - %s | FileCheck -check-prefix=POSTRA %s + +# Test that IGLP cache data serializes/deserializes through MIR and is +# available for PostRA scheduling. + +# Verify round-trip of the serialized cache fields. 
+# ROUNDTRIP: iglpSmallGemmSingleWaveCaches: +# ROUNDTRIP-NEXT: - mbb: 1 +# ROUNDTRIP-NEXT: strategyId: 1 +# ROUNDTRIP-NEXT: dswCount: 6 +# ROUNDTRIP-NEXT: dswWithPermCount: 2 +# ROUNDTRIP-NEXT: dswWithSharedVMEMCount: 0 +# ROUNDTRIP: iglpExpInterleaveCaches: +# ROUNDTRIP-NEXT: - mbb: 0 +# ROUNDTRIP-NEXT: strategyId: 2 +# ROUNDTRIP-NEXT: transPipeCount: 32 +# ROUNDTRIP-NEXT: mfmaPipeCount: 40 +# ROUNDTRIP-NEXT: addPipeCount: 33 +# ROUNDTRIP-NEXT: mfmaEnablement: 2 +# ROUNDTRIP-NEXT: expRequirement: 4 +# ROUNDTRIP-NEXT: mfmaChains: 2 +# ROUNDTRIP-NEXT: mfmaChainLength: 20 +# ROUNDTRIP-NEXT: hasCvt: true +# ROUNDTRIP-NEXT: hasChainBetweenCvt: false +# ROUNDTRIP-NEXT: analysisResult: true + +# Verify PostRA scheduling doesn't crash and passes through the IGLP_OPTs. +# POSTRA: IGLP_OPT 2 +# POSTRA: V_MFMA +# POSTRA: IGLP_OPT 1 +--- | + define amdgpu_kernel void @iglp_cache_test() #0 { ret void } + + attributes #0 = { "amdgpu-flat-work-group-size"="256,256" } +... +--- +name: iglp_cache_test +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +machineFunctionInfo: + isEntryFunction: false + occupancy: 5 + iglpExpInterleaveCaches: + - mbb: 0 + strategyId: 2 + transPipeCount: 32 + mfmaPipeCount: 40 + addPipeCount: 33 + mfmaEnablement: 2 + expRequirement: 4 + mfmaChains: 2 + mfmaChainLength: 20 + hasCvt: true + hasChainBetweenCvt: false + analysisResult: true + iglpSmallGemmSingleWaveCaches: + - mbb: 1 + strategyId: 1 + dswCount: 6 + dswWithPermCount: 2 + dswWithSharedVMEMCount: 0 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr36_vgpr37, $vgpr40_vgpr41, $vgpr44_vgpr45, $vgpr38_vgpr39, $vgpr42_vgpr43 + + IGLP_OPT 2 + $vgpr50 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr51 = V_EXP_F32_e32 $vgpr1, implicit $mode, implicit $exec + $vgpr52 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + early-clobber 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr36_vgpr37, $vgpr40_vgpr41, 0, 0, 0, 0, implicit $mode, implicit $exec + early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr44_vgpr45, $vgpr40_vgpr41, 0, 0, 0, 0, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $vgpr0, $vgpr1 + + IGLP_OPT 1 + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index a10c99070d8e1..64265f6f9a505 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -285,6 +285,34 @@ entry: ret void } +; If we run this function after test_iglp_opt_rev_mfma_gemm, we get: +; > Assertion `(!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && +; > DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA +; > scheduling!"' failed. +; This is because, previously, the counters were global static variables which +; weren't reset. 
+define amdgpu_kernel void @test_after_test_iglp_opt_rev_mfma_gemm(ptr %src, ptr addrspace(3) %dst) { +; GCN-LABEL: test_after_test_iglp_opt_rev_mfma_gemm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: ; iglp_opt mask(0x00000001) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: ds_write_b8 v1, v0 +; GCN-NEXT: s_endpgm +entry: + %a = load i1, ptr %src, align 1 + call void @llvm.amdgcn.iglp.opt(i32 1) + store i1 %a, ptr addrspace(3) %dst, align 1 + ret void +} + define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_asm_sideeffect: ; GCN: ; %bb.0: ; %entry