Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 74 additions & 88 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
Expand Down Expand Up @@ -899,31 +899,32 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
static unsigned TransPipeCount;
unsigned TransPipeCount = 0;
// The count of MFMA SUs involved in the interleaved pipeline
static unsigned MFMAPipeCount;
unsigned MFMAPipeCount = 0;
// The count of Add SUs involved in the interleaved pipeline
static unsigned AddPipeCount;
unsigned AddPipeCount = 0;
// The number of transitive MFMA successors for each TRANS SU
static unsigned MFMAEnablement;
unsigned MFMAEnablement = 0;
// The number of transitive TRANS predecessors for each MFMA SU
static unsigned ExpRequirement;
unsigned ExpRequirement = 0;
// The count of independent "chains" of MFMA instructions in the pipeline
static unsigned MFMAChains;
unsigned MFMAChains = 0;
// The length of each independent "chain" of MFMA instructions
static unsigned MFMAChainLength;
unsigned MFMAChainLength = 0;
// Whether or not the pipeline has V_CVT instructions
static bool HasCvt;
bool HasCvt = false;
// Whether or not there are instructions between the TRANS instruction and
// V_CVT
static bool HasChainBetweenCvt;
bool HasChainBetweenCvt = false;
// The first occurring DS_READ which feeds an MFMA chain
static std::optional<unsigned> FirstPipeDSR;
std::optional<unsigned> FirstPipeDSR = std::nullopt;
// The MFMAPipe SUs with no MFMA predecessors
SmallVector<SUnit *, 4> MFMAChainSeeds;
// Compute the heuristics for the pipeline, returning whether or not the DAG
// is well formatted for the mutation
bool analyzeDAG(const SIInstrInfo *TII);
bool AnalysisResult;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest moving the member variables into a (nested) struct (similar to MFMAExpInterleaveCache from #137549, although I would not use that name here) and then changing the type of this member from bool to a std::optional wrapping that struct.


/// Whether or not the instruction is a transitive predecessor of an MFMA
/// instruction
Expand Down Expand Up @@ -1334,17 +1335,6 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
}
};

unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;

bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
SmallVector<SUnit *, 10> ExpPipeCands;
SmallVector<SUnit *, 10> MFMAPipeCands;
Expand All @@ -1367,6 +1357,12 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
auto Opc = SU.getInstr()->getOpcode();
if (TII->isTRANS(Opc)) {
// Avoid counting a potential bonus V_EXP which all the MFMA depend on
// FIXME: This heuristic needs improvement/clarification!
// In general, the pipeline seems to look like this:
// fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
// (with potential arithmetic between exp and cvt)
// see
// https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
if (SU.Succs.size() >= 7)
continue;
for (auto &Succ : SU.Succs) {
Expand Down Expand Up @@ -1457,6 +1453,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
}
}

MFMAChainSeeds.clear();
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
if (is_contained(MFMAChainSeeds, MFMAPipeSU))
Expand All @@ -1474,8 +1471,9 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {

for (auto Pred : MFMAChainSeeds[0]->Preds) {
if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
Pred.getSUnit()->getInstr()->mayLoad())
Pred.getSUnit()->getInstr()->mayLoad()) {
FirstPipeDSR = Pred.getSUnit()->NodeNum;
}
}

MFMAChainLength = MFMAPipeCount / MFMAChains;
Expand Down Expand Up @@ -1527,19 +1525,17 @@ bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();

if (Phase != AMDGPU::SchedulingPhase::PostRA)
MFMAChainSeeds.clear();
if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
return false;

return true;
AnalysisResult = analyzeDAG(TII);
return AnalysisResult;
}

bool MFMAExpInterleaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {

assert(AnalysisResult && "no or failed DAG analysis");

bool IsSmallKernelType =
MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
bool IsLargeKernelType =
Expand All @@ -1559,18 +1555,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned CurrMFMAForTransPosition = 0;

auto incrementTransPosition = [&MFMAChain, &PositionInChain,
&CurrMFMAForTransPosition]() {
&CurrMFMAForTransPosition, this]() {
CurrMFMAForTransPosition += MFMAEnablement;
PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
MFMAChain = CurrMFMAForTransPosition % MFMAChains;
};

auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return (TempMFMAForTrans / MFMAChains);
};

auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return TempMFMAForTrans % MFMAChains;
};
Expand All @@ -1580,7 +1576,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned PositionInChainForMFMA = 0;

auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
&PositionInChainForMFMA]() {
&PositionInChainForMFMA, this]() {
++CurrMFMAPosition;
MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
Expand Down Expand Up @@ -2071,22 +2067,16 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
};

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
unsigned DSWCount = 0;
unsigned DSWWithPermCount = 0;
unsigned DSWWithSharedVMEMCount = 0;

bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
DSWWithSharedVMEMCount == 0)) &&
"DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto *I = SU.getInstr();
Expand All @@ -2095,7 +2085,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
else if (I->mayStore() && IsInitial) {
else if (I->mayStore()) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
Expand All @@ -2108,58 +2098,54 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}

if (IsInitial) {
DSWWithPermCount = DSWithPerms.size();
auto *I = DSWithPerms.begin();
auto *E = DSWithPerms.end();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (; I != E; I++) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : (*I)->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;
DSWWithPermCount = DSWithPerms.size();

// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
// We consider partial overlap as a miss -- in other words,
// for a given DS_W, we only consider another DS_W as matching
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
for (SUnit *DSWrite : DSWithPerms) {
SUnit *Cand = nullptr;
bool MissedAny = false;
for (auto &Pred : DSWrite->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;

if (Cand && llvm::is_contained(Counted, Cand))
break;
if (Cand && llvm::is_contained(Counted, Cand))
break;

for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
continue;
for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
continue;

if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = *I;
continue;
}
if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
VMEMLookup[MI] = DSWrite;
continue;
}

auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
if (Inserted) {
MissedAny = true;
continue;
}
auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite);
if (Inserted) {
MissedAny = true;
continue;
}

Cand = It->second;
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
}
Cand = It->second;
if (llvm::is_contained(Counted, Cand)) {
MissedAny = true;
break;
}
}
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(*I);
}
}
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
Counted.push_back(DSWrite);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,28 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b128 v0, v[2:5]
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
; GCN-NEXT: ds_write_b128 v0, v[2:5]
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
Expand Down
Loading