[AMDGPU] Introduce ASYNC_CNT on GFX1250 #185810
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Sameer Sahasrabuddhe (ssahasra) Changes: Async operations transfer data between global memory and LDS. Their progress is tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change introduces the representation of that counter in SIInsertWaitcnts. For now, the programmer must manually insert s_wait_asynccnt instructions. Later changes will add compiler assistance for generating the waits by including this counter in the asyncmark instructions. Assisted-by: Claude Sonnet 4.5 Full diff: https://github.com/llvm/llvm-project/pull/185810.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a0b6ff13e7d7a..4259bf4c1b0bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -972,6 +972,11 @@ defm Vscnt : AMDGPUSubtargetFeature<"vscnt",
/*GenPredicate=*/0
>;
+defm Asynccnt : AMDGPUSubtargetFeature<"asynccnt",
+ "Has separate asynccnt counter",
+ /*GenPredicate=*/0
+>;
+
defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst",
"Has s_get_waveid_in_workgroup instruction"
>;
@@ -2032,6 +2037,7 @@ def FeatureISAVersion12_50_Common : FeatureSet<
FeatureSupportsSRAMECC,
FeatureMaxHardClauseLength63,
FeatureWaitXcnt,
+ FeatureAsynccnt,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b07516c22cf29..a804ba35bade7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -160,7 +160,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
- DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
+ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
+ DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -217,7 +218,7 @@ enum VmemType {
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
+ AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, AMDGPU::S_WAIT_ASYNCCNT};
static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -405,6 +406,8 @@ class WaitcntGenerator {
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+ // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
+ // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
@@ -459,6 +462,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitEventSet({VMEM_BVH_READ_ACCESS}),
WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
+ WaitEventSet({ASYNC_ACCESS}),
WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
VGPR_XDL_WRITE}),
WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
@@ -1314,6 +1318,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT(" << SR << "):";
break;
+ case ASYNC_CNT:
+ OS << " ASYNC_CNT(" << SR << "):";
+ break;
case VA_VDST:
OS << " VA_VDST(" << SR << "): ";
break;
@@ -1418,6 +1425,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT: " << MarkedScore;
break;
+ case ASYNC_CNT:
+ OS << " ASYNC_CNT: " << MarkedScore;
+ break;
default:
OS << " UNKNOWN: " << MarkedScore;
break;
@@ -1442,6 +1452,7 @@ void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
simplifyXcnt(CheckWait, UpdateWait);
simplifyWaitcnt(UpdateWait, VA_VDST);
simplifyVmVsrc(CheckWait, UpdateWait);
+ simplifyWaitcnt(UpdateWait, ASYNC_CNT);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1977,7 +1988,8 @@ AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */, ExpertVal, ExpertVal);
+ ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
+ ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -2917,6 +2929,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->setPendingGDS();
}
+ } else if (SIInstrInfo::usesASYNC_CNT(Inst)) {
+ // Async instructions use flat encoding, so this needs to happen before the
+ // isFLAT() check below.
+ ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
} else if (TII->isFLAT(Inst)) {
if (Inst.mayLoadOrStore() && TII->mayAccessVMEMThroughFlat(Inst) &&
TII->mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
@@ -2927,7 +2943,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// pointers so that both VM and LGKM counters are flushed.
ScoreBrackets->setPendingFlat();
} else if (Inst.isCall()) {
- // Act as a wait on everything
+ // Act as a wait on everything, but AsyncCnt is never included in such
+ // blanket waits.
ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else if (TII->isVINTERP(Inst)) {
@@ -3265,12 +3282,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
OldWaitcntInstr = nullptr;
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
- // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
- //
// Asyncmarks record the current wait state and so should not allow
// waitcnts that occur after them to be merged into waitcnts that occur
// before.
- assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
@@ -3677,7 +3691,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT ||
+ CT == ASYNC_CNT)
continue;
if (!ST->hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 488c150dd5c28..b04e5264feddc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -135,6 +135,10 @@ unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
}
+unsigned getAsynccntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+ return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
return VersionMajor >= 12 ? 8 : 0;
@@ -1808,6 +1812,10 @@ unsigned getXcntBitMask(const IsaVersion &Version) {
return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
}
+unsigned getAsynccntBitMask(const IsaVersion &Version) {
+ return (1 << getAsynccntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
@@ -1827,6 +1835,7 @@ HardwareLimits::HardwareLimits(const IsaVersion &IV) {
BvhcntMax = getBvhcntBitMask(IV);
KmcntMax = getKmcntBitMask(IV);
XcntMax = getXcntBitMask(IV);
+ AsyncMax = getAsynccntBitMask(IV);
VaVdstMax = DepCtr::getVaVdstBitMask();
VmVsrcMax = DepCtr::getVmVsrcBitMask();
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b3d20777ccfcf..9cec56090172b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1096,6 +1096,7 @@ enum InstCounterType {
BVH_CNT, // gfx12+ only.
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
+ ASYNC_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
VM_VSRC, // gfx12+ expert mode only.
@@ -1130,6 +1131,7 @@ class Waitcnt {
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
unsigned XCnt = ~0u; // gfx1250.
+ unsigned AsyncCnt = ~0u; // gfx1250.
unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only.
unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only.
@@ -1152,6 +1154,8 @@ class Waitcnt {
return KmCnt;
case X_CNT:
return XCnt;
+ case ASYNC_CNT:
+ return AsyncCnt;
case VA_VDST:
return VaVdst;
case VM_VSRC:
@@ -1186,6 +1190,9 @@ class Waitcnt {
case X_CNT:
XCnt = Val;
break;
+ case ASYNC_CNT:
+ AsyncCnt = Val;
+ break;
case VA_VDST:
VaVdst = Val;
break;
@@ -1205,10 +1212,10 @@ class Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
- unsigned VaVdst, unsigned VmVsrc)
+ unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt),
- VaVdst(VaVdst), VmVsrc(VmVsrc) {}
+ AsyncCnt(AsyncCnt), VaVdst(VaVdst), VmVsrc(VmVsrc) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
@@ -1230,7 +1237,8 @@ class Waitcnt {
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt),
- std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc));
+ std::min(AsyncCnt, Other.AsyncCnt), std::min(VaVdst, Other.VaVdst),
+ std::min(VmVsrc, Other.VmVsrc));
}
friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait);
@@ -1246,6 +1254,7 @@ struct HardwareLimits {
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
unsigned XcntMax; // gfx1250.
+ unsigned AsyncMax; // gfx1250.
unsigned VaVdstMax; // gfx12+ expert mode only.
unsigned VmVsrcMax; // gfx12+ expert mode only.
@@ -1349,6 +1358,10 @@ unsigned getSamplecntBitMask(const IsaVersion &Version);
/// Returns 0 for versions that do not support BVHcnt
unsigned getBvhcntBitMask(const IsaVersion &Version);
+/// \returns Asynccnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Asynccnt
+unsigned getAsynccntBitMask(const IsaVersion &Version);
+
/// \returns Dscnt bit mask for given isa \p Version.
/// Returns 0 for versions that do not support DScnt
unsigned getDscntBitMask(const IsaVersion &Version);
|
Pierre-vh
left a comment
There was a problem hiding this comment.
I think this is fairly straightforward, LGTM.
RyanRio
left a comment
There was a problem hiding this comment.
Tangential, is there any reason why we still need
// Not addressable, used to model dependencies.
def ASYNCcnt : SIReg <"ASYNCcnt", 0>;
Is this for scheduling purposes?
|
Oh also, should we remove the builtin for specifying asynccnt waits manually? |
|
I could see an argument for removing the builtin and maybe leaving an intrinsic in in case people doing lower-level codegen need to manually force the counters. |
This use of asyncmark is still too new. The users will probably want the old builtin and intrinsic for debugging. We can deprecate it eventually. |
It shows up in the uses and defs of |
Async operations transfer data between global memory and LDS. Their progress is tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change introduces the representation of that counter in SIInsertWaitcnts. For now, the programmer must manually insert s_wait_asynccnt instructions. Later changes will add compiler assistance for generating the waits by including this counter in the asyncmark instructions. Assisted-by: Claude Sonnet 4.5
c912af8 to
68e3556
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/225/builds/4222 Here is the relevant piece of the build log for the reference |
Async operations transfer data between global memory and LDS. Their progress is tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change introduces the representation of that counter in SIInsertWaitcnts. For now, the programmer must manually insert s_wait_asynccnt instructions. Later changes will add compiler assistance for generating the waits by including this counter in the asyncmark instructions. Assisted-by: Claude Sonnet 4.5 This is part of a stack: - llvm#185813 - llvm#185810
The ASYNC_CNT is used to track the progress of asynchronous copies between global and LDS memories. By including it in asyncmark, the compiler can now assist the programmer in generating waits for ASYNC_CNT. Assisted-By: Claude Sonnet 4.5 This is part of a stack: - #185813 - #185810 Fixes: LCOMPILER-332
The ASYNC_CNT is used to track the progress of asynchronous copies between global and LDS memories. By including it in asyncmark, the compiler can now assist the programmer in generating waits for ASYNC_CNT. Assisted-By: Claude Sonnet 4.5 This is part of a stack: - llvm#185813 - llvm#185810 Fixes: LCOMPILER-332
The ASYNC_CNT is used to track the progress of asynchronous copies between global and LDS memories. By including it in asyncmark, the compiler can now assist the programmer in generating waits for ASYNC_CNT. Assisted-By: Claude Sonnet 4.5 This is part of a stack: - llvm#185813 - llvm#185810 Fixes: LCOMPILER-332
Async operations transfer data between global memory and LDS. Their progress is tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change introduces the representation of that counter in SIInsertWaitcnts. For now, the programmer must manually insert s_wait_asynccnt instructions. Later changes will add compiler assistance for generating the waits by including this counter in the asyncmark instructions.
Assisted-by: Claude Sonnet 4.5
This is part of a stack: