Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,11 @@ defm Vscnt : AMDGPUSubtargetFeature<"vscnt",
/*GenPredicate=*/0
>;

// Subtarget feature "asynccnt": set for targets that have a separate asynccnt
// counter. GenPredicate is passed as 0.
// NOTE(review): presumably this suppresses the auto-generated predicate for
// the feature -- confirm against the AMDGPUSubtargetFeature multiclass.
defm Asynccnt : AMDGPUSubtargetFeature<"asynccnt",
"Has separate asynccnt counter",
/*GenPredicate=*/0
>;

// Subtarget feature "get-wave-id-inst": set for targets that have the
// s_get_waveid_in_workgroup instruction.
defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst",
"Has s_get_waveid_in_workgroup instruction"
>;
Expand Down Expand Up @@ -2069,6 +2074,7 @@ def FeatureISAVersion12_50_Common : FeatureSet<
FeatureSupportsSRAMECC,
FeatureMaxHardClauseLength63,
FeatureWaitXcnt,
FeatureAsynccnt,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
Expand Down
33 changes: 24 additions & 9 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */

// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
Expand Down Expand Up @@ -217,7 +218,7 @@ enum VmemType {
// Table of S_WAIT_* opcodes with one slot per extended counter type
// (NUM_EXTENDED_INST_CNTS entries in total).
// NOTE(review): the entries appear to follow the order of the InstCounterType
// enum, so a new counter needs its opcode added at the matching position --
// confirm against the code that indexes this table.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, AMDGPU::S_WAIT_ASYNCCNT};

static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
Expand Down Expand Up @@ -405,6 +406,8 @@ class WaitcntGenerator {

// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
// AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
// when a call to @llvm.amdgcn.wait.asyncmark() is processed.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

virtual ~WaitcntGenerator() = default;
Expand Down Expand Up @@ -459,6 +462,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitEventSet({VMEM_BVH_READ_ACCESS}),
WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
WaitEventSet({ASYNC_ACCESS}),
WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
VGPR_XDL_WRITE}),
WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
Expand Down Expand Up @@ -1315,6 +1319,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT(" << SR << "):";
break;
case ASYNC_CNT:
OS << " ASYNC_CNT(" << SR << "):";
break;
case VA_VDST:
OS << " VA_VDST(" << SR << "): ";
break;
Expand Down Expand Up @@ -1419,6 +1426,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT: " << MarkedScore;
break;
case ASYNC_CNT:
OS << " ASYNC_CNT: " << MarkedScore;
break;
default:
OS << " UNKNOWN: " << MarkedScore;
break;
Expand All @@ -1443,6 +1453,7 @@ void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
simplifyXcnt(CheckWait, UpdateWait);
simplifyWaitcnt(UpdateWait, VA_VDST);
simplifyVmVsrc(CheckWait, UpdateWait);
simplifyWaitcnt(UpdateWait, ASYNC_CNT);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
Expand Down Expand Up @@ -1978,7 +1989,8 @@ AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
// Value used for the trailing expert-mode counter arguments: 0 when
// IsExpertMode is set, ~0u otherwise. Here ~0u stands for "don't wait on
// this counter".
unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
// The fourth argument (the store counter) is zeroed only when IncludeVSCnt
// is true. XCNT and ASYNC_CNT are always passed as ~0u, so a blanket
// "all zero" wait never includes them.
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
~0u /* XCNT */, ExpertVal, ExpertVal);
~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
ExpertVal);
}

/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
Expand Down Expand Up @@ -2919,15 +2931,20 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
} else if (TII.isFLAT(Inst)) {
if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
// Async/LDSDMA operations have FLAT encoding but do not actually use flat
// pointers. They do have two operands that each access global and LDS,
// thus making it appear at this point that they are using a flat pointer.
// Filter them out, and for the rest, generate a dependency on flat
// pointers so that both VM and LGKM counters are flushed.
ScoreBrackets->setPendingFlat();
}
if (SIInstrInfo::usesASYNC_CNT(Inst)) {
ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
}
} else if (Inst.isCall()) {
// Act as a wait on everything
// Act as a wait on everything, but AsyncCnt is never included in such
// blanket waits.
ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else if (TII.isVINTERP(Inst)) {
Expand Down Expand Up @@ -3265,12 +3282,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
OldWaitcntInstr = nullptr;

if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
//
// Asyncmarks record the current wait state and so should not allow
// waitcnts that occur after them to be merged into waitcnts that occur
// before.
assert(ST.getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
Expand Down Expand Up @@ -3669,7 +3683,8 @@ bool SIInsertWaitcnts::run() {
BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT ||
CT == ASYNC_CNT)
continue;

if (!ST.hasImageInsts() &&
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
}

/// Reports how many bits the Asynccnt counter field occupies.
/// \returns 6 for ISA version 12.5 and 0 for every other version.
unsigned getAsynccntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
  // Any version other than 12.5 has no Asynccnt field.
  if (VersionMajor != 12 || VersionMinor != 5)
    return 0;
  return 6;
}
Comment thread
ssahasra marked this conversation as resolved.

/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
return VersionMajor >= 12 ? 8 : 0;
Expand Down Expand Up @@ -1824,6 +1829,10 @@ unsigned getXcntBitMask(const IsaVersion &Version) {
return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
}

/// Builds the Asynccnt bit mask for \p Version: the low N bits set, where N is
/// the Asynccnt bit width of that version. The mask is 0 when the width is 0.
unsigned getAsynccntBitMask(const IsaVersion &Version) {
  const unsigned Width = getAsynccntBitWidth(Version.Major, Version.Minor);
  return (1 << Width) - 1;
}

/// Builds the Storecnt bit mask for \p Version: the low N bits set, where N is
/// the Storecnt bit width for the version's major number.
unsigned getStorecntBitMask(const IsaVersion &Version) {
  const auto Width = getStorecntBitWidth(Version.Major);
  return (1 << Width) - 1;
}
Expand All @@ -1843,6 +1852,7 @@ HardwareLimits::HardwareLimits(const IsaVersion &IV) {
BvhcntMax = getBvhcntBitMask(IV);
KmcntMax = getKmcntBitMask(IV);
XcntMax = getXcntBitMask(IV);
AsyncMax = getAsynccntBitMask(IV);
VaVdstMax = DepCtr::getVaVdstBitMask();
VmVsrcMax = DepCtr::getVmVsrcBitMask();
}
Expand Down
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,7 @@ enum InstCounterType {
BVH_CNT, // gfx12+ only.
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
ASYNC_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
VM_VSRC, // gfx12+ expert mode only.
Expand Down Expand Up @@ -1148,7 +1149,7 @@ class Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
unsigned VaVdst, unsigned VmVsrc)
unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc)
: Waitcnt() {
Cnt[LOAD_CNT] = LoadCnt;
Cnt[DS_CNT] = DsCnt;
Expand All @@ -1158,6 +1159,7 @@ class Waitcnt {
Cnt[BVH_CNT] = BvhCnt;
Cnt[KM_CNT] = KmCnt;
Cnt[X_CNT] = XCnt;
Cnt[ASYNC_CNT] = AsyncCnt;
Cnt[VA_VDST] = VaVdst;
Cnt[VM_VSRC] = VmVsrc;
}
Expand Down Expand Up @@ -1220,6 +1222,7 @@ struct HardwareLimits {
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
unsigned XcntMax; // gfx1250.
unsigned AsyncMax; // gfx1250.
unsigned VaVdstMax; // gfx12+ expert mode only.
unsigned VmVsrcMax; // gfx12+ expert mode only.

Expand Down Expand Up @@ -1323,6 +1326,10 @@ unsigned getSamplecntBitMask(const IsaVersion &Version);
/// Returns 0 for versions that do not support BVHcnt
unsigned getBvhcntBitMask(const IsaVersion &Version);

/// \returns Asynccnt bit mask for given isa \p Version.
/// Returns 0 for versions that do not support Asynccnt
unsigned getAsynccntBitMask(const IsaVersion &Version);

/// \returns Dscnt bit mask for given isa \p Version.
/// Returns 0 for versions that do not support DScnt
unsigned getDscntBitMask(const IsaVersion &Version);
Expand Down
Loading