diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 585200657006c..6be61d1d0829a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -999,6 +999,11 @@ defm Vscnt : AMDGPUSubtargetFeature<"vscnt", /*GenPredicate=*/0 >; +defm Asynccnt : AMDGPUSubtargetFeature<"asynccnt", + "Has separate asynccnt counter", + /*GenPredicate=*/0 +>; + defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst", "Has s_get_waveid_in_workgroup instruction" >; @@ -2069,6 +2074,7 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeatureSupportsSRAMECC, FeatureMaxHardClauseLength63, FeatureWaitXcnt, + FeatureAsynccnt, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureFlatBufferGlobalAtomicFaddF64Inst, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c3f69cdb3cbed..522a520c2f50c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -160,7 +160,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \ DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \ DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \ - DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ + DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \ + DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ // clang-format off #define AMDGPU_EVENT_ENUM(Name) Name, @@ -217,7 +218,7 @@ enum VmemType { static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, - AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT}; + AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, AMDGPU::S_WAIT_ASYNCCNT}; static bool updateVMCntOnly(const MachineInstr &Inst) { return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) || @@ -405,6 +406,8 @@ class WaitcntGenerator { // Returns a new waitcnt with all counters except VScnt set to 0. If // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. + // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated + // when a call to @llvm.amdgcn.wait.asyncmark() is processed. virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; virtual ~WaitcntGenerator() = default; @@ -459,6 +462,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator { WaitEventSet({VMEM_BVH_READ_ACCESS}), WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), WaitEventSet({VMEM_GROUP, SMEM_GROUP}), + WaitEventSet({ASYNC_ACCESS}), WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE, VGPR_XDL_WRITE}), WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})}; @@ -1315,6 +1319,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const { case X_CNT: OS << " X_CNT(" << SR << "):"; break; + case ASYNC_CNT: + OS << " ASYNC_CNT(" << SR << "):"; + break; case VA_VDST: OS << " VA_VDST(" << SR << "): "; break; @@ -1419,6 +1426,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const { case X_CNT: OS << " X_CNT: " << MarkedScore; break; + case ASYNC_CNT: + OS << " ASYNC_CNT: " << MarkedScore; + break; default: OS << " UNKNOWN: " << MarkedScore; break; @@ -1443,6 +1453,7 @@ void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait, simplifyXcnt(CheckWait, UpdateWait); simplifyWaitcnt(UpdateWait, VA_VDST); simplifyVmVsrc(CheckWait, UpdateWait); + simplifyWaitcnt(UpdateWait, ASYNC_CNT); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1978,7 +1989,8 @@ AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { unsigned ExpertVal = IsExpertMode ? 0 : ~0u; return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0, - ~0u /* XCNT */, ExpertVal, ExpertVal); + ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal, + ExpertVal); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -2919,15 +2931,20 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } else if (TII.isFLAT(Inst)) { if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) && - TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) + TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) { // Async/LDSDMA operations have FLAT encoding but do not actually use flat // pointers. They do have two operands that each access global and LDS, // thus making it appear at this point that they are using a flat pointer. // Filter them out, and for the rest, generate a dependency on flat // pointers so that both VM and LGKM counters are flushed. ScoreBrackets->setPendingFlat(); + } + if (SIInstrInfo::usesASYNC_CNT(Inst)) { + ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst); + } } else if (Inst.isCall()) { - // Act as a wait on everything + // Act as a wait on everything, but AsyncCnt is never included in such + // blanket waits. ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else if (TII.isVINTERP(Inst)) { @@ -3265,12 +3282,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; if (Inst.getOpcode() == AMDGPU::ASYNCMARK) { - // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. - // // Asyncmarks record the current wait state and so should not allow // waitcnts that occur after them to be merged into waitcnts that occur // before. - assert(ST.getGeneration() < AMDGPUSubtarget::GFX12); ScoreBrackets.recordAsyncMark(Inst); continue; } @@ -3669,7 +3683,8 @@ bool SIInsertWaitcnts::run() { BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) .addImm(0); for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT) + if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT || + CT == ASYNC_CNT) continue; if (!ST.hasImageInsts() && diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 10c9921f34318..1e55acb3b7db0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -135,6 +135,11 @@ unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) { return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0; } +/// \returns Asynccnt bit width. +unsigned getAsynccntBitWidth(unsigned VersionMajor, unsigned VersionMinor) { + return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0; +} + /// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions. unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { return VersionMajor >= 12 ? 8 : 0; @@ -1824,6 +1829,10 @@ unsigned getXcntBitMask(const IsaVersion &Version) { return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1; } +unsigned getAsynccntBitMask(const IsaVersion &Version) { + return (1 << getAsynccntBitWidth(Version.Major, Version.Minor)) - 1; +} + unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } @@ -1843,6 +1852,7 @@ HardwareLimits::HardwareLimits(const IsaVersion &IV) { BvhcntMax = getBvhcntBitMask(IV); KmcntMax = getKmcntBitMask(IV); XcntMax = getXcntBitMask(IV); + AsyncMax = getAsynccntBitMask(IV); VaVdstMax = DepCtr::getVaVdstBitMask(); VmVsrcMax = DepCtr::getVmVsrcBitMask(); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 619a8248b22fb..9fa0a07ed7558 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1101,6 +1101,7 @@ enum InstCounterType { BVH_CNT, // gfx12+ only. KM_CNT, // gfx12+ only. X_CNT, // gfx1250. + ASYNC_CNT, // gfx1250. NUM_EXTENDED_INST_CNTS, VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only. VM_VSRC, // gfx12+ expert mode only. @@ -1148,7 +1149,7 @@ class Waitcnt { // gfx12+ constructor. Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, - unsigned VaVdst, unsigned VmVsrc) + unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc) : Waitcnt() { Cnt[LOAD_CNT] = LoadCnt; Cnt[DS_CNT] = DsCnt; @@ -1158,6 +1159,7 @@ class Waitcnt { Cnt[BVH_CNT] = BvhCnt; Cnt[KM_CNT] = KmCnt; Cnt[X_CNT] = XCnt; + Cnt[ASYNC_CNT] = AsyncCnt; Cnt[VA_VDST] = VaVdst; Cnt[VM_VSRC] = VmVsrc; } @@ -1220,6 +1222,7 @@ struct HardwareLimits { unsigned BvhcntMax; // gfx12+ only. unsigned KmcntMax; // gfx12+ only. unsigned XcntMax; // gfx1250. + unsigned AsyncMax; // gfx1250. unsigned VaVdstMax; // gfx12+ expert mode only. unsigned VmVsrcMax; // gfx12+ expert mode only. @@ -1323,6 +1326,10 @@ unsigned getSamplecntBitMask(const IsaVersion &Version); /// Returns 0 for versions that do not support BVHcnt unsigned getBvhcntBitMask(const IsaVersion &Version); +/// \returns Asynccnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support Asynccnt +unsigned getAsynccntBitMask(const IsaVersion &Version); + /// \returns Dscnt bit mask for given isa \p Version. /// Returns 0 for versions that do not support DScnt unsigned getDscntBitMask(const IsaVersion &Version);