From 7a67b2b9670328535d227f73051e710d59bd3e42 Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Sun, 26 Apr 2026 20:42:41 +0000 Subject: [PATCH 01/17] Preserve MUBUF scope modifiers through StinkyTofu lowering Carry rocisa MUBUF scope metadata through StinkyTofu conversion, IR serialization, and assembly emission so scope-qualified buffer operations survive lowering. --- ...{test_mubuf_off_vaddr.py => test_mubuf.py} | 59 ++++++++++++++++++- .../stinkytofu/ir/asm/StinkyModifiers.hpp | 9 ++- .../conversion/rocisa/ToStinkyTofuUtils.cpp | 6 +- .../serialization/asm/ModifierSerializer.cpp | 7 ++- .../serialization/asm/StinkyAsmEmitter.cpp | 3 + .../tests/filecheck/mubuf_off_vaddr.stir | 10 ++++ .../tests/unit/asm/StinkyAsmEmitterTest.cpp | 30 ++++++++++ 7 files changed, 117 insertions(+), 7 deletions(-) rename projects/hipblaslt/tensilelite/rocisa/test/{test_mubuf_off_vaddr.py => test_mubuf.py} (65%) diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf_off_vaddr.py b/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py similarity index 65% rename from projects/hipblaslt/tensilelite/rocisa/test/test_mubuf_off_vaddr.py rename to projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py index 4a0d4d94411..2b62e277238 100644 --- a/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf_off_vaddr.py +++ b/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py @@ -20,15 +20,16 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ -"""Regression: MUBUF isOff=True vaddr must survive rocisa → StinkyTofu as 'off', not 'v[vgproff]'.""" +"""Regressions for MUBUF rocisa -> StinkyTofu lowering.""" import re import pytest import rocisa from rocisa.code import Module, SignatureBase -from rocisa.container import sgpr, vgpr -from rocisa.instruction import BufferStoreB32 +from rocisa.container import MUBUFModifiers, sgpr, vgpr +from rocisa.enum import CacheScope +from rocisa.instruction import BufferLoadB32, BufferStoreB32 _ISA = (12, 5, 0) @@ -81,3 +82,55 @@ def _mubuf_off_asm() -> str: def test_mubuf_off_vaddr_stinkytofu(_mubuf_off_asm): # Assembler rejects 'v[vgproff]'; 'off' must appear as the literal vaddr operand. assert re.search(r"buffer_store_b32 v12, off, s\[60:63\], s46", _mubuf_off_asm) + + +@pytest.fixture(scope="module") +def _mubuf_scope_asm() -> str: + mod = Module("mubuf_scope_modifiers") + mod.add( + BufferStoreB32( + src=vgpr(12), + vaddr=vgpr(32), + saddr=sgpr(60, 4), + soffset=sgpr(46), + mubuf=MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV), + ) + ) + mod.add( + BufferLoadB32( + dst=vgpr(13), + vaddr=vgpr(33), + saddr=sgpr(64, 4), + soffset=sgpr(47), + mubuf=MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV), + ) + ) + mod.setParent() + + sig = SignatureBase( + kernelName="mubuf_scope_modifiers", + kernArgsVersion=1, + codeObjectVersion="4", + groupSegmentSize=0, + sgprWorkGroup=(1, 1, 0), + vgprWorkItem=0, + flatWorkGroupSize=64, + preloadKernArgs=False, + ) + + st = rocisa.toStinkyTofuModule( + mod, _ISA, "mubuf_scope_modifiers", signature=sig, options={"OptLevel": 0} + ) + st.runOptimizationPipeline() + return st.emitAssembly() + + +def test_mubuf_scope_modifiers_stinkytofu(_mubuf_scope_asm): + assert re.search( + r"buffer_store_b32 v12, v32, s\[60:63\], s46 offen offset:0 scope:SCOPE_DEV", + _mubuf_scope_asm, + ) + assert re.search( + r"buffer_load_b32 v13, v33, s\[64:67\], s47 offen offset:0 scope:SCOPE_DEV", + _mubuf_scope_asm, + ) diff --git a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp index b9d360bb21c..c835f8bc40c 100644 --- a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp +++ b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp @@ -151,9 +151,11 @@ struct MUBUFModifiers : public TypedModifier { MUBUFModifiers(bool offen = false, int offset12 = 0, bool glc = false, bool slc = false, bool nt = false, bool lds = false, bool isStore = false, bool hasMUBUFConst = false, bool hasGLCModifier = false, - bool hasSC0Modifier = false) + bool hasSC0Modifier = false, bool hasSCOPEModifier = false, + const std::string& scope = "") : TypedModifier(), offset12(offset12), + scope(scope), offen(offen), glc(glc), slc(slc), @@ -162,9 +164,11 @@ struct MUBUFModifiers : public TypedModifier { isStore(isStore), hasMUBUFConst(hasMUBUFConst), hasGLCModifier(hasGLCModifier), - hasSC0Modifier(hasSC0Modifier) {} + hasSC0Modifier(hasSC0Modifier), + hasSCOPEModifier(hasSCOPEModifier) {} int offset12; + std::string scope; uint32_t offen : 1; uint32_t glc : 1; uint32_t slc : 1; @@ -174,6 +178,7 @@ struct MUBUFModifiers : public TypedModifier { uint32_t hasMUBUFConst : 1; uint32_t hasGLCModifier : 1; uint32_t hasSC0Modifier : 1; + uint32_t hasSCOPEModifier : 1; }; struct SMEMModifiers : public TypedModifier { diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index 338e8de5634..3877efc9789 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -84,9 +85,12 @@ stinkytofu::MUBUFModifiers convertMUBUFModifiers(const rocisa::MUBUFModifiers& r bool hasMUBUFConst = asmCaps.count("HasMUBUFConst") && asmCaps.at("HasMUBUFConst"); bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); + bool hasSCOPEModifier = asmCaps.count("HasSCOPEModifier") && asmCaps.at("HasSCOPEModifier"); + std::string scope = + rocMod.scope == rocisa::CacheScope::SCOPE_NONE ? "" : rocisa::toString(rocMod.scope); return stinkytofu::MUBUFModifiers(rocMod.offen, rocMod.offset12, rocMod.glc, rocMod.slc, rocMod.nt, rocMod.lds, rocMod.isStore, hasMUBUFConst, - hasGLCModifier, hasSC0Modifier); + hasGLCModifier, hasSC0Modifier, hasSCOPEModifier, scope); } stinkytofu::SMEMModifiers convertSMEMModifiers(const rocisa::SMEMModifiers& rocMod, diff --git a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp index 623ad6aa92c..73889af4565 100644 --- a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp +++ b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp @@ -159,6 +159,9 @@ bool serializeVisit(const MUBUFModifiers& mod, std::ostream& os) { os << " offen = " << (mod.offen ? "true" : "false") << ", offset12 = " << mod.offset12 << ", glc = " << (mod.glc ? "true" : "false") << ", slc = " << (mod.slc ? "true" : "false") << ", nt = " << (mod.nt ? "true" : "false") << ", lds = " << (mod.lds ? "true" : "false"); + if (mod.hasSCOPEModifier && !mod.scope.empty()) { + os << ", scope = \"" << mod.scope << "\""; + } os << " }"; return true; } @@ -380,7 +383,9 @@ void deserializeVisit(StinkyInstruction* inst, const std::string& attrKey, inst->addModifier( MUBUFModifiers(getBool(fields, "offen", false), getInt(fields, "offset12", 0), getBool(fields, "glc", false), getBool(fields, "slc", false), - getBool(fields, "nt", false), getBool(fields, "lds", false))); + getBool(fields, "nt", false), getBool(fields, "lds", false), + false, false, false, false, getStr(fields, "scope", "") != "", + getStr(fields, "scope", ""))); } else if (attrKey == "mod.smem") { inst->addModifier(SMEMModifiers(getBool(fields, "glc", false), getBool(fields, "nv", false), getInt(fields, "offset", 0))); diff --git a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp index 43761f6bd22..16fcc193577 100644 --- a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp +++ b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp @@ -203,6 +203,9 @@ inline std::ostream& operator<<(std::ostream& os, const MUBUFModifiers& mubufMod else if (mubufMod.hasSC0Modifier) os << " sc1"; } + if (mubufMod.hasSCOPEModifier && !mubufMod.scope.empty()) { + os << " scope:" << mubufMod.scope; + } if (mubufMod.nt) { os << " nt"; } diff --git a/shared/stinkytofu/tests/filecheck/mubuf_off_vaddr.stir b/shared/stinkytofu/tests/filecheck/mubuf_off_vaddr.stir index 8669ecb2ac8..84c4276be43 100644 --- a/shared/stinkytofu/tests/filecheck/mubuf_off_vaddr.stir +++ b/shared/stinkytofu/tests/filecheck/mubuf_off_vaddr.stir @@ -25,3 +25,13 @@ st.func @buffer_load_b32_off_vaddr() { ^entry: v0 = "st.buffer_load_b32"(off, s[4:7], s3) { issueCycles = 1, latencyCycles = 100 } } + +#--- MUBUF scope modifier: IR round-trip must preserve scope for asm emission +# CHECK-LABEL: @buffer_store_b32_scope +# CHECK: mod.mubuf +# CHECK-SAME: scope = "SCOPE_DEV" + +st.func @buffer_store_b32_scope() { +^entry: + "st.buffer_store_b32"(v12, v32, s[60:63], s46) { issueCycles = 1, latencyCycles = 1, mod.mubuf = { offen = true, offset12 = 0, glc = false, slc = false, nt = false, lds = false, scope = "SCOPE_DEV" } } +} diff --git a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp index 17182231430..2ed655f42ed 100644 --- a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp +++ b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp @@ -703,3 +703,33 @@ TEST_F(AsmEmitterTest, MUBUFLoadB32_OffVAddr) { std::string expected = "buffer_load_b32 v0, off, s[4:7], s3\n"; EXPECT_EQ(assembly, expected); } + +TEST_F(AsmEmitterTest, MUBUFScopeModifier) { + StinkyInstruction* inst = createInstruction("buffer_store_b32"); + ASSERT_NE(inst, nullptr); + + inst->addSrcReg(StinkyRegister("v", 12, 1)); + inst->addSrcReg(StinkyRegister("v", 32, 1)); + inst->addSrcReg(StinkyRegister("s", 60, 4)); + inst->addSrcReg(StinkyRegister("s", 46, 1)); + + MUBUFModifiers mubufMod(/*offen=*/true, /*offset12=*/0, /*glc=*/false, /*slc=*/false, + /*nt=*/false, /*lds=*/false, /*isStore=*/true, + /*hasMUBUFConst=*/false, /*hasGLCModifier=*/false, + /*hasSC0Modifier=*/false, /*hasSCOPEModifier=*/true, + /*scope=*/"SCOPE_DEV"); + inst->addModifier(mubufMod); + + AsmEmitterOptions options; + options.emitComments = false; + options.emitCycleInfo = false; + options.indent = 0; + options.emitBlankLines = false; + + StinkyAsmEmitter emitter(options); + std::string assembly = emitter.emit(*inst); + + std::string expected = + "buffer_store_b32 v12, v32, s[60:63], s46 offen offset:0 scope:SCOPE_DEV\n"; + EXPECT_EQ(assembly, expected); +} From 089ba4670c4be16cc3e50664773d90e47aa99637 Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Mon, 27 Apr 2026 10:24:42 -0500 Subject: [PATCH 02/17] Format fix --- .../src/serialization/asm/ModifierSerializer.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp index 73889af4565..ec1c4530444 100644 --- a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp +++ b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp @@ -380,12 +380,11 @@ void deserializeVisit(StinkyInstruction* inst, const std::string& attrKey, } else if (attrKey == "mod.global") { inst->addModifier(GLOBALModifiers(getInt(fields, "offset", 0))); } else if (attrKey == "mod.mubuf") { - inst->addModifier( - MUBUFModifiers(getBool(fields, "offen", false), getInt(fields, "offset12", 0), - getBool(fields, "glc", false), getBool(fields, "slc", false), - getBool(fields, "nt", false), getBool(fields, "lds", false), - false, false, false, false, getStr(fields, "scope", "") != "", - getStr(fields, "scope", ""))); + inst->addModifier(MUBUFModifiers( + getBool(fields, "offen", false), getInt(fields, "offset12", 0), + getBool(fields, "glc", false), getBool(fields, "slc", false), + getBool(fields, "nt", false), getBool(fields, "lds", false), false, false, false, false, + getStr(fields, "scope", "") != "", getStr(fields, "scope", ""))); } else if (attrKey == "mod.smem") { inst->addModifier(SMEMModifiers(getBool(fields, "glc", false), getBool(fields, "nv", false), getInt(fields, "offset", 0))); From e79e255f1edbc5a6383ca0ac6e01b77316e63a1e Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Mon, 27 Apr 2026 18:41:38 -0500 Subject: [PATCH 03/17] Use enun rather than std::string for MUBUF modifiers --- .../stinkytofu/ir/asm/StinkyModifiers.hpp | 36 +++++++++++++++++-- .../conversion/rocisa/ToStinkyTofuUtils.cpp | 18 ++++++++-- .../serialization/asm/ModifierSerializer.cpp | 15 ++++---- .../serialization/asm/StinkyAsmEmitter.cpp | 4 +-- .../tests/unit/asm/StinkyAsmEmitterTest.cpp | 2 +- 5 files changed, 61 insertions(+), 14 deletions(-) diff --git a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp index c835f8bc40c..d7db3688491 100644 --- a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp +++ b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include namespace stinkytofu { @@ -34,6 +35,37 @@ enum class HighBitSel : int { NONE = -1, LOW = 0, HIGH = 1 }; enum class MatrixFmt : uint8_t { FP4 = 0, FP6 = 1, FP8 = 2 }; +enum class MUBUFScope : uint8_t { + SCOPE_NONE = 0, + SCOPE_CU = 1, + SCOPE_SE = 2, + SCOPE_DEV = 3, + SCOPE_SYS = 4 +}; + +inline std::string_view toString(MUBUFScope scope) { + switch (scope) { + case MUBUFScope::SCOPE_CU: + return "SCOPE_CU"; + case MUBUFScope::SCOPE_SE: + return "SCOPE_SE"; + case MUBUFScope::SCOPE_DEV: + return "SCOPE_DEV"; + case MUBUFScope::SCOPE_SYS: + return "SCOPE_SYS"; + default: + return ""; + } +} + +inline MUBUFScope parseMUBUFScope(std::string_view scope) { + if (scope == "SCOPE_CU") return MUBUFScope::SCOPE_CU; + if (scope == "SCOPE_SE") return MUBUFScope::SCOPE_SE; + if (scope == "SCOPE_DEV") return MUBUFScope::SCOPE_DEV; + if (scope == "SCOPE_SYS") return MUBUFScope::SCOPE_SYS; + return MUBUFScope::SCOPE_NONE; +} + struct Modifier { enum class Type : uint8_t { DS, @@ -152,7 +184,7 @@ struct MUBUFModifiers : public TypedModifier { bool nt = false, bool lds = false, bool isStore = false, bool hasMUBUFConst = false, bool hasGLCModifier = false, bool hasSC0Modifier = false, bool hasSCOPEModifier = false, - const std::string& scope = "") + MUBUFScope scope = MUBUFScope::SCOPE_NONE) : TypedModifier(), offset12(offset12), scope(scope), @@ -168,7 +200,7 @@ struct MUBUFModifiers : public TypedModifier { hasSCOPEModifier(hasSCOPEModifier) {} int offset12; - std::string scope; + MUBUFScope scope; uint32_t offen : 1; uint32_t glc : 1; uint32_t slc : 1; diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index 3877efc9789..1763da75e69 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -80,14 +80,28 @@ stinkytofu::FLATModifiers convertFLATModifiers(const rocisa::FLATModifiers& rocM rocMod.isStore, hasGLCModifier, hasSC0Modifier); } +stinkytofu::MUBUFScope convertMUBUFScope(rocisa::CacheScope scope) { + switch (scope) { + case rocisa::CacheScope::SCOPE_CU: + return stinkytofu::MUBUFScope::SCOPE_CU; + case rocisa::CacheScope::SCOPE_SE: + return stinkytofu::MUBUFScope::SCOPE_SE; + case rocisa::CacheScope::SCOPE_DEV: + return stinkytofu::MUBUFScope::SCOPE_DEV; + case rocisa::CacheScope::SCOPE_SYS: + return stinkytofu::MUBUFScope::SCOPE_SYS; + default: + return stinkytofu::MUBUFScope::SCOPE_NONE; + } +} + stinkytofu::MUBUFModifiers convertMUBUFModifiers(const rocisa::MUBUFModifiers& rocMod, const std::map& asmCaps) { bool hasMUBUFConst = asmCaps.count("HasMUBUFConst") && asmCaps.at("HasMUBUFConst"); bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); bool hasSCOPEModifier = asmCaps.count("HasSCOPEModifier") && asmCaps.at("HasSCOPEModifier"); - std::string scope = - rocMod.scope == rocisa::CacheScope::SCOPE_NONE ? "" : rocisa::toString(rocMod.scope); + stinkytofu::MUBUFScope scope = convertMUBUFScope(rocMod.scope); return stinkytofu::MUBUFModifiers(rocMod.offen, rocMod.offset12, rocMod.glc, rocMod.slc, rocMod.nt, rocMod.lds, rocMod.isStore, hasMUBUFConst, hasGLCModifier, hasSC0Modifier, hasSCOPEModifier, scope); diff --git a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp index ec1c4530444..750d96ea6da 100644 --- a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp +++ b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp @@ -159,8 +159,8 @@ bool serializeVisit(const MUBUFModifiers& mod, std::ostream& os) { os << " offen = " << (mod.offen ? "true" : "false") << ", offset12 = " << mod.offset12 << ", glc = " << (mod.glc ? "true" : "false") << ", slc = " << (mod.slc ? "true" : "false") << ", nt = " << (mod.nt ? "true" : "false") << ", lds = " << (mod.lds ? "true" : "false"); - if (mod.hasSCOPEModifier && !mod.scope.empty()) { - os << ", scope = \"" << mod.scope << "\""; + if (mod.hasSCOPEModifier && mod.scope != MUBUFScope::SCOPE_NONE) { + os << ", scope = \"" << toString(mod.scope) << "\""; } os << " }"; return true; @@ -380,11 +380,12 @@ void deserializeVisit(StinkyInstruction* inst, const std::string& attrKey, } else if (attrKey == "mod.global") { inst->addModifier(GLOBALModifiers(getInt(fields, "offset", 0))); } else if (attrKey == "mod.mubuf") { - inst->addModifier(MUBUFModifiers( - getBool(fields, "offen", false), getInt(fields, "offset12", 0), - getBool(fields, "glc", false), getBool(fields, "slc", false), - getBool(fields, "nt", false), getBool(fields, "lds", false), false, false, false, false, - getStr(fields, "scope", "") != "", getStr(fields, "scope", ""))); + MUBUFScope scope = parseMUBUFScope(getStr(fields, "scope", "")); + inst->addModifier( + MUBUFModifiers(getBool(fields, "offen", false), getInt(fields, "offset12", 0), + getBool(fields, "glc", false), getBool(fields, "slc", false), + getBool(fields, "nt", false), getBool(fields, "lds", false), false, + false, false, false, scope != MUBUFScope::SCOPE_NONE, scope)); } else if (attrKey == "mod.smem") { inst->addModifier(SMEMModifiers(getBool(fields, "glc", false), getBool(fields, "nv", false), getInt(fields, "offset", 0))); diff --git a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp index 16fcc193577..b8b49df1c87 100644 --- a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp +++ b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp @@ -203,8 +203,8 @@ inline std::ostream& operator<<(std::ostream& os, const MUBUFModifiers& mubufMod else if (mubufMod.hasSC0Modifier) os << " sc1"; } - if (mubufMod.hasSCOPEModifier && !mubufMod.scope.empty()) { - os << " scope:" << mubufMod.scope; + if (mubufMod.hasSCOPEModifier && mubufMod.scope != MUBUFScope::SCOPE_NONE) { + os << " scope:" << toString(mubufMod.scope); } if (mubufMod.nt) { os << " nt"; diff --git a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp index 2ed655f42ed..8623669e7c8 100644 --- a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp +++ b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp @@ -717,7 +717,7 @@ TEST_F(AsmEmitterTest, MUBUFScopeModifier) { /*nt=*/false, /*lds=*/false, /*isStore=*/true, /*hasMUBUFConst=*/false, /*hasGLCModifier=*/false, /*hasSC0Modifier=*/false, /*hasSCOPEModifier=*/true, - /*scope=*/"SCOPE_DEV"); + /*scope=*/MUBUFScope::SCOPE_DEV); inst->addModifier(mubufMod); AsmEmitterOptions options; From d574f981d588af5cf4da6cf100afdab5b0dbc43c Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Tue, 28 Apr 2026 20:03:28 +0000 Subject: [PATCH 04/17] Add offset flags if not set or no mubuf modifiers --- .../tensilelite/rocisa/rocisa/include/instruction/mem.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp index 486cee166b0..f5d55deb506 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp @@ -352,6 +352,10 @@ namespace rocisa { kStr += mubuf->toString(); } + if(!mubuf || !mubuf->offen) + { + kStr += " offen offset:0"; + } kStr = formatWithComment(kStr); setMsb(kStr, {vaddr}, dst); return kStr; @@ -714,6 +718,10 @@ namespace rocisa { kStr += mubuf->toString(); } + if(!mubuf || !mubuf->offen) + { + kStr += " offen offset:0"; + } kStr = formatWithComment(kStr); setMsb(kStr, {vaddr}, srcData); return kStr; From 58c54c2041c5796b2b813b5b18c31de0c74195e4 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Wed, 29 Apr 2026 16:38:35 +0000 Subject: [PATCH 05/17] Fix MUBUF null soffset emission Ensure real-vaddr MUBUF instructions keep an address modifier when literal zero soffsets are lowered to null, while preserving the off-vaddr form. --- .../rocisa/rocisa/include/instruction/mem.hpp | 18 ++- .../tensilelite/rocisa/test/test_mubuf.py | 135 +++++++++++++++++- .../stinkytofu/ir/asm/StinkyModifiers.hpp | 6 +- .../conversion/rocisa/ToStinkyTofuUtils.cpp | 44 +++++- 4 files changed, 192 insertions(+), 11 deletions(-) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp index f5d55deb506..6c1e7a16391 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp @@ -275,6 +275,16 @@ namespace rocisa } }; + /// Returns true when vaddr is the MUBUF "off" keyword. + inline bool isOffVAddr(const std::shared_ptr& vaddr) + { + if(auto* regCont = dynamic_cast(vaddr.get())) + { + return regCont->isOff; + } + return false; + } + struct MUBUFReadInstruction : public GlobalReadInstruction { std::shared_ptr vaddr; @@ -352,7 +362,7 @@ namespace rocisa { kStr += mubuf->toString(); } - if(!mubuf || !mubuf->offen) + if((!mubuf || !mubuf->offen) && !isOffVAddr(vaddr)) { kStr += " offen offset:0"; } @@ -718,7 +728,7 @@ namespace rocisa { kStr += mubuf->toString(); } - if(!mubuf || !mubuf->offen) + if((!mubuf || !mubuf->offen) && !isOffVAddr(vaddr)) { kStr += " offen offset:0"; } @@ -1624,6 +1634,10 @@ namespace rocisa std::string kStr = instStr + " " + getArgStr(); if(mubuf) kStr += mubuf->toString(); + if((!mubuf || !mubuf->offen) && !isOffVAddr(vaddr)) + { + kStr += " offen offset:0"; + } kStr = formatWithComment(kStr); setMsb(kStr, {vaddr}, srcData); return kStr; diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py b/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py index 2b62e277238..c5689246d0e 100644 --- a/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py +++ b/projects/hipblaslt/tensilelite/rocisa/test/test_mubuf.py @@ -29,7 +29,13 @@ from rocisa.code import Module, SignatureBase from rocisa.container import MUBUFModifiers, sgpr, vgpr from rocisa.enum import CacheScope -from rocisa.instruction import BufferLoadB32, BufferStoreB32 +from rocisa.instruction import ( + BufferAtomicAddF32, + BufferLoadB32, + BufferLoadB64, + BufferLoadB128, + BufferStoreB32, +) _ISA = (12, 5, 0) @@ -60,6 +66,14 @@ def _mubuf_off_asm() -> str: soffset=sgpr(46), ) ) + mod.add( + BufferStoreB32( + src=vgpr(13), + vaddr=vgpr("off", isOff=True), + saddr=sgpr(64, 4), + soffset=0, + ) + ) mod.setParent() # resolves symbolic register names before conversion sig = SignatureBase( @@ -80,8 +94,8 @@ def _mubuf_off_asm() -> str: def test_mubuf_off_vaddr_stinkytofu(_mubuf_off_asm): - # Assembler rejects 'v[vgproff]'; 'off' must appear as the literal vaddr operand. assert re.search(r"buffer_store_b32 v12, off, s\[60:63\], s46", _mubuf_off_asm) + assert re.search(r"buffer_store_b32 v13, off, s\[64:67\], null", _mubuf_off_asm) @pytest.fixture(scope="module") @@ -134,3 +148,120 @@ def test_mubuf_scope_modifiers_stinkytofu(_mubuf_scope_asm): r"buffer_load_b32 v13, v33, s\[64:67\], s47 offen offset:0 scope:SCOPE_DEV", _mubuf_scope_asm, ) + + +def test_rocisa_load_null_soffset_adds_offen(): + inst = BufferLoadB32( + dst=vgpr(13), + vaddr=vgpr(33), + saddr=sgpr(64, 4), + soffset=0, + ) + + assert str(inst).strip() == "buffer_load_b32 v13, v33, s[64:67], null offen offset:0" + + +def test_rocisa_store_null_soffset_adds_offen(): + inst = BufferStoreB32( + src=vgpr(12), + vaddr=vgpr(32), + saddr=sgpr(60, 4), + soffset=0, + ) + + assert str(inst).strip() == "buffer_store_b32 v12, v32, s[60:63], null offen offset:0" + + +def test_rocisa_atomic_null_soffset_adds_offen(): + inst = BufferAtomicAddF32( + src=vgpr(12), + vaddr=vgpr(32), + saddr=sgpr(60, 4), + soffset=0, + ) + + assert str(inst).strip() == "buffer_atomic_add_f32 v12, v32, s[60:63], null offen offset:0" + + +def test_rocisa_off_vaddr_null_soffset(): + inst = BufferStoreB32( + src=vgpr(12), + vaddr=vgpr("off", isOff=True), + saddr=sgpr(60, 4), + soffset=0, + ) + + assert str(inst).strip() == "buffer_store_b32 v12, off, s[60:63], null" + + +@pytest.fixture(scope="module") +def _mubuf_zero_soffset_asm() -> str: + mod = Module("mubuf_zero_soffset") + mod.add( + BufferLoadB128( + dst=vgpr(0, 4), + vaddr=vgpr(32), + saddr=sgpr(60, 4), + soffset=0, + mubuf=MUBUFModifiers(offen=True), + comment="G -> Reg 0_0_0_0", + ) + ) + mod.add( + BufferLoadB64( + dst=vgpr(4, 2), + vaddr=vgpr(33), + saddr=sgpr(60, 4), + soffset=0, + comment="G -> Reg 0_0_1_0", + ) + ) + mod.add( + BufferLoadB32( + dst=vgpr(6), + vaddr=vgpr(34), + saddr=sgpr(60, 4), + soffset=0, + mubuf=MUBUFModifiers(offen=False), + comment="G -> Reg 0_0_2_0", + ) + ) + mod.setParent() + + sig = SignatureBase( + kernelName="mubuf_zero_soffset", + kernArgsVersion=1, + codeObjectVersion="4", + groupSegmentSize=0, + sgprWorkGroup=(1, 1, 0), + vgprWorkItem=0, + flatWorkGroupSize=64, + preloadKernArgs=False, + ) + + st = rocisa.toStinkyTofuModule( + mod, _ISA, "mubuf_zero_soffset", signature=sig, options={"OptLevel": 0} + ) + st.runOptimizationPipeline() + return st.emitAssembly() + + +def test_stinky_null_soffset_explicit_offen(_mubuf_zero_soffset_asm): + assert re.search( + r"buffer_load_b128 v\[0:3\], v32, s\[60:63\], null offen offset:0", + _mubuf_zero_soffset_asm, + ) + + +def test_stinky_null_soffset_default_mubuf(_mubuf_zero_soffset_asm): + assert re.search( + r"buffer_load_b64 v\[4:5\], v33, s\[60:63\], null offen offset:0", + _mubuf_zero_soffset_asm, + ) + + +def test_stinky_null_soffset_offen_false(_mubuf_zero_soffset_asm): + assert re.search( + r"buffer_load_b32 v6, v34, s\[60:63\], null offen offset:0", + _mubuf_zero_soffset_asm, + ) diff --git a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp index d7db3688491..2c7ff4b7272 100644 --- a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp +++ b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp @@ -187,7 +187,6 @@ struct MUBUFModifiers : public TypedModifier { MUBUFScope scope = MUBUFScope::SCOPE_NONE) : TypedModifier(), offset12(offset12), - scope(scope), offen(offen), glc(glc), slc(slc), @@ -197,10 +196,10 @@ struct MUBUFModifiers : public TypedModifier { hasMUBUFConst(hasMUBUFConst), hasGLCModifier(hasGLCModifier), hasSC0Modifier(hasSC0Modifier), - hasSCOPEModifier(hasSCOPEModifier) {} + hasSCOPEModifier(hasSCOPEModifier), + scope(scope) {} int offset12; - MUBUFScope scope; uint32_t offen : 1; uint32_t glc : 1; uint32_t slc : 1; @@ -211,6 +210,7 @@ struct MUBUFModifiers : public TypedModifier { uint32_t hasGLCModifier : 1; uint32_t hasSC0Modifier : 1; uint32_t hasSCOPEModifier : 1; + MUBUFScope scope; }; struct SMEMModifiers : public TypedModifier { diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index 1763da75e69..ad683695f1b 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -107,6 +107,38 @@ stinkytofu::MUBUFModifiers convertMUBUFModifiers(const rocisa::MUBUFModifiers& r hasGLCModifier, hasSC0Modifier, hasSCOPEModifier, scope); } +/// Returns true when vaddr is the MUBUF "off" keyword. +bool isOffVaddrContainer(const rocisa::Container* vaddr) { + if (auto* regCont = dynamic_cast(vaddr)) { + return regCont->isOff; + } + return false; +} + +/// Build modifiers matching rocisa's MUBUF address form: real vaddr operands +/// use `offen`, while `off` keeps the no-vaddr form. +stinkytofu::MUBUFModifiers buildMUBUFModifiersForBufferOp( + const std::optional& rocMubuf, const rocisa::Container* vaddr, + const std::map& asmCaps) { + bool hasMUBUFConst = asmCaps.count("HasMUBUFConst") && asmCaps.at("HasMUBUFConst"); + bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); + bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); + + stinkytofu::MUBUFModifiers mod = rocMubuf.has_value() + ? convertMUBUFModifiers(rocMubuf.value(), asmCaps) + : stinkytofu::MUBUFModifiers( + /*offen=*/false, /*offset12=*/0, /*glc=*/false, + /*slc=*/false, /*nt=*/false, /*lds=*/false, + /*isStore=*/false, hasMUBUFConst, hasGLCModifier, + hasSC0Modifier, /*hasSCOPEModifier=*/false, + stinkytofu::MUBUFScope::SCOPE_NONE); + + if (!mod.offen && !isOffVaddrContainer(vaddr)) { + mod.offen = 1; + } + return mod; +} + stinkytofu::SMEMModifiers convertSMEMModifiers(const rocisa::SMEMModifiers& rocMod, const std::map& asmCaps) { bool hasSCOPEModifier = asmCaps.count("HasSCOPEModifier") && asmCaps.at("HasSCOPEModifier"); @@ -571,10 +603,14 @@ void addModifiersToInstruction(StinkyInstruction* stinkyInst, const rocisa::Inst [&](const auto& mod) { return convertFLATModifiers(mod, asmCaps); }) else TRY_ADD_MOD(FLATStoreInstruction, flat, stinkytofu::FLATModifiers, [&](const auto& mod) { return convertFLATModifiers(mod, asmCaps); }) - else TRY_ADD_MOD(MUBUFReadInstruction, mubuf, stinkytofu::MUBUFModifiers, - [&](const auto& mod) { return convertMUBUFModifiers(mod, asmCaps); }) - else TRY_ADD_MOD(MUBUFStoreInstruction, mubuf, stinkytofu::MUBUFModifiers, - [&](const auto& mod) { return convertMUBUFModifiers(mod, asmCaps); }) + else if (auto typed = dynamic_cast(inst)) { + stinkyInst->addModifier( + buildMUBUFModifiersForBufferOp(typed->mubuf, typed->vaddr.get(), asmCaps)); + } + else if (auto typed = dynamic_cast(inst)) { + stinkyInst->addModifier( + buildMUBUFModifiersForBufferOp(typed->mubuf, typed->vaddr.get(), asmCaps)); + } else TRY_ADD_MOD(SMemLoadInstruction, smem, stinkytofu::SMEMModifiers, [&](const auto& mod) { return convertSMEMModifiers(mod, asmCaps); }) else TRY_ADD_MOD(SMemStoreInstruction, smem, stinkytofu::SMEMModifiers, From 676fad726dd61f0b6e3be8006c1579f6950c5394 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Wed, 29 Apr 2026 17:03:58 +0000 Subject: [PATCH 06/17] Fix format --- .../src/conversion/rocisa/ToStinkyTofuUtils.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index ad683695f1b..e5d058c3a1c 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -124,14 +124,13 @@ stinkytofu::MUBUFModifiers buildMUBUFModifiersForBufferOp( bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); - stinkytofu::MUBUFModifiers mod = rocMubuf.has_value() - ? convertMUBUFModifiers(rocMubuf.value(), asmCaps) - : stinkytofu::MUBUFModifiers( - /*offen=*/false, /*offset12=*/0, /*glc=*/false, - /*slc=*/false, /*nt=*/false, /*lds=*/false, - /*isStore=*/false, hasMUBUFConst, hasGLCModifier, - hasSC0Modifier, /*hasSCOPEModifier=*/false, - stinkytofu::MUBUFScope::SCOPE_NONE); + stinkytofu::MUBUFModifiers mod = + rocMubuf.has_value() ? convertMUBUFModifiers(rocMubuf.value(), asmCaps) + : stinkytofu::MUBUFModifiers( + /*offen=*/false, /*offset12=*/0, /*glc=*/false, + /*slc=*/false, /*nt=*/false, /*lds=*/false, + /*isStore=*/false, hasMUBUFConst, hasGLCModifier, hasSC0Modifier, + /*hasSCOPEModifier=*/false, stinkytofu::MUBUFScope::SCOPE_NONE); if (!mod.offen && !isOffVaddrContainer(vaddr)) { mod.offen = 1; From aa9b63c9a2a00d01626a4bbd8bd273831178db84 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Thu, 30 Apr 2026 15:08:02 +0000 Subject: [PATCH 07/17] Remove redundant MUBUF scope flag --- .../stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp | 5 +---- .../stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp | 5 ++--- .../stinkytofu/src/serialization/asm/ModifierSerializer.cpp | 4 ++-- shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp | 2 +- shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp | 3 +-- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp index 2c7ff4b7272..d6164a25987 100644 --- a/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp +++ b/shared/stinkytofu/include/stinkytofu/ir/asm/StinkyModifiers.hpp @@ -183,8 +183,7 @@ struct MUBUFModifiers : public TypedModifier { MUBUFModifiers(bool offen = false, int offset12 = 0, bool glc = false, bool slc = false, bool nt = false, bool lds = false, bool isStore = false, bool hasMUBUFConst = false, bool hasGLCModifier = false, - bool hasSC0Modifier = false, bool hasSCOPEModifier = false, - MUBUFScope scope = MUBUFScope::SCOPE_NONE) + bool hasSC0Modifier = false, MUBUFScope scope = MUBUFScope::SCOPE_NONE) : TypedModifier(), offset12(offset12), offen(offen), @@ -196,7 +195,6 @@ struct MUBUFModifiers : public TypedModifier { hasMUBUFConst(hasMUBUFConst), hasGLCModifier(hasGLCModifier), hasSC0Modifier(hasSC0Modifier), - hasSCOPEModifier(hasSCOPEModifier), scope(scope) {} int offset12; @@ -209,7 +207,6 @@ struct MUBUFModifiers : public TypedModifier { uint32_t hasMUBUFConst : 1; uint32_t hasGLCModifier : 1; uint32_t hasSC0Modifier : 1; - uint32_t hasSCOPEModifier : 1; MUBUFScope scope; }; diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index e5d058c3a1c..2da0ffb511b 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -100,11 +100,10 @@ stinkytofu::MUBUFModifiers convertMUBUFModifiers(const rocisa::MUBUFModifiers& r bool hasMUBUFConst = asmCaps.count("HasMUBUFConst") && asmCaps.at("HasMUBUFConst"); bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); - bool hasSCOPEModifier = asmCaps.count("HasSCOPEModifier") && asmCaps.at("HasSCOPEModifier"); stinkytofu::MUBUFScope scope = convertMUBUFScope(rocMod.scope); return stinkytofu::MUBUFModifiers(rocMod.offen, rocMod.offset12, rocMod.glc, rocMod.slc, rocMod.nt, rocMod.lds, rocMod.isStore, hasMUBUFConst, - hasGLCModifier, hasSC0Modifier, hasSCOPEModifier, scope); + hasGLCModifier, hasSC0Modifier, scope); } /// Returns true when vaddr is the MUBUF "off" keyword. @@ -130,7 +129,7 @@ stinkytofu::MUBUFModifiers buildMUBUFModifiersForBufferOp( /*offen=*/false, /*offset12=*/0, /*glc=*/false, /*slc=*/false, /*nt=*/false, /*lds=*/false, /*isStore=*/false, hasMUBUFConst, hasGLCModifier, hasSC0Modifier, - /*hasSCOPEModifier=*/false, stinkytofu::MUBUFScope::SCOPE_NONE); + stinkytofu::MUBUFScope::SCOPE_NONE); if (!mod.offen && !isOffVaddrContainer(vaddr)) { mod.offen = 1; diff --git a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp index 750d96ea6da..39388f9fdda 100644 --- a/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp +++ b/shared/stinkytofu/src/serialization/asm/ModifierSerializer.cpp @@ -159,7 +159,7 @@ bool serializeVisit(const MUBUFModifiers& mod, std::ostream& os) { os << " offen = " << (mod.offen ? "true" : "false") << ", offset12 = " << mod.offset12 << ", glc = " << (mod.glc ? "true" : "false") << ", slc = " << (mod.slc ? "true" : "false") << ", nt = " << (mod.nt ? "true" : "false") << ", lds = " << (mod.lds ? "true" : "false"); - if (mod.hasSCOPEModifier && mod.scope != MUBUFScope::SCOPE_NONE) { + if (mod.scope != MUBUFScope::SCOPE_NONE) { os << ", scope = \"" << toString(mod.scope) << "\""; } os << " }"; @@ -385,7 +385,7 @@ void deserializeVisit(StinkyInstruction* inst, const std::string& attrKey, MUBUFModifiers(getBool(fields, "offen", false), getInt(fields, "offset12", 0), getBool(fields, "glc", false), getBool(fields, "slc", false), getBool(fields, "nt", false), getBool(fields, "lds", false), false, - false, false, false, scope != MUBUFScope::SCOPE_NONE, scope)); + false, false, false, scope)); } else if (attrKey == "mod.smem") { inst->addModifier(SMEMModifiers(getBool(fields, "glc", false), getBool(fields, "nv", false), getInt(fields, "offset", 0))); diff --git a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp index b8b49df1c87..c4884623a38 100644 --- a/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp +++ b/shared/stinkytofu/src/serialization/asm/StinkyAsmEmitter.cpp @@ -203,7 +203,7 @@ inline std::ostream& operator<<(std::ostream& os, const MUBUFModifiers& mubufMod else if (mubufMod.hasSC0Modifier) os << " sc1"; } - if (mubufMod.hasSCOPEModifier && mubufMod.scope != MUBUFScope::SCOPE_NONE) { + if (mubufMod.scope != MUBUFScope::SCOPE_NONE) { os << " scope:" << toString(mubufMod.scope); } if (mubufMod.nt) { diff --git a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp index 8623669e7c8..8bdcc69facc 100644 --- a/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp +++ b/shared/stinkytofu/tests/unit/asm/StinkyAsmEmitterTest.cpp @@ -716,8 +716,7 @@ TEST_F(AsmEmitterTest, MUBUFScopeModifier) { MUBUFModifiers mubufMod(/*offen=*/true, /*offset12=*/0, /*glc=*/false, /*slc=*/false, /*nt=*/false, /*lds=*/false, /*isStore=*/true, /*hasMUBUFConst=*/false, /*hasGLCModifier=*/false, - /*hasSC0Modifier=*/false, /*hasSCOPEModifier=*/true, - /*scope=*/MUBUFScope::SCOPE_DEV); + /*hasSC0Modifier=*/false, /*scope=*/MUBUFScope::SCOPE_DEV); inst->addModifier(mubufMod); AsmEmitterOptions options; From 53bf38bda6bbb17124ea7796133c1073f6056e27 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Thu, 30 Apr 2026 15:23:26 +0000 Subject: [PATCH 08/17] Fix format --- .../src/conversion/rocisa/ToStinkyTofuUtils.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp index 2da0ffb511b..c7425f08781 100644 --- a/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp +++ b/shared/stinkytofu/src/conversion/rocisa/ToStinkyTofuUtils.cpp @@ -123,13 +123,13 @@ stinkytofu::MUBUFModifiers buildMUBUFModifiersForBufferOp( bool hasGLCModifier = asmCaps.count("HasGLCModifier") && asmCaps.at("HasGLCModifier"); bool hasSC0Modifier = asmCaps.count("HasSC0Modifier") && asmCaps.at("HasSC0Modifier"); - stinkytofu::MUBUFModifiers mod = - rocMubuf.has_value() ? convertMUBUFModifiers(rocMubuf.value(), asmCaps) - : stinkytofu::MUBUFModifiers( - /*offen=*/false, /*offset12=*/0, /*glc=*/false, - /*slc=*/false, /*nt=*/false, /*lds=*/false, - /*isStore=*/false, hasMUBUFConst, hasGLCModifier, hasSC0Modifier, - stinkytofu::MUBUFScope::SCOPE_NONE); + stinkytofu::MUBUFModifiers mod = rocMubuf.has_value() + ? convertMUBUFModifiers(rocMubuf.value(), asmCaps) + : stinkytofu::MUBUFModifiers( + /*offen=*/false, /*offset12=*/0, /*glc=*/false, + /*slc=*/false, /*nt=*/false, /*lds=*/false, + /*isStore=*/false, hasMUBUFConst, hasGLCModifier, + hasSC0Modifier, stinkytofu::MUBUFScope::SCOPE_NONE); if (!mod.offen && !isOffVaddrContainer(vaddr)) { mod.offen = 1; From aae24e60344091d69999cabab6d87fcca6d351b9 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Fri, 24 Apr 2026 10:27:27 -0600 Subject: [PATCH 09/17] KernelWriter: fix missing s_wait_tensorcnt before PGR barrier in TDM kernels tensor_load_to_lds writes to LDS via the tensor counter. The PGR barrier ("PGR, and wait until LW done to sync LDSx") that guards the double-buffer swap requires s_wait_tensorcnt 0 before s_barrier_signal to ensure TDM stores have committed to LDS before other wavefronts proceed past the barrier. The DirectToLds path already emits an equivalent vlcnt wait here; the TDM path did not, resulting in a RAW hazard on the LDS buffer. Add an elif for enableTDMA+enableTDMB that calls _wait(skipGlobalRead=0), routing through tdmWait() to emit s_wait_loadcnt 0 + s_wait_tensorcnt 0 into waitLWCode ahead of the barrier. Set skipForceWaitcnt0=True for TDM to suppress the redundant force-waitcnt0 path. --- .../hipblaslt/tensilelite/Tensile/KernelWriter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index acc1de93c80..35bc1678ffa 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -3917,12 +3917,19 @@ def _loopBody( self, kernel, tensorParametersA, tensorParametersB, pack, packPre vlcntVal = kernel["PrefetchGlobalRead"] - 1 if kernel["PrefetchGlobalRead"] >= 2 else 0 waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, vlcntVal, -1, -1, \ "wait for previous set of global reads")) + elif kernel["enableTDMA"] and kernel["enableTDMB"]: + # TDM case: tensor_load_to_lds instructions (issued in prior iter) write to LDS via the + # tensor counter. A s_wait_tensorcnt 0 is required before the barrier to guarantee all + # TDM stores to LDS have landed before other waves read from that LDS buffer. + waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, 0, -1, -1, \ + "wait for TDM global reads")) # (no local write code. Global read wait for DirectToLds is already done) if not kernel["NoLdsWriteCode"]: waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, -1, 0, -1, "3wait for local write")) skipForceWaitcnt0 = False - if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"]: - # DTVA/B or DTLA/B case, skip generating force waitcnt0 + if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"] or \ + (kernel["enableTDMA"] and kernel["enableTDMB"]): + # DTVA/B, DTLA/B, or TDM case: global read wait is handled above, skip force waitcnt0 skipForceWaitcnt0 = True syncCode.add(self._syncThreads(kernel, "PGR, and wait until LW done to sync LDS%u"%(self.states.ldsBarrierTokenIdx), skipForceWaitcnt0=skipForceWaitcnt0, memoryToken=self.states.ldsBarrierTokenIdx)) # swap barrier token From 7facbccc64d353c954790541cb5d4f308025a1a5 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Sun, 26 Apr 2026 19:09:07 +0000 Subject: [PATCH 10/17] StreamK: fix gfx1250 flag and partial ordering Use scoped VMEM flag polling with release/acquire cache operations so StreamK partials are visible before consumers proceed on gfx1250. --- .../tensilelite/Tensile/Components/StreamK.py | 103 ++++++++++++++++-- 1 file changed, 95 insertions(+), 8 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index d7e759d9f7c..79622f9163f 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -24,7 +24,7 @@ from rocisa.code import Module, Label from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\ VOP3PModifiers, ContinuousRegister -from rocisa.instruction import SAddCU32, SAddU32, SAndB32, SBarrier, \ +from rocisa.instruction import MacroInstruction, SAddCU32, SAddU32, SAndB32, SBarrier, \ SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \ SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \ SMaxI32, SMinU32, SMovB32, SMovB64, SMulI32, SNop, SOrB32, SSleep, SStoreB32, SSubU32, \ @@ -139,6 +139,57 @@ def shiftSrd(self, writer, srdIdx) -> Module: return module + # gfx1250 StreamK flag/partial ordering helpers. No-op on other arches. + # Predicate matches shiftSrd above. + def _isGfx1250(self, writer) -> bool: + return writer.states.version[:2] == (12, 5) + + def _waitXcnt0(self, writer, comment: str = "") -> Module: + module = Module("gfx1250 s_wait_xcnt 0") + if self._isGfx1250(writer): + module.add(MacroInstruction(name="s_wait_xcnt 0", args=[], comment=comment)) + return module + + def _globalWbDev(self, writer, comment: str = "") -> Module: + module = Module("gfx1250 global_wb") + if self._isGfx1250(writer): + module.add(MacroInstruction(name="global_wb scope:SCOPE_DEV", args=[], comment=comment)) + return module + + def _globalInvDev(self, writer, comment: str = "") -> Module: + module = Module("gfx1250 global_inv") + if self._isGfx1250(writer): + module.add(MacroInstruction(name="global_inv scope:SCOPE_DEV", args=[], comment=comment)) + return module + + def _waitLoadcntForRelease(self, writer) -> Module: + # Drain in-flight VMEM loads before the release writeback. + module = Module("gfx1250 s_wait_loadcnt 0 (release)") + if self._isGfx1250(writer): + module.add(SWaitCnt(vlcnt=0, comment="release: drain in-flight loads before global_wb")) + return module + + def _waitGlobalWbForRelease(self, writer) -> Module: + module = Module("gfx1250 wait global_wb") + if self._isGfx1250(writer): + # gfx1250 has split VMEM counters. Drain both sides after global_wb + # before publishing the StreamK flag. + module.add(SWaitCnt(vlcnt=0, vscnt=0, comment="release: wait for global_wb")) + return module + + def _readFlagValue(self, writer, dst, soffset) -> Module: + module = Module("Read StreamK flag") + if self._isGfx1250(writer): + flagVgpr = writer.vgprPool.checkOut(1, "flagAcq") + module.add(self.getFlagValue(writer, dst=vgpr(flagVgpr), soffset=soffset, comment="acquire: get flag (VMEM)")) + module.add(SWaitCnt(vlcnt=0, comment="acquire: wait VMEM flag load")) + module.add(VReadfirstlaneB32(dst=sgpr(dst), src=vgpr(flagVgpr), comment="move VMEM flag to SGPR for compare")) + writer.vgprPool.checkIn(flagVgpr) + else: + module.add(SLoadB32(dst=sgpr(dst), base=sgpr("AddressFlags", 2), soffset=soffset, smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) + module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) + return module + def _skv(self, writer, name): """Return the VGPR index holding a StreamK constant.""" return writer.states.skConstVgprs[name] @@ -593,11 +644,13 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index")) module.add(skFixupWaitForFlag) # loop to wait for flag - module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) - module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) + module.add(self._readFlagValue(writer, dst=tmpSgpr+1, soffset=sgpr(tmpSgpr))) if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written module.add(SCmpEQU32(src0=sgpr(tmpSgpr+1), src1=1, comment="check if ready")) module.add(SCBranchSCC0(labelName=skFixupWaitForFlag.getLabelName(), comment="if flag not set, wait and check again")) + if self._isGfx1250(writer): + module.add(self._globalInvDev(writer, comment="acquire: invalidate partials after flag")) + module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv")) module.add(SBarrier(comment="wait for all workgroups before resetting flag")) skipFlagReset = Label(label=writer.labels.getNameInc("SK_SkipFlagReset"), comment="") @@ -681,11 +734,13 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele # Check flag module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index")) - module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) - module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) + module.add(self._readFlagValue(writer, dst=tmpSgpr+2, soffset=sgpr(tmpSgpr))) if kernel["DebugStreamK"] & 2 == 0: module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=1, comment="check if ready")) module.add(SCBranchSCC0(labelName=skFixupLabel.getLabelName(), comment="if flag not set, wait and check again")) + if self._isGfx1250(writer): + module.add(self._globalInvDev(writer, comment="acquire: invalidate partials after flag")) + module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv")) # TODO Barrier here to sync all threads in workgroup, but maybe better to have separate flag for each wavefront (to be tested) module.add(SBarrier(comment="wait for all workgroups before resetting flag")) @@ -1009,7 +1064,10 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, # kStr += PreLoopVmcntCaseStr # Set flag + module.add(self._waitLoadcntForRelease(writer)) module.add(SWaitCnt(vscnt=0, comment="wait for data store")) + module.add(self._globalWbDev(writer, comment="release: writeback partials to L2-coherent point")) + module.add(self._waitGlobalWbForRelease(writer)) module.add(SBarrier(comment="store all data before setting flag")) sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx") if writer.isStreamKConstantsToVgprEnabled(kernel): @@ -1047,15 +1105,44 @@ def setFlagValue(self, writer, src, soffset, comment=""): module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) module.add(self.shiftSrd(writer, tmpSgprBuffer)) - module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ - mubuf=MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \ - comment=comment)) + # gfx1250: drain in-flight XNACK retries before the flag store. + module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM store")) + if self._isGfx1250(writer): + module.add(MacroInstruction( + name=f"buffer_store_b32 {src}, {vgpr(tmpVgprOff)}, {sgpr(tmpSgprBuffer, 4)}, {soffset} offen offset:0 scope:SCOPE_DEV", + args=[], + comment=comment)) + else: + module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ + mubuf=MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \ + comment=comment)) module.add(SWaitCnt(vscnt=0, comment="wait for data store")) writer.vgprPool.checkIn(tmpVgprOff) writer.sgprPool.checkIn(tmpSgprBuffer) return module + def getFlagValue(self, writer, dst, soffset, comment=""): + """gfx1250 acquire-side VMEM load of the StreamK flag.""" + module = Module("Buffer Load Flag Value") + tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False) + tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off") + module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset")) + module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2))) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) + module.add(self.shiftSrd(writer, tmpSgprBuffer)) + # gfx1250: drain in-flight XNACK retries before the flag load. + module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM load")) + module.add(MacroInstruction( + name=f"buffer_load_b32 {dst}, {vgpr(tmpVgprOff)}, {sgpr(tmpSgprBuffer, 4)}, {soffset} offen offset:0 scope:SCOPE_DEV", + args=[], + comment=comment)) + writer.vgprPool.checkIn(tmpVgprOff) + writer.sgprPool.checkIn(tmpSgprBuffer) + + return module + def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \ batchElements, addrD, addrC, \ tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead): From 064fe03866c668afb582aa2b220cec817f1290d9 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Sun, 26 Apr 2026 19:17:57 +0000 Subject: [PATCH 11/17] Remove CrossLaneWait for 1250 --- .../tensilelite/rocisa/rocisa/include/hardware_caps.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index a56af7d3a98..ecf3339e249 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -488,7 +488,7 @@ inline std::map initArchCaps(const IsaVersion& isaVersion) rv["HasSchedMode"] = checkInList(isaVersion[0], {}); //TODO: https://github.com/ROCm/rocm-libraries/issues/3211 rv["HasAccCD"] = checkInList(isaVersion, {{9, 0, 10}, {9, 4, 2}, {9, 5, 0}}); rv["ArchAccUnifiedRegs"] = checkInList(isaVersion, {{9, 0, 10}, {9, 4, 2}, {9, 5, 0}}); - rv["CrosslaneWait"] = checkInList(isaVersion, {{9, 4, 2}, {9, 5, 0}, {12, 5, 0}}); + rv["CrosslaneWait"] = checkInList(isaVersion, {{9, 4, 2}, {9, 5, 0}}); rv["TransOpWait"] = checkInList(isaVersion, {{9, 4, 2}, {9, 5, 0}, {12, 5, 0}}); rv["SDWAWait"] = checkInList(isaVersion, {{9, 4, 2}, {9, 5, 0}, {12, 5, 0}}); rv["VgprBank"] = checkInList(isaVersion[0], {10, 11, 12}); From a8eaba85a7651a4b0c0f5df01fb02ceb34a7360d Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Sun, 26 Apr 2026 20:37:28 +0000 Subject: [PATCH 12/17] Set Scope flags via MUBUFModifiers instead of inline asm --- .../tensilelite/Tensile/Components/StreamK.py | 20 ++++++++----------- .../rocisa/rocisa/include/container.hpp | 7 ++++--- .../rocisa/rocisa/include/hardware_caps.hpp | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 79622f9163f..48e79efe034 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -29,7 +29,7 @@ SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \ SMaxI32, SMinU32, SMovB32, SMovB64, SMulI32, SNop, SOrB32, SSleep, SStoreB32, SSubU32, \ SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \ - VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32 + VReadfirstlaneB32, VCvtBF16toFP32, BufferLoadB32, BufferStoreB32 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \ vectorStaticMultiply, BranchIfNotZero, scalarUInt24DivideAndRemainder, scalarUInt32DivideAndRemainder @@ -1108,14 +1108,11 @@ def setFlagValue(self, writer, src, soffset, comment=""): # gfx1250: drain in-flight XNACK retries before the flag store. module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM store")) if self._isGfx1250(writer): - module.add(MacroInstruction( - name=f"buffer_store_b32 {src}, {vgpr(tmpVgprOff)}, {sgpr(tmpSgprBuffer, 4)}, {soffset} offen offset:0 scope:SCOPE_DEV", - args=[], - comment=comment)) + mubuf = MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV) else: - module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ - mubuf=MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \ - comment=comment)) + mubuf = MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV) + module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ + mubuf=mubuf, comment=comment)) module.add(SWaitCnt(vscnt=0, comment="wait for data store")) writer.vgprPool.checkIn(tmpVgprOff) writer.sgprPool.checkIn(tmpSgprBuffer) @@ -1134,10 +1131,9 @@ def getFlagValue(self, writer, dst, soffset, comment=""): module.add(self.shiftSrd(writer, tmpSgprBuffer)) # gfx1250: drain in-flight XNACK retries before the flag load. module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM load")) - module.add(MacroInstruction( - name=f"buffer_load_b32 {dst}, {vgpr(tmpVgprOff)}, {sgpr(tmpSgprBuffer, 4)}, {soffset} offen offset:0 scope:SCOPE_DEV", - args=[], - comment=comment)) + module.add(BufferLoadB32(dst=dst, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, + mubuf=MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV), + comment=comment)) writer.vgprPool.checkIn(tmpVgprOff) writer.sgprPool.checkIn(tmpSgprBuffer) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp index 0b153dffd71..b7bc9f650dd 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp @@ -255,9 +255,10 @@ namespace rocisa std::string toString() const override { - auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; - auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; - auto hasNTModifier = rocIsa::getInstance().getAsmCaps()["HasNTModifier"]; + auto asmCaps = rocIsa::getInstance().getAsmCaps(); + auto hasDLCModifier = asmCaps["HasDLCModifier"]; + auto hasSCOPEModifier = asmCaps["HasSCOPEModifier"]; + auto hasNTModifier = asmCaps["HasNTModifier"]; std::string kStr; if(offen) { diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index ecf3339e249..eb58437df98 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -387,7 +387,7 @@ inline std::map || tryAssembler( isaVersion, assemblerPath, - "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, scope:SCOPE_DEV", + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null offen offset:0, scope:SCOPE_DEV", isDebug); rv["HasMUBUFConst"] = tryAssembler(isaVersion, assemblerPath, From a8a265daf58c0bf2c2dd8375a3b878873722ab5b Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Sun, 26 Apr 2026 21:25:36 +0000 Subject: [PATCH 13/17] StreamK: use 1LDSBuffer for MX quick tests Match the MXFP4 and MXFP8 StreamK quick coverage to the validated one-LDS-buffer configuration so SubIter is disabled naturally by existing solution selection instead of a StreamK-specific code guard. --- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml index 5028c3f2f10..6f8ae16ba8b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml @@ -209,7 +209,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [0] + - 1LDSBuffer: [1] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml index 9f72faa73bf..1cd8d3f3856 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml @@ -213,7 +213,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [0] + - 1LDSBuffer: [1] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] From 2883bbfeed91bb21670b3f33999e9880a75c5aa6 Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Mon, 27 Apr 2026 15:32:24 +0000 Subject: [PATCH 14/17] Re-enable SK tests on GFX1250 --- .../Tensile/Tests/common/streamk/gfx1250/sk_f8gemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_hgemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_tdm.yaml | 2 +- .../Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml | 2 +- .../Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml | 2 +- .../Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_tdm.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_tdm.yaml | 2 +- .../Tensile/Tests/common/streamk/gfx1250/sk_sgemm_quick.yaml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_f8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_f8gemm_quick.yaml index 7b84d522e03..fe7c661e964 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_f8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_f8gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_hgemm_quick.yaml index 2f362c46d2c..29d90257383 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_hgemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_hgemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml index 6f8ae16ba8b..6f2b743c192 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 BoundsCheck: False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_tdm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_tdm.yaml index 36e592f0de6..a23bd2d6d87 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_tdm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_tdm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 BoundsCheck: False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml index ac5f1d325be..5e7e1a799ab 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml index 56f431556fc..7acd6c0c8c3 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml index 1bdb04a625f..85c4a0a4f9d 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_tdm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_tdm.yaml index b935eef9500..ecff80e23de 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_tdm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_tdm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 BoundsCheck: False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml index 1cd8d3f3856..b20058c5cc8 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 BoundsCheck: False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_tdm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_tdm.yaml index e7ab5f70755..71f9a8a2b57 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_tdm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_tdm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 BoundsCheck: False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_sgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_sgemm_quick.yaml index c0de7411367..1ae42b52efe 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_sgemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_sgemm_quick.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch GlobalParameters: NumElementsToValidate: -1 From 5f12b69983d0f763566540cd176038ebbfe6a2d8 Mon Sep 17 00:00:00 2001 From: "Joao P. L. de Carvalho" Date: Mon, 27 Apr 2026 16:42:53 +0000 Subject: [PATCH 15/17] Fix gfx1250 MX SubIter LDS waits Emit dependent LDS waits before MX SubIter WMMAs and cover LDSB0 in gfx1250 StreamK quick tests. --- projects/hipblaslt/tensilelite/Tensile/KernelWriter.py | 7 ++++++- .../Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml | 2 +- .../Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml | 8 ++++---- .../Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml | 8 ++++---- .../Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml | 8 ++++---- .../Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml | 8 ++++---- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index 35bc1678ffa..672e5de29f5 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -1809,7 +1809,12 @@ def calculateRangeAndUpdateCounter(itemCounter, writeCounters, length): #### # scheduled wait localReads #### - if self.states.numItersPLR == 0 and kernel["EnableMatrixInstruction"] and self.do["OptimizeNumItersPLR0"] and not scheduleTF32Emu: + subIterMxNeedsDependentLrWait = ( + kernel["ForceUnrollSubIter"] and self.states.version == (12,5,0) + and (kernel["ProblemType"]["MXBlockA"] or kernel["ProblemType"]["MXBlockB"]) + ) + if ((self.states.numItersPLR == 0 and self.do["OptimizeNumItersPLR0"]) or subIterMxNeedsDependentLrWait) \ + and kernel["EnableMatrixInstruction"] and not scheduleTF32Emu: dscnt = -1 mfmas = getMFMAs(macIterCode) ## To support do["MAC"] is False diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml index 6f2b743c192..28cea70f7cf 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf4gemm_quick.yaml @@ -209,7 +209,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml index 5e7e1a799ab..8eaddd26b05 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf6b8gemm_quick.yaml @@ -83,7 +83,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -157,7 +157,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -231,7 +231,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -305,7 +305,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml index 7acd6c0c8c3..e0da4410c66 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8b6gemm_quick.yaml @@ -83,7 +83,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -157,7 +157,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -231,7 +231,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -305,7 +305,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml index 85c4a0a4f9d..e89b0aa5915 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8f4gemm_quick.yaml @@ -86,7 +86,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -162,7 +162,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -238,7 +238,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -314,7 +314,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml index b20058c5cc8..d61334e85f0 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx1250/sk_mxf8gemm_quick.yaml @@ -69,7 +69,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -140,7 +140,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -213,7 +213,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] @@ -284,7 +284,7 @@ BenchmarkProblems: - VectorWidthA: [-1] - VectorWidthB: [-1] - LocalReadVectorWidth: [-1] - - 1LDSBuffer: [1] + - 1LDSBuffer: [1, 0] - DirectToVgprSparseMetadata: [0] - StoreVectorWidth: [-1] - StreamK: [3] From e3d54d9c0f87467f77339654cecd70433e2978c8 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Mon, 27 Apr 2026 16:13:29 -0600 Subject: [PATCH 16/17] Refactor StreamK gfx1250 fences into capability-driven Component Replace the `_isGfx1250` version check and the scattered fence/flag helpers with a `StreamKMemoryOrdering` Component selected on two new archCaps, `HasInvWbDevFences` and `RequiresXCntForVolatileVMEM`. --- .../tensilelite/Tensile/Components/StreamK.py | 233 ++++++++++++------ .../rocisa/rocisa/include/hardware_caps.hpp | 14 ++ 2 files changed, 172 insertions(+), 75 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 48e79efe034..eb7cf4acb2c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -107,6 +107,143 @@ def __call__(self, writer, kernel): return module +class StreamKMemoryOrdering(Component): + """ + Memory-ordering fences and flag accessors for the StreamK partial-tile + handshake. + + StreamK uses a producer/consumer protocol: one workgroup writes a partial + tile to a workspace and signals completion via a flag, and other + workgroups poll the flag and read the partials. The required cross-CU + memory ordering depends on the target ISA: + + - Most arches: `s_waitcnt vscnt(0)` before the flag store and an SMEM + flag load with `glc/dlc/scope:SCOPE_DEV` are sufficient because + ordering between L1/L2 and the device-scope coherence point is + implicit. + + - gfx1250: the L2 has independent partitions and SMEM is not coherent + with the VMEM flag store, so an explicit `global_wb scope:SCOPE_DEV` + is required on the release side and a `global_inv scope:SCOPE_DEV` + on the acquire side. Additionally, XNACK-replay can reorder a + volatile/atomic VMEM op past in-flight VMEM, so `s_wait_xcnt 0` must + precede such ops. The flag itself must be read via VMEM (not SMEM) + to observe the producer's release-side fence. + + Selection is driven by the `HasInvWbDevFences` arch capability. The + XNACK-replay drain in `preVolatileVmem` is gated separately on + `RequiresXCntForVolatileVMEM` and lives on the abstract base so a + future arch needing only one of the two can be supported by adding a + single capability flag. + """ + def __call__(self): + assert(0) + + def preVolatileVmem(self, writer, comment="") -> Module: + """Drain in-flight VMEM (XNACK-replay) before a volatile/atomic VMEM op. + + Required on arches with `RequiresXCntForVolatileVMEM`. No-op + elsewhere. + """ + module = Module("StreamK pre-volatile VMEM drain") + if writer.states.archCaps["RequiresXCntForVolatileVMEM"]: + module.add(MacroInstruction(name="s_wait_xcnt 0", args=[], comment=comment)) + return module + + @abc.abstractmethod + def releaseFence(self, writer) -> Module: + """Memory fence ordering prior partial-tile stores before the flag store.""" + pass + + @abc.abstractmethod + def acquireFence(self, writer) -> Module: + """Memory fence after observing the flag and before reading partials.""" + pass + + @abc.abstractmethod + def readFlag(self, writer, dst, soffset) -> Module: + """Read the StreamK completion flag into SGPR `dst` for compare.""" + pass + + @abc.abstractmethod + def flagBufferMubuf(self) -> MUBUFModifiers: + """MUBUF modifiers for buffer load/store of the flag word.""" + pass + + +class StreamKMemoryOrderingDefault(StreamKMemoryOrdering): + """No-op cross-CU fences; SMEM flag with glc/dlc/SCOPE_DEV. + + Used on every arch that does not require explicit cross-L2 fences. + """ + archCaps = {"HasInvWbDevFences": False} + + def releaseFence(self, writer) -> Module: + module = Module("StreamK release fence (default)") + module.add(SWaitCnt(vscnt=0, comment="wait for data store")) + return module + + def acquireFence(self, writer) -> Module: + return Module("StreamK acquire fence (default, no-op)") + + def readFlag(self, writer, dst, soffset) -> Module: + module = Module("StreamK read flag (SMEM)") + module.add(SLoadB32(dst=sgpr(dst), base=sgpr("AddressFlags", 2), + soffset=soffset, + smem=SMEMModifiers(glc=True, dlc=True, + scope=CacheScope.SCOPE_DEV), + comment="get flag")) + module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) + return module + + def flagBufferMubuf(self) -> MUBUFModifiers: + return MUBUFModifiers(offen=True, glc=True, dlc=True, + scope=CacheScope.SCOPE_DEV) + + +class StreamKMemoryOrderingDevScopeFences(StreamKMemoryOrdering): + """Explicit cross-L2 release/acquire fences via global_wb/global_inv + scope:SCOPE_DEV plus a VMEM-coherent flag read. + + Selected on arches whose L2 is partitioned across CUs/XCDs and whose + SMEM is not coherent with the VMEM flag write (e.g. gfx1250). + """ + archCaps = {"HasInvWbDevFences": True} + + def releaseFence(self, writer) -> Module: + module = Module("StreamK release fence (dev-scope)") + module.add(SWaitCnt(vlcnt=0, + comment="release: drain in-flight loads before global_wb")) + module.add(SWaitCnt(vscnt=0, comment="wait for data store")) + module.add(MacroInstruction(name="global_wb scope:SCOPE_DEV", args=[], + comment="release: writeback partials to L2-coherent point")) + module.add(SWaitCnt(vlcnt=0, vscnt=0, + comment="release: wait for global_wb")) + return module + + def acquireFence(self, writer) -> Module: + module = Module("StreamK acquire fence (dev-scope)") + module.add(MacroInstruction(name="global_inv scope:SCOPE_DEV", args=[], + comment="acquire: invalidate partials after flag")) + module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv")) + return module + + def readFlag(self, writer, dst, soffset) -> Module: + streamk = Component.StreamK.find(writer) + module = Module("StreamK read flag (VMEM)") + flagVgpr = writer.vgprPool.checkOut(1, "flagAcq") + module.add(streamk.getFlagValue(writer, dst=vgpr(flagVgpr), + soffset=soffset, comment="acquire: get flag (VMEM)")) + module.add(SWaitCnt(vlcnt=0, comment="acquire: wait VMEM flag load")) + module.add(VReadfirstlaneB32(dst=sgpr(dst), src=vgpr(flagVgpr), + comment="move VMEM flag to SGPR for compare")) + writer.vgprPool.checkIn(flagVgpr) + return module + + def flagBufferMubuf(self) -> MUBUFModifiers: + return MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV) + + class StreamK(Component): """ StreamK code. @@ -139,57 +276,6 @@ def shiftSrd(self, writer, srdIdx) -> Module: return module - # gfx1250 StreamK flag/partial ordering helpers. No-op on other arches. - # Predicate matches shiftSrd above. - def _isGfx1250(self, writer) -> bool: - return writer.states.version[:2] == (12, 5) - - def _waitXcnt0(self, writer, comment: str = "") -> Module: - module = Module("gfx1250 s_wait_xcnt 0") - if self._isGfx1250(writer): - module.add(MacroInstruction(name="s_wait_xcnt 0", args=[], comment=comment)) - return module - - def _globalWbDev(self, writer, comment: str = "") -> Module: - module = Module("gfx1250 global_wb") - if self._isGfx1250(writer): - module.add(MacroInstruction(name="global_wb scope:SCOPE_DEV", args=[], comment=comment)) - return module - - def _globalInvDev(self, writer, comment: str = "") -> Module: - module = Module("gfx1250 global_inv") - if self._isGfx1250(writer): - module.add(MacroInstruction(name="global_inv scope:SCOPE_DEV", args=[], comment=comment)) - return module - - def _waitLoadcntForRelease(self, writer) -> Module: - # Drain in-flight VMEM loads before the release writeback. - module = Module("gfx1250 s_wait_loadcnt 0 (release)") - if self._isGfx1250(writer): - module.add(SWaitCnt(vlcnt=0, comment="release: drain in-flight loads before global_wb")) - return module - - def _waitGlobalWbForRelease(self, writer) -> Module: - module = Module("gfx1250 wait global_wb") - if self._isGfx1250(writer): - # gfx1250 has split VMEM counters. Drain both sides after global_wb - # before publishing the StreamK flag. - module.add(SWaitCnt(vlcnt=0, vscnt=0, comment="release: wait for global_wb")) - return module - - def _readFlagValue(self, writer, dst, soffset) -> Module: - module = Module("Read StreamK flag") - if self._isGfx1250(writer): - flagVgpr = writer.vgprPool.checkOut(1, "flagAcq") - module.add(self.getFlagValue(writer, dst=vgpr(flagVgpr), soffset=soffset, comment="acquire: get flag (VMEM)")) - module.add(SWaitCnt(vlcnt=0, comment="acquire: wait VMEM flag load")) - module.add(VReadfirstlaneB32(dst=sgpr(dst), src=vgpr(flagVgpr), comment="move VMEM flag to SGPR for compare")) - writer.vgprPool.checkIn(flagVgpr) - else: - module.add(SLoadB32(dst=sgpr(dst), base=sgpr("AddressFlags", 2), soffset=soffset, smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) - module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) - return module - def _skv(self, writer, name): """Return the VGPR index holding a StreamK constant.""" return writer.states.skConstVgprs[name] @@ -523,6 +609,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele if kernel["StreamKAtomic"]: return module + memOrder = Component.StreamKMemoryOrdering.find(writer) skConstsInVgprs = writer.isStreamKConstantsToVgprEnabled(kernel) skStoreLabel = Label(label=writer.labels.getNameInc("SK_Store"), comment="") @@ -644,13 +731,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index")) module.add(skFixupWaitForFlag) # loop to wait for flag - module.add(self._readFlagValue(writer, dst=tmpSgpr+1, soffset=sgpr(tmpSgpr))) + module.add(memOrder.readFlag(writer, dst=tmpSgpr+1, soffset=sgpr(tmpSgpr))) if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written module.add(SCmpEQU32(src0=sgpr(tmpSgpr+1), src1=1, comment="check if ready")) module.add(SCBranchSCC0(labelName=skFixupWaitForFlag.getLabelName(), comment="if flag not set, wait and check again")) - if self._isGfx1250(writer): - module.add(self._globalInvDev(writer, comment="acquire: invalidate partials after flag")) - module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv")) + module.add(memOrder.acquireFence(writer)) module.add(SBarrier(comment="wait for all workgroups before resetting flag")) skipFlagReset = Label(label=writer.labels.getNameInc("SK_SkipFlagReset"), comment="") @@ -734,13 +819,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele # Check flag module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index")) - module.add(self._readFlagValue(writer, dst=tmpSgpr+2, soffset=sgpr(tmpSgpr))) + module.add(memOrder.readFlag(writer, dst=tmpSgpr+2, soffset=sgpr(tmpSgpr))) if kernel["DebugStreamK"] & 2 == 0: module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=1, comment="check if ready")) module.add(SCBranchSCC0(labelName=skFixupLabel.getLabelName(), comment="if flag not set, wait and check again")) - if self._isGfx1250(writer): - module.add(self._globalInvDev(writer, comment="acquire: invalidate partials after flag")) - module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv")) + module.add(memOrder.acquireFence(writer)) # TODO Barrier here to sync all threads in workgroup, but maybe better to have separate flag for each wavefront (to be tested) module.add(SBarrier(comment="wait for all workgroups before resetting flag")) @@ -859,6 +942,7 @@ def computeWorkspaceSrd(self, writer, kernel, sCtaIdx, tmpSgpr = None): def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, beta, edge, tmpVgpr, cvtVgprStruct, endLabel): module = Module("StreamK Common partialsWriteProcedure") + memOrder = Component.StreamKMemoryOrdering.find(writer) # PreLoopVmcntCaseStr = "" # # not generate Case 2 if StoreCInUnroll with StoreVectorWidth==1 (Case 2 will be same as Case 3) @@ -1064,10 +1148,7 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, # kStr += PreLoopVmcntCaseStr # Set flag - module.add(self._waitLoadcntForRelease(writer)) - module.add(SWaitCnt(vscnt=0, comment="wait for data store")) - module.add(self._globalWbDev(writer, comment="release: writeback partials to L2-coherent point")) - module.add(self._waitGlobalWbForRelease(writer)) + module.add(memOrder.releaseFence(writer)) module.add(SBarrier(comment="store all data before setting flag")) sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx") if writer.isStreamKConstantsToVgprEnabled(kernel): @@ -1098,6 +1179,7 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, def setFlagValue(self, writer, src, soffset, comment=""): module = Module("Buffer Store Flag Value") + memOrder = Component.StreamKMemoryOrdering.find(writer) tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False) tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off") module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset")) @@ -1105,14 +1187,9 @@ def setFlagValue(self, writer, src, soffset, comment=""): module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) module.add(self.shiftSrd(writer, tmpSgprBuffer)) - # gfx1250: drain in-flight XNACK retries before the flag store. - module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM store")) - if self._isGfx1250(writer): - mubuf = MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV) - else: - mubuf = MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV) - module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ - mubuf=mubuf, comment=comment)) + module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM store")) + module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, + mubuf=memOrder.flagBufferMubuf(), comment=comment)) module.add(SWaitCnt(vscnt=0, comment="wait for data store")) writer.vgprPool.checkIn(tmpVgprOff) writer.sgprPool.checkIn(tmpSgprBuffer) @@ -1120,8 +1197,15 @@ def setFlagValue(self, writer, src, soffset, comment=""): return module def getFlagValue(self, writer, dst, soffset, comment=""): - """gfx1250 acquire-side VMEM load of the StreamK flag.""" + """Buffer-load primitive for the StreamK flag. + + Used by `StreamKMemoryOrderingDevScopeFences.readFlag` to perform a + VMEM-coherent flag load. Default arches read the flag via SMEM + directly in `StreamKMemoryOrderingDefault.readFlag` and never call + this helper. + """ module = Module("Buffer Load Flag Value") + memOrder = Component.StreamKMemoryOrdering.find(writer) tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False) tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off") module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset")) @@ -1129,10 +1213,9 @@ def getFlagValue(self, writer, dst, soffset, comment=""): module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) module.add(self.shiftSrd(writer, tmpSgprBuffer)) - # gfx1250: drain in-flight XNACK retries before the flag load. - module.add(self._waitXcnt0(writer, comment="drain xnacks before volatile VMEM load")) + module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM load")) module.add(BufferLoadB32(dst=dst, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, - mubuf=MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV), + mubuf=memOrder.flagBufferMubuf(), comment=comment)) writer.vgprPool.checkIn(tmpVgprOff) writer.sgprPool.checkIn(tmpSgprBuffer) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index eb58437df98..c8cca798f6a 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -500,6 +500,20 @@ inline std::map initArchCaps(const IsaVersion& isaVersion) rv["HasWmmaArbStallBit"] = isaVersion[0] == 12 && isaVersion[1] == 5; rv["HasF32XEmulation"] = checkInList(isaVersion, {{9, 5, 0}, {12, 5, 0}}); + // Cross-CU/L2 release+acquire fences for device-scope inter-workgroup + // synchronization (e.g. StreamK partial-tile handshake). When set, a + // store-release sequence must emit `s_wait_loadcnt 0; s_wait_storecnt 0; + // global_wb scope:SCOPE_DEV` before the flag store, and a load-acquire + // sequence must emit `global_inv scope:SCOPE_DEV; s_wait_loadcnt 0` after + // the flag load. Mirrors LLVM's hasINVWBL2WaitCntRequirement(). + rv["HasInvWbDevFences"] = checkInList(isaVersion, {{12, 5, 0}}); + + // XNACK-replay drain. When set, in-flight VMEM ops can be replayed and + // therefore reorder w.r.t. a subsequent volatile/atomic VMEM. An + // `s_wait_xcnt 0` must precede the volatile/atomic VMEM op. Mirrors + // LLVM's requiresWaitXCntForSingleAccessInstructions(). + rv["RequiresXCntForVolatileVMEM"] = checkInList(isaVersion, {{12, 5, 0}}); + // Vector L1 Data cache line size (bytes) used for alignment-sensitive optimizations in codegen. // NOTE: This is a *codegen-time* (compile-time) constant selected by target ISA. // From 46e8c887ce193fe2fc56e4fbfdbf37d3e1b443d6 Mon Sep 17 00:00:00 2001 From: "Henderson, Nathan" Date: Mon, 27 Apr 2026 16:17:57 -0600 Subject: [PATCH 17/17] Fix comment --- .../tensilelite/rocisa/rocisa/include/hardware_caps.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index c8cca798f6a..b95bf831146 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -505,13 +505,12 @@ inline std::map initArchCaps(const IsaVersion& isaVersion) // store-release sequence must emit `s_wait_loadcnt 0; s_wait_storecnt 0; // global_wb scope:SCOPE_DEV` before the flag store, and a load-acquire // sequence must emit `global_inv scope:SCOPE_DEV; s_wait_loadcnt 0` after - // the flag load. Mirrors LLVM's hasINVWBL2WaitCntRequirement(). + // the flag load. rv["HasInvWbDevFences"] = checkInList(isaVersion, {{12, 5, 0}}); // XNACK-replay drain. When set, in-flight VMEM ops can be replayed and // therefore reorder w.r.t. a subsequent volatile/atomic VMEM. An - // `s_wait_xcnt 0` must precede the volatile/atomic VMEM op. Mirrors - // LLVM's requiresWaitXCntForSingleAccessInstructions(). + // `s_wait_xcnt 0` must precede the volatile/atomic VMEM op. rv["RequiresXCntForVolatileVMEM"] = checkInList(isaVersion, {{12, 5, 0}}); // Vector L1 Data cache line size (bytes) used for alignment-sensitive optimizations in codegen.