ROCm · ThanHenderson · May 2, 2026 · Apr 26, 2026 · Apr 27, 2026 · Apr 27, 2026
@@ -24,12 +24,12 @@
 from rocisa.code import Module, Label
 from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\
     VOP3PModifiers, ContinuousRegister
-from rocisa.instruction import SAddCU32, SAddU32, SAndB32, SBarrier, \
+from rocisa.instruction import MacroInstruction, SAddCU32, SAddU32, SAndB32, SBarrier, \
     SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \
     SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \
     SMaxI32, SMinU32, SMovB32, SMovB64, SMulI32, SNop, SOrB32, SSleep, SStoreB32, SSubU32, \
     SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \
-    VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32
+    VReadfirstlaneB32, VCvtBF16toFP32, BufferLoadB32, BufferStoreB32
 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \
     vectorStaticMultiply, BranchIfNotZero, scalarUInt24DivideAndRemainder, scalarUInt32DivideAndRemainder
 
@@ -107,6 +107,143 @@ def __call__(self, writer, kernel):
         return module
 
 
+class StreamKMemoryOrdering(Component):
+    """
+    Memory-ordering fences and flag accessors for the StreamK partial-tile
+    handshake.
+
+    StreamK uses a producer/consumer protocol: one workgroup writes a partial
+    tile to a workspace and signals completion via a flag, and other
+    workgroups poll the flag and read the partials. The required cross-CU
+    memory ordering depends on the target ISA:
+
+    - Most arches: `s_waitcnt vscnt(0)` before the flag store and an SMEM
+      flag load with `glc/dlc/scope:SCOPE_DEV` are sufficient because
+      ordering between L1/L2 and the device-scope coherence point is
+      implicit.
+
+    - gfx1250: the L2 has independent partitions and SMEM is not coherent
+      with the VMEM flag store, so an explicit `global_wb scope:SCOPE_DEV`
+      is required on the release side and a `global_inv scope:SCOPE_DEV`
+      on the acquire side. Additionally, XNACK-replay can reorder a
+      volatile/atomic VMEM op past in-flight VMEM, so `s_wait_xcnt 0` must
+      precede such ops. The flag itself must be read via VMEM (not SMEM)
+      to observe the producer's release-side fence.
+
+    Selection is driven by the `HasInvWbDevFences` arch capability. The
+    XNACK-replay drain in `preVolatileVmem` is gated separately on
+    `RequiresXCntForVolatileVMEM` and lives on the abstract base so a
+    future arch needing only one of the two can be supported by adding a
+    single capability flag.
+    """
+    def __call__(self):
+        assert(0)
+
+    def preVolatileVmem(self, writer, comment="") -> Module:
+        """Drain in-flight VMEM (XNACK-replay) before a volatile/atomic VMEM op.
+
+        Required on arches with `RequiresXCntForVolatileVMEM`. No-op
+        elsewhere.
+        """
+        module = Module("StreamK pre-volatile VMEM drain")
+        if writer.states.archCaps["RequiresXCntForVolatileVMEM"]:
+            module.add(MacroInstruction(name="s_wait_xcnt 0", args=[], comment=comment))
+        return module
+
+    @abc.abstractmethod
+    def releaseFence(self, writer) -> Module:
+        """Memory fence ordering prior partial-tile stores before the flag store."""
+        pass
+
+    @abc.abstractmethod
+    def acquireFence(self, writer) -> Module:
+        """Memory fence after observing the flag and before reading partials."""
+        pass
+
+    @abc.abstractmethod
+    def readFlag(self, writer, dst, soffset) -> Module:
+        """Read the StreamK completion flag into SGPR `dst` for compare."""
+        pass
+
+    @abc.abstractmethod
+    def flagBufferMubuf(self) -> MUBUFModifiers:
+        """MUBUF modifiers for buffer load/store of the flag word."""
+        pass
+
+
+class StreamKMemoryOrderingDefault(StreamKMemoryOrdering):
+    """No-op cross-CU fences; SMEM flag with glc/dlc/SCOPE_DEV.
+
+    Used on every arch that does not require explicit cross-L2 fences.
+    """
+    archCaps = {"HasInvWbDevFences": False}
+
+    def releaseFence(self, writer) -> Module:
+        module = Module("StreamK release fence (default)")
+        module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
+        return module
+
+    def acquireFence(self, writer) -> Module:
+        return Module("StreamK acquire fence (default, no-op)")
+
+    def readFlag(self, writer, dst, soffset) -> Module:
+        module = Module("StreamK read flag (SMEM)")
+        module.add(SLoadB32(dst=sgpr(dst), base=sgpr("AddressFlags", 2),
+                            soffset=soffset,
+                            smem=SMEMModifiers(glc=True, dlc=True,
+                                               scope=CacheScope.SCOPE_DEV),
+                            comment="get flag"))
+        module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
+        return module
+
+    def flagBufferMubuf(self) -> MUBUFModifiers:
+        return MUBUFModifiers(offen=True, glc=True, dlc=True,
+                              scope=CacheScope.SCOPE_DEV)
+
+
+class StreamKMemoryOrderingDevScopeFences(StreamKMemoryOrdering):
+    """Explicit cross-L2 release/acquire fences via global_wb/global_inv
+    scope:SCOPE_DEV plus a VMEM-coherent flag read.
+
+    Selected on arches whose L2 is partitioned across CUs/XCDs and whose
+    SMEM is not coherent with the VMEM flag write (e.g. gfx1250).
+    """
+    archCaps = {"HasInvWbDevFences": True}
+
+    def releaseFence(self, writer) -> Module:
+        module = Module("StreamK release fence (dev-scope)")
+        module.add(SWaitCnt(vlcnt=0,
+            comment="release: drain in-flight loads before global_wb"))
+        module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
+        module.add(MacroInstruction(name="global_wb scope:SCOPE_DEV", args=[],
+            comment="release: writeback partials to L2-coherent point"))
+        module.add(SWaitCnt(vlcnt=0, vscnt=0,
+            comment="release: wait for global_wb"))
+        return module
+
+    def acquireFence(self, writer) -> Module:
+        module = Module("StreamK acquire fence (dev-scope)")
+        module.add(MacroInstruction(name="global_inv scope:SCOPE_DEV", args=[],
+            comment="acquire: invalidate partials after flag"))
+        module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv"))
+        return module
+
+    def readFlag(self, writer, dst, soffset) -> Module:
+        streamk = Component.StreamK.find(writer)
+        module = Module("StreamK read flag (VMEM)")
+        flagVgpr = writer.vgprPool.checkOut(1, "flagAcq")
+        module.add(streamk.getFlagValue(writer, dst=vgpr(flagVgpr),
+            soffset=soffset, comment="acquire: get flag (VMEM)"))
+        module.add(SWaitCnt(vlcnt=0, comment="acquire: wait VMEM flag load"))
+        module.add(VReadfirstlaneB32(dst=sgpr(dst), src=vgpr(flagVgpr),
+            comment="move VMEM flag to SGPR for compare"))
+        writer.vgprPool.checkIn(flagVgpr)
+        return module
+
+    def flagBufferMubuf(self) -> MUBUFModifiers:
+        return MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV)
+
+
 class StreamK(Component):
     """
     StreamK code.
@@ -472,6 +609,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
         if kernel["StreamKAtomic"]:
             return module
 
+        memOrder = Component.StreamKMemoryOrdering.find(writer)
         skConstsInVgprs = writer.isStreamKConstantsToVgprEnabled(kernel)
         skStoreLabel = Label(label=writer.labels.getNameInc("SK_Store"), comment="")
 
@@ -593,11 +731,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
             module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index"))
 
             module.add(skFixupWaitForFlag) # loop to wait for flag
-            module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
-            module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
+            module.add(memOrder.readFlag(writer, dst=tmpSgpr+1, soffset=sgpr(tmpSgpr)))
             if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written
                 module.add(SCmpEQU32(src0=sgpr(tmpSgpr+1), src1=1, comment="check if ready"))
                 module.add(SCBranchSCC0(labelName=skFixupWaitForFlag.getLabelName(), comment="if flag not set, wait and check again"))
+                module.add(memOrder.acquireFence(writer))
 
             module.add(SBarrier(comment="wait for all workgroups before resetting flag"))
             skipFlagReset = Label(label=writer.labels.getNameInc("SK_SkipFlagReset"), comment="")
@@ -681,11 +819,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
 
                 # Check flag
                 module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index"))
-                module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
-                module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
+                module.add(memOrder.readFlag(writer, dst=tmpSgpr+2, soffset=sgpr(tmpSgpr)))
                 if kernel["DebugStreamK"] & 2 == 0:
                     module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=1, comment="check if ready"))
                     module.add(SCBranchSCC0(labelName=skFixupLabel.getLabelName(), comment="if flag not set, wait and check again"))
+                    module.add(memOrder.acquireFence(writer))
 
                 # TODO Barrier here to sync all threads in workgroup, but maybe better to have separate flag for each wavefront (to be tested)
                 module.add(SBarrier(comment="wait for all workgroups before resetting flag"))
@@ -804,6 +942,7 @@ def computeWorkspaceSrd(self, writer, kernel, sCtaIdx, tmpSgpr = None):
 
     def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, beta, edge, tmpVgpr, cvtVgprStruct, endLabel):
         module = Module("StreamK Common partialsWriteProcedure")
+        memOrder = Component.StreamKMemoryOrdering.find(writer)
 
         # PreLoopVmcntCaseStr = ""
         # # not generate Case 2 if StoreCInUnroll with StoreVectorWidth==1 (Case 2 will be same as Case 3)
@@ -1009,7 +1148,7 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
             #     kStr += PreLoopVmcntCaseStr
 
             # Set flag
-            module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
+            module.add(memOrder.releaseFence(writer))
             module.add(SBarrier(comment="store all data before setting flag"))
             sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx")
             if writer.isStreamKConstantsToVgprEnabled(kernel):
@@ -1040,22 +1179,49 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
 
     def setFlagValue(self, writer, src, soffset, comment=""):
         module = Module("Buffer Store Flag Value")
+        memOrder = Component.StreamKMemoryOrdering.find(writer)
         tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
         tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off")
         module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset"))
         module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
         module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
         module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
         module.add(self.shiftSrd(writer, tmpSgprBuffer))
-        module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \
-                                  mubuf=MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \
-                                  comment=comment))
+        module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM store"))
+        module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset,
+                                  mubuf=memOrder.flagBufferMubuf(), comment=comment))
         module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
         writer.vgprPool.checkIn(tmpVgprOff)
         writer.sgprPool.checkIn(tmpSgprBuffer)
 
         return module
 
+    def getFlagValue(self, writer, dst, soffset, comment=""):
+        """Buffer-load primitive for the StreamK flag.
+
+        Used by `StreamKMemoryOrderingDevScopeFences.readFlag` to perform a
+        VMEM-coherent flag load. Default arches read the flag via SMEM
+        directly in `StreamKMemoryOrderingDefault.readFlag` and never call
+        this helper.
+        """
+        module = Module("Buffer Load Flag Value")
+        memOrder = Component.StreamKMemoryOrdering.find(writer)
+        tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
+        tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off")
+        module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset"))
+        module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
+        module.add(self.shiftSrd(writer, tmpSgprBuffer))
+        module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM load"))
+        module.add(BufferLoadB32(dst=dst, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset,
+                                 mubuf=memOrder.flagBufferMubuf(),
+                                 comment=comment))
+        writer.vgprPool.checkIn(tmpVgprOff)
+        writer.sgprPool.checkIn(tmpSgprBuffer)
+
+        return module
+
     def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \
             batchElements, addrD, addrC, \
             tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead):

@@ -1809,7 +1809,12 @@ def calculateRangeAndUpdateCounter(itemCounter, writeCounters, length):
         ####
         # scheduled wait localReads
         ####
-        if self.states.numItersPLR == 0 and kernel["EnableMatrixInstruction"] and self.do["OptimizeNumItersPLR0"] and not scheduleTF32Emu:
+        subIterMxNeedsDependentLrWait = (
+          kernel["ForceUnrollSubIter"] and self.states.version == (12,5,0)
+          and (kernel["ProblemType"]["MXBlockA"] or kernel["ProblemType"]["MXBlockB"])
+        )
+        if ((self.states.numItersPLR == 0 and self.do["OptimizeNumItersPLR0"]) or subIterMxNeedsDependentLrWait) \
+            and kernel["EnableMatrixInstruction"] and not scheduleTF32Emu:
           dscnt = -1
           mfmas = getMFMAs(macIterCode)
           ## To support do["MAC"] is False
@@ -3942,12 +3947,19 @@ def _loopBody( self, kernel, tensorParametersA, tensorParametersB, pack, packPre
             vlcntVal = kernel["PrefetchGlobalRead"] - 1 if kernel["PrefetchGlobalRead"] >= 2 else 0
             waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, vlcntVal, -1, -1, \
                                       "wait for previous set of global reads"))
+          elif kernel["enableTDMA"] and kernel["enableTDMB"]:
+            # TDM case: tensor_load_to_lds instructions (issued in prior iter) write to LDS via the
+            # tensor counter. A s_wait_tensorcnt 0 is required before the barrier to guarantee all
+            # TDM stores to LDS have landed before other waves read from that LDS buffer.
+            waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, 0, -1, -1, \
+                                      "wait for TDM global reads"))
           # (no local write code. Global read wait for DirectToLds is already done)
           if not kernel["NoLdsWriteCode"]:
             waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, -1, 0, -1, "3wait for local write"))
           skipForceWaitcnt0 = False
-          if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"]:
-            # DTVA/B or DTLA/B case, skip generating force waitcnt0
+          if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"] or \
+             (kernel["enableTDMA"] and kernel["enableTDMB"]):
+            # DTVA/B, DTLA/B, or TDM case: global read wait is handled above, skip force waitcnt0
             skipForceWaitcnt0 = True
           syncCode.add(self._syncThreads(kernel, "PGR, and wait until LW done to sync LDS%u"%(self.states.ldsBarrierTokenIdx), skipForceWaitcnt0=skipForceWaitcnt0, memoryToken=[self.states.ldsBarrierTokenIdx]))
 

@@ -1,5 +1,5 @@
 TestParameters:
-  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
 
 GlobalParameters:
   NumElementsToValidate: -1

@@ -1,5 +1,5 @@
 TestParameters:
-  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
 
 GlobalParameters:
   NumElementsToValidate: -1

@@ -1,5 +1,5 @@
 TestParameters:
-  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
 GlobalParameters:
   NumElementsToValidate: -1
   BoundsCheck: False
@@ -209,7 +209,7 @@ BenchmarkProblems:
         - VectorWidthA: [-1]
         - VectorWidthB: [-1]
         - LocalReadVectorWidth: [-1]
-        - 1LDSBuffer: [0]
+        - 1LDSBuffer: [1, 0]
         - DirectToVgprSparseMetadata: [0]
         - StoreVectorWidth: [-1]
         - StreamK: [3]

@@ -1,5 +1,5 @@
 TestParameters:
-  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
 GlobalParameters:
   NumElementsToValidate: -1
   BoundsCheck: False