diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
index 5d55cbdebda..255a41aeb16 100644
--- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
+++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
@@ -20,15 +20,16 @@
 # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ################################################################################
 
+from rocisa.enum import CacheScope
 from rocisa.code import Module, Label
-from rocisa.container import vgpr, sgpr, SMEMModifiers, replaceHolder, EXEC,\
+from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\
     VOP3PModifiers, ContinuousRegister
 from rocisa.instruction import SAddCU32, SAddI32, SAddU32, SAndB32, SBarrier, \
     SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \
     SCmpGtU32, SCmpLeU32, SCmpLtU32, SCmpGeU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \
     SMinU32, SMovB32, SMovB64, SMulI32, SNop, SSleep, SStoreB32, SSubU32, \
     SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \
-    VReadfirstlaneB32, VCvtBF16toFP32
+    VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32
 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \
     vectorStaticMultiply, BranchIfNotZero, scalarUInt32DivideAndRemainder
 
@@ -228,7 +229,7 @@ def computeStoreSrdStartCommon(self, writer, kernel):
                 tmpSgpr0   = tmpSgprInfo.idx+1
                 tmpSgpr1   = tmpSgprInfo.idx+2
                 tmpSgpr2   = tmpSgprInfo.idx+0
-                tmpSgpr3   = tmpSgprInfo.idx+3                
+                tmpSgpr3   = tmpSgprInfo.idx+3
             module.addComment("Split Output Buffer offset: Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK * SplitIdx * bpe%s")
             # PartialIdx was saved in sgprBeta for re-use
             module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr0), sgpr(tmpSgpr1), sgpr("SizesFree+0"), sgpr("SkPartialIdx"), comment="Free0"))
@@ -427,7 +428,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
             module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index"))
 
             module.add(skFixupWaitForFlag) # loop to wait for flag
-            module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag"))
+            module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
 
             module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
             if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written
@@ -489,7 +490,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
 
                 # Check flag
                 module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index"))
-                module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag"))
+                module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
 
                 module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
                 if kernel["DebugStreamK"] & 2 == 0:
@@ -502,8 +503,12 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
                 module.add(VReadfirstlaneB32(dst=sgpr(tmpSgpr+2), src=vgpr("Serial"), comment="Wave 0 updates flags"))
                 module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=0, comment="Check for wave 0"))
                 module.add(SCBranchSCC0(labelName=skipFlagReset.getLabelName(), comment="Skip flag reset"))
-                # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag
-                module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag"))
+                if writer.states.asmCaps["HasScalarStore"]:
+                    # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag
+                    module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag"))
+                else:
+                    module.add(VMovB32(dst=vgpr(tmpVgpr), src=0, comment="move 0 to tmpVgpr"))
+                    module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="reset flag"))
                 module.add(skipFlagReset)
                 writer.sgprPool.checkIn(tmpSgpr)
 
@@ -806,8 +811,12 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
                 module.add(VReadfirstlaneB32(dst=sgpr(flagSgpr), src=vgpr("Serial"), comment="Wave 0 updates flags"))
                 module.add(SCmpEQU32(src0=sgpr(flagSgpr), src1=0, comment="Check for wave 0"))
                 module.add(SCBranchSCC0(labelName=skipFlagSet.getLabelName(), comment="Skip flag set"))
-                module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data"))
-                module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag"))
+                if writer.states.asmCaps["HasScalarStore"]:
+                    module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data"))
+                    module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag"))
+                else:
+                    module.add(VMovB32(dst=vgpr(tmpVgpr), src=1, comment="move 1 to tmpVgpr"))
+                    module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="set flag"))
                 module.add(skipFlagSet)
             module.add(SWaitCnt(kmcnt=0, comment="wait for flag")) # TODO just for testing
 
@@ -818,6 +827,20 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
 
         return module
 
+    def setFlagValue(self, writer, src, soffset, comment=""):
+        module = Module("Buffer Store Flag Value")
+        tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
+        module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
+        module.add(BufferStoreB32(src=src, vaddr=vgpr("off", isOff=True), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \
+                                  mubuf=MUBUFModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \
+                                  comment=comment))
+        module.add(SWaitCnt(vscnt=0, comment="wait for data store")) #TODO: See if this wait is necessery
+        writer.sgprPool.checkIn(tmpSgprBuffer)
+
+        return module
+
     def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \
             batchElements, addrD, addrC, \
             tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead):
@@ -1753,6 +1776,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Basic SK
         module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr("StreamKIdx"), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration"))
@@ -1831,6 +1860,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Two-tile SK (SK first)
         # iter count after all extra iters have been distributed
@@ -1950,6 +1985,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Two-tile SK (DP first)
         # Do DP tiles before SK
@@ -1979,13 +2020,13 @@ def preLoop(self, writer, kernel):
         # if (partialIdx < extraIters) then (skIter = partialIdx * (itersPerWG + 1)) else (skIter = partialIdx * itersPerWG + extraIters)
         skHasExtraLabel = Label("SK_HasExtra", "")
         skDoneExtraLabel = Label("SK_DoneExtra", "")
-        
+
         # PartialIdx = itersPerTile % skSplit (skSplit is passed as SkSplit)
         # extraIters = ItersPerTile - SkSplit * skItersPerWG
         sSkExtraIters = writer.sgprPool.checkOut(1, "extraIters")
         module.add(SMulI32(dst=sgpr(sSkExtraIters), src0=sgpr("SkSplit"), src1=sgpr("SKItersPerWG")))
         module.add(SSubU32(dst=sgpr(sSkExtraIters), src0=sgpr("ItersPerTile"), src1=sgpr(sSkExtraIters), comment="extraIters = itersPerTile - SkSplit * skItersPerWG"))
-        
+
         module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr(stmpPartialIdx), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration (case: after extra iters)"))
         module.add(SCmpLtU32(src0=sgpr(stmpPartialIdx), src1=sgpr(sSkExtraIters), comment="Check if WG gets an extra iteration"))
         module.add(SCBranchSCC1(labelName=skHasExtraLabel.getLabelName(), comment="Has extra iter"))
@@ -2007,7 +2048,7 @@ def preLoop(self, writer, kernel):
         module.add(SMovB32(dst=sgpr("SkPartialIdx"), src=sgpr(stmpPartialIdx), comment="Save partial idx for SrdD calculation"))
         # Done init
         module.add(SBranch(labelName=skInitDone.getLabelName(), comment="Done init for parallel reduction"))
-        
+
         # # Save PratialIdx for later, skExtraIters is unused for partial reduction
         # module.add(SMovB32(dst=sgpr("skExtraIters"), src=sgpr(stmpPartialIdx), comment="Save partial idx for SrdD calculation"))
         # # StreamKIter = tile * itersPerTile + itersPerWG * partialIndex
diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
index fef11d022f1..9fd8cc1c730 100644
--- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
+++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
@@ -38,7 +38,7 @@
                         scalarStaticRemainder, scalarUInt32DivideAndRemainder, sMagicDiv, vectorStaticMultiply, \
                         vectorStaticMultiplyAdd, scalarStaticMultiply64, BranchIfZero, BranchIfNotZero, DSInit, \
                         ArgumentLoader
-from rocisa.enum import InstType, SelectBit
+from rocisa.enum import InstType, SelectBit, CacheScope
 from rocisa.macro import MacroVMagicDiv, PseudoRandomGenerator
 from . import CUSTOM_KERNEL_PATH
 from rocisa.instruction import BranchInstruction, BufferLoadB128, BufferLoadB32, \
@@ -12295,7 +12295,9 @@ def bufferLoadImpl(soffset):
   ##############################################################################
   def chooseGlobalWrite(self, useBuffer, bps, srcVgpr, rpv, \
                         addr0, addr1, offset, soffset=0, \
-                        glc=False, slc=False, nt=False, hi16=0, comment="store"):
+                        glc=False, slc=False, nt=False, dlc=False, \
+                        scope=CacheScope.SCOPE_NONE, \
+                        hi16=0, comment="store"):
     """
     create the store instruction for requested vector width and other parms
     rpv = regs per vector
@@ -12338,7 +12340,7 @@ def bufferStoreImpl(tmpSgpr, mubuf):
           if offset2 >= 4096:
             module.add(SMovB32(dst=tmpSgpr, src=offset2, comment="large offset"))
             offset2 = 0
-          mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, nt=nt, isStore=True)
+          mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
           vgprOff = int(srcVgpr + shiftRpv * i) if isinstance(srcVgpr, int) else f"{srcVgpr}+{int(shiftRpv * i)}"
           module.add(BufferStoreB128(src=vgpr(vgprOff, shiftRpv), vaddr=addr0, \
                 saddr=addr1, soffset=tmpSgpr, mubuf=mubuf2, comment=comment))
@@ -12346,7 +12348,7 @@ def bufferStoreImpl(tmpSgpr, mubuf):
         assert 0, "bad bps"
 
     if useBuffer:
-      mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, nt=nt, isStore=True)
+      mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
       if soffset != 0:
         assert offset < 4096, "sgpr offset provided with large const offset"
       # buffer_load offset field is 12-bit.
@@ -12357,13 +12359,13 @@ def bufferStoreImpl(tmpSgpr, mubuf):
           tmpSgpr = sgpr(tmpSgprInfo.idx)
           if offset >= 4096:
             module.add(SMovB32(dst=tmpSgpr, src=offset, comment="large offset"))
-            mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, nt=nt, isStore=True)
+            mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
           bufferStoreImpl(tmpSgpr, mubuf)
       else:
         bufferStoreImpl(soffset, mubuf)
 
     else:
-      flat = FLATModifiers(glc=glc, slc=slc, isStore=True)
+      flat = FLATModifiers(glc=glc, slc=slc, dlc=dlc, scope=scope, isStore=True)
       if bps==2 and hi16:
         module.add(FlatStoreD16HIB16(vaddr=addr0, src=vgpr(srcVgpr*2), flat=flat, comment=comment))
       elif bps==2 and not hi16:
@@ -12493,6 +12495,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
       isGlc = False
       isSlc = False
       isNT = False
+      scope = CacheScope.SCOPE_NONE
+      isDlc = False
 
       if tc == 'D':
         isGlc = bool(kernel["NonTemporalD"] & 0x1)
@@ -12533,6 +12537,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
         isGlc = True
         isSlc = True
         isNT  = bool(kernel["NonTemporalD"] & 0x4)
+        isDlc = True
+        scope = CacheScope.SCOPE_DEV
 
         bps = self.states.bpeCinternal * ss.cfg.gwvw
         rpv = self.states.bpeCinternal * ss.cfg.gwvw / self.states.bpr
@@ -12582,36 +12588,36 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
           if self.states.asmCaps["HasWMMA_V1"] and kernel["EnableMatrixInstruction"]:
             module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
                 addr0, addr1, globalOffset, soffset=wsOffset, \
-                glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment))
+                glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment))
           else:
             module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx//2, rpv, \
                 addr0, addr1, globalOffset, soffset=wsOffset, \
-                glc=isGlc, slc=isSlc, nt=isNT, hi16=sumIdx%2, comment=comment))
+                glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=sumIdx%2, comment=comment))
         else:
           # (B,B,B,B,S,S), internal S
           # (H,H,H,H,H,H), internal S
           # (H,H,H,H,S,S), internal S
           module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
               addr0, addr1, globalOffset, soffset=wsOffset, \
-              glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment))
+              glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment))
       elif dataType.isInt32() or dataType.isSingle():
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isDouble() or dataType.isSingleComplex():
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*2, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isDoubleComplex():
         rps = dataType.numRegisters()
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*rps, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isInt8() or dataType.isAnyFloat8() or dataType.isAnyBFloat8() or dataType.isAnyFloat8BFloat8() or dataType.isAnyBFloat8Float8():
         if kernel["ProblemType"]["HighPrecisionAccumulate"]:
           module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
               addr0, addr1, globalOffset, soffset=wsOffset, \
-              glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+              glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
     return module
 
   ##############################################################################
diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
index b7162c7f21b..f7bd5ad5353 100644
--- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
+++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
@@ -980,8 +980,6 @@ def assignDerivedParameters(
       state["GlobalSplitUAlgorithm"] = "MultipleBuffer" # Set default Algorithm
       if not state["EnableMatrixInstruction"]:
         reject(state, printRejectionReason, "Stream-K requires MatrixInstruction")
-      if isaInfoMap[isa].asmCaps["HasWMMA"]:
-        reject(state, printRejectionReason, "Stream-K untested with WMMA")
       # if state["PersistentKernel"]:
       #   reject(state, printRejectionReason, "Cannot enable both Stream-K and PersistentKernel")
       if not state["ProblemType"]["StridedBatched"]:
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml
new file mode 100644
index 00000000000..d949ba5f633
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml
@@ -0,0 +1,456 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml
new file mode 100644
index 00000000000..675d87bde37
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml
@@ -0,0 +1,456 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml
new file mode 100644
index 00000000000..da6174efcb8
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml
@@ -0,0 +1,420 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
\ No newline at end of file
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml
new file mode 100644
index 00000000000..a483f8ab460
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml
@@ -0,0 +1,420 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - StreamKFixupTreeReduction: [0]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
\ No newline at end of file
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp
index a73f04c1ec9..c5e916130e6 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp
@@ -270,6 +270,6 @@ namespace rocisa
 
     std::string isaToGfx(const nb::tuple& arch);
     std::string isaToGfx(const IsaVersion& arch);
-    std::string getGlcBitName(bool hasGLCModifier);
-    std::string getSlcBitName(bool hasGLCModifier);
+    std::string getGlcBitName();
+    std::string getSlcBitName();
 } // namespace rocisa
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
index 8079d5f1098..693c925339a 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
@@ -102,15 +102,19 @@ namespace rocisa
 
     struct FLATModifiers : public Container
     {
-        FLATModifiers(int  offset12 = 0,
-                      bool glc      = false,
-                      bool slc      = false,
-                      bool lds      = false,
-                      bool isStore  = false)
+        FLATModifiers(int        offset12 = 0,
+                      bool       glc      = false,
+                      bool       slc      = false,
+                      bool       dlc      = false,
+                      CacheScope scope    = CacheScope::SCOPE_NONE,
+                      bool       lds      = false,
+                      bool       isStore  = false)
             : Container()
             , offset12(offset12)
             , glc(glc)
             , slc(slc)
+            , dlc(dlc)
+            , scope(scope)
             , lds(lds)
             , isStore(isStore)
         {
@@ -121,6 +125,8 @@ namespace rocisa
             , offset12(other.offset12)
             , glc(other.glc)
             , slc(other.slc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , lds(other.lds)
             , isStore(other.isStore)
         {
@@ -133,7 +139,8 @@ namespace rocisa
 
         std::string toString() const override
         {
-            auto        hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
             std::string kStr;
             if(offset12 != 0)
             {
@@ -141,11 +148,19 @@ namespace rocisa
             }
             if(glc)
             {
-                kStr += " " + getGlcBitName(hasGLCModifier);
+                kStr += " " + getGlcBitName();
             }
             if(slc)
             {
-                kStr += " " + getSlcBitName(hasGLCModifier);
+                kStr += " " + getSlcBitName();
+            }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
             }
             if(lds)
             {
@@ -154,11 +169,13 @@ namespace rocisa
             return kStr;
         }
 
-        int  offset12;
-        bool glc;
-        bool slc;
-        bool lds;
-        bool isStore;
+        int        offset12;
+        bool       glc;
+        bool       slc;
+        bool       dlc;
+        CacheScope scope;
+        bool       lds;
+        bool       isStore;
     };
 
     struct GLOBALModifiers : public Container
@@ -190,23 +207,27 @@ namespace rocisa
             return kStr;
         }
 
-        int  offset;
+        int offset;
     };
 
     struct MUBUFModifiers : public Container
     {
-        MUBUFModifiers(bool offen    = false,
-                       int  offset12 = 0,
-                       bool glc      = false,
-                       bool slc      = false,
-                       bool nt       = false,
-                       bool lds      = false,
-                       bool isStore  = false)
+        MUBUFModifiers(bool       offen    = false,
+                       int        offset12 = 0,
+                       bool       glc      = false,
+                       bool       slc      = false,
+                       bool       dlc      = false,
+                       CacheScope scope    = CacheScope::SCOPE_NONE,
+                       bool       nt       = false,
+                       bool       lds      = false,
+                       bool       isStore  = false)
             : Container()
             , offen(offen)
             , offset12(offset12)
             , glc(glc)
             , slc(slc)
+            , dlc(dlc)
+            , scope(scope)
             , nt(nt)
             , lds(lds)
             , isStore(isStore)
@@ -219,6 +240,8 @@ namespace rocisa
             , offset12(other.offset12)
             , glc(other.glc)
             , slc(other.slc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , nt(other.nt)
             , lds(other.lds)
             , isStore(other.isStore)
@@ -232,25 +255,29 @@ namespace rocisa
 
         std::string toString() const override
         {
-            auto        hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
-            auto        hasSLCModifier = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"];
-            auto        hasNTModifier  = rocIsa::getInstance().getAsmCaps()["HasNTModifier"];
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
+            auto        hasNTModifier    = rocIsa::getInstance().getAsmCaps()["HasNTModifier"];
             std::string kStr;
             if(offen)
             {
                 kStr += " offen offset:" + std::to_string(offset12);
             }
-            if(glc || slc || lds)
-            {
-                kStr += ",";
-            }
             if(glc)
             {
-                kStr += " " + getGlcBitName(hasGLCModifier);
+                kStr += " " + getGlcBitName();
             }
             if(slc)
             {
-                kStr += " " + getSlcBitName(hasGLCModifier);
+                kStr += " " + getSlcBitName();
+            }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
             }
             if(hasNTModifier && nt)
             {
@@ -263,20 +290,28 @@ namespace rocisa
             return kStr;
         }
 
-        bool offen;
-        int  offset12;
-        bool glc;
-        bool slc;
-        bool nt;
-        bool lds;
-        bool isStore;
+        bool       offen;
+        int        offset12;
+        bool       glc;
+        bool       slc;
+        bool       dlc;
+        CacheScope scope;
+        bool       nt;
+        bool       lds;
+        bool       isStore;
     };
 
     struct SMEMModifiers : public Container
     {
-        SMEMModifiers(bool glc = false, bool nv = false, int offset = 0)
+        SMEMModifiers(bool       glc    = false,
+                      bool       dlc    = false,
+                      CacheScope scope  = CacheScope::SCOPE_NONE,
+                      bool       nv     = false,
+                      int        offset = 0)
             : Container()
             , glc(glc)
+            , dlc(dlc)
+            , scope(scope)
             , nv(nv)
             , offset(offset) // 20u 21s shaes the same
         {
@@ -285,6 +320,8 @@ namespace rocisa
         SMEMModifiers(const SMEMModifiers& other)
             : Container()
             , glc(other.glc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , nv(other.nv)
             , offset(other.offset)
         {
@@ -297,15 +334,25 @@ namespace rocisa
 
         std::string toString() const override
         {
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
             std::string kStr;
             if(offset != 0)
             {
                 kStr += " offset:" + std::to_string(offset);
             }
-            if(glc)
+            if(!hasSCOPEModifier && glc)
             {
                 kStr += " glc";
             }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
+            }
             if(nv)
             {
                 kStr += " nv";
@@ -313,9 +360,11 @@ namespace rocisa
             return kStr;
         }
 
-        bool glc;
-        bool nv;
-        int  offset;
+        bool       glc;
+        bool       dlc;
+        CacheScope scope;
+        bool       nv;
+        int        offset;
     };
 
     struct SDWAModifiers : public Container
@@ -651,6 +700,7 @@ namespace rocisa
         bool                   isMinus;
         bool                   isAbs;
         bool                   isMacro;
+        bool                   isOff;
 
         RegisterContainer(const std::string&            regType,
                           const std::optional<RegName>& regName,
@@ -665,6 +715,7 @@ namespace rocisa
             , isMinus(false)
             , isAbs(false)
             , isMacro(false)
+            , isOff(false)
         {
         }
 
@@ -672,6 +723,7 @@ namespace rocisa
                           const std::optional<RegName>& regName,
                           bool                          isAbs,
                           bool                          isMacro,
+                          bool                          isOff,
                           int                           regIdx = 0,
                           float                         regNum = 1)
             : Container()
@@ -683,6 +735,7 @@ namespace rocisa
             , isMinus(false)
             , isAbs(isAbs)
             , isMacro(isMacro)
+            , isOff(isOff)
         {
         }
 
@@ -696,6 +749,7 @@ namespace rocisa
             , isMinus(other.isMinus)
             , isAbs(other.isAbs)
             , isMacro(other.isMacro)
+            , isOff(other.isOff)
         {
         }
 
@@ -719,6 +773,7 @@ namespace rocisa
             , isMinus(other.isMinus)
             , isAbs(other.isAbs)
             , isMacro(other.isMacro)
+            , isOff(other.isOff)
         {
         }
 
@@ -734,6 +789,7 @@ namespace rocisa
                 isMinus     = other.isMinus;
                 isAbs       = other.isAbs;
                 isMacro     = other.isMacro;
+                isOff       = other.isOff;
             }
             return *this;
         }
@@ -750,6 +806,7 @@ namespace rocisa
                 isMinus     = other.isMinus;
                 isAbs       = other.isAbs;
                 isMacro     = other.isMacro;
+                isOff       = other.isOff;
             }
             return *this;
         }
@@ -861,6 +918,11 @@ namespace rocisa
 
         std::string toString() const override
         {
+            if(isOff)
+            {
+                return "off";
+            }
+
             std::string minusStr = isMinus ? "-" : "";
             minusStr             = isAbs ? "abs(" + minusStr : minusStr;
             auto absStr          = isAbs ? ")" : "";
@@ -1122,8 +1184,11 @@ namespace rocisa
     // Overloaded functions to create specific GPR containers with default regNum = 1.f
     std::shared_ptr<RegisterContainer> vgpr(const Holder& holder, float regNum = 1.f);
     std::shared_ptr<RegisterContainer> vgpr(int idx, float regNum = 1.f);
-    std::shared_ptr<RegisterContainer>
-        vgpr(const std::string& name, float regNum = 1.f, bool isMacro = false, bool isAbs = false);
+    std::shared_ptr<RegisterContainer> vgpr(const std::string& name,
+                                            float              regNum  = 1.f,
+                                            bool               isMacro = false,
+                                            bool               isAbs   = false,
+                                            bool               isOff   = false);
     std::shared_ptr<RegisterContainer> sgpr(const Holder& holder, float regNum = 1.f);
     std::shared_ptr<RegisterContainer> sgpr(int idx, float regNum = 1.f);
     std::shared_ptr<RegisterContainer>
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
index 2bbc4fdb2bf..e5d6ee55a54 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
@@ -224,6 +224,15 @@ namespace rocisa
         UNUSED_PRESERVE = 3
     };
 
+    enum class CacheScope : int
+    {
+        SCOPE_NONE = 0,
+        SCOPE_CU   = 1,
+        SCOPE_SE   = 2,
+        SCOPE_DEV  = 3,
+        SCOPE_SYS  = 4,
+    };
+
     enum class CvtType : int
     {
         CVT_F16_to_F32          = 1,
@@ -298,6 +307,23 @@ namespace rocisa
         }
     }
 
+    inline std::string toString(CacheScope scope)
+    {
+        switch(scope)
+        {
+        case CacheScope::SCOPE_CU:
+            return "SCOPE_CU";
+        case CacheScope::SCOPE_SE:
+            return "SCOPE_SE";
+        case CacheScope::SCOPE_DEV:
+            return "SCOPE_DEV";
+        case CacheScope::SCOPE_SYS:
+            return "SCOPE_SYS";
+        default:
+            return "";
+        }
+    }
+
     enum class SaturateCastType : int
     {
         NORMAL     = 1,
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
index 7e0512ee608..1452a84e91a 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
@@ -69,16 +69,17 @@ inline bool tryAssembler(const IsaVersion&  isaVersion,
     return true;
 }
 
-inline int getMaxCnt(const IsaVersion& isaVersion,
+inline int getMaxCnt(const IsaVersion&  isaVersion,
                      const std::string& assemblerPath,
                      const std::string& prefix,
                      const std::string& suffix,
-                     bool isDebug)
+                     bool               isDebug)
 {
     for(int p = 64; p > 1; p >>= 1)
     {
         // Try ( pow(2) - 1 ) from high to low
-        if(tryAssembler(isaVersion, assemblerPath, prefix + std::to_string(p - 1) + suffix, isDebug))
+        if(tryAssembler(
+               isaVersion, assemblerPath, prefix + std::to_string(p - 1) + suffix, isDebug))
             return p - 1;
     }
     return 0;
@@ -117,7 +118,9 @@ inline std::map<std::string, int>
         = tryAssembler(isaVersion, assemblerPath, "v_lshl_or_b32 v47, v36, 0x2, v34", isDebug);
     rv["HasSMulHi"]
         = tryAssembler(isaVersion, assemblerPath, "s_mul_hi_u32 s47, s36, s34", isDebug);
-
+    rv["HasScalarStore"]
+        = tryAssembler(isaVersion, assemblerPath, "s_store_dword s79, s[70:71], s77", isDebug)
+          || tryAssembler(isaVersion, assemblerPath, "s_store_b32 s79, s[70:71], s77", isDebug);
     rv["HasMFMA_explictB"] = tryAssembler(
         isaVersion, assemblerPath, "v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]", isDebug);
     rv["HasMFMA"] = tryAssembler(isaVersion,
@@ -276,6 +279,35 @@ inline std::map<std::string, int>
                           assemblerPath,
                           "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, glc",
                           isDebug);
+    rv["HasSC0Modifier"]
+        = tryAssembler(isaVersion,
+                       assemblerPath,
+                       "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, sc0",
+                       isDebug)
+          || tryAssembler(isaVersion,
+                          assemblerPath,
+                          "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, sc0",
+                          isDebug);
+    rv["HasDLCModifier"]
+        = tryAssembler(isaVersion,
+                       assemblerPath,
+                       "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, dlc",
+                       isDebug)
+          || tryAssembler(isaVersion,
+                          assemblerPath,
+                          "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, dlc",
+                          isDebug);
+    rv["HasSCOPEModifier"]
+        = tryAssembler(
+              isaVersion,
+              assemblerPath,
+              "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, scope:SCOPE_DEV",
+              isDebug)
+          || tryAssembler(
+              isaVersion,
+              assemblerPath,
+              "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, scope:SCOPE_DEV",
+              isDebug);
     rv["HasMUBUFConst"] = tryAssembler(isaVersion,
                                        assemblerPath,
                                        "buffer_load_dword v40, v36, s[24:27], 1 offen offset:0",
@@ -303,18 +335,19 @@ inline std::map<std::string, int>
     rv["s_delay_alu"]
         = tryAssembler(isaVersion, assemblerPath, "s_delay_alu instid0(VALU_DEP_1)", isDebug);
 
-    rv["SeparateVscnt"] = tryAssembler(isaVersion, assemblerPath, "s_waitcnt_vscnt null 0", isDebug);
+    rv["SeparateVscnt"]
+        = tryAssembler(isaVersion, assemblerPath, "s_waitcnt_vscnt null 0", isDebug);
 
     rv["SeparateLGKMcnt"] = tryAssembler(isaVersion, assemblerPath, "s_wait_dscnt 0", isDebug)
-                        && tryAssembler(isaVersion, assemblerPath, "s_wait_kmcnt 0", isDebug);
+                            && tryAssembler(isaVersion, assemblerPath, "s_wait_kmcnt 0", isDebug);
 
     rv["SeparateVMcnt"] = tryAssembler(isaVersion, assemblerPath, "s_wait_loadcnt 0", isDebug)
-                        && tryAssembler(isaVersion, assemblerPath, "s_wait_storecnt 0", isDebug);
+                          && tryAssembler(isaVersion, assemblerPath, "s_wait_storecnt 0", isDebug);
 
     if(rv["SeparateVMcnt"])
     {
         // s_wait_loadcnt accept 16 bits immediate, but only use the lowest 6 bits are used, can't use tryAssembler
-        rv["MaxLoadcnt"]  = 63;
+        rv["MaxLoadcnt"] = 63;
         // s_wait_storecnt accept 16 bits immediate, but only use the lowest 6 bits are used, can't use tryAssembler
         rv["MaxStorecnt"] = 63;
     }
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
index cf0446b8f48..a24dbe27b12 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
@@ -831,7 +831,7 @@ namespace rocisa
                           std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                           const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -855,7 +855,7 @@ namespace rocisa
                         std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                         const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -879,7 +879,7 @@ namespace rocisa
                            std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                            const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -903,7 +903,7 @@ namespace rocisa
                          std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                          const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1210,7 +1210,7 @@ namespace rocisa
                            std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                            const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1234,7 +1234,7 @@ namespace rocisa
                          std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                          const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1258,7 +1258,7 @@ namespace rocisa
                             std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                             const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1282,7 +1282,7 @@ namespace rocisa
                           std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                           const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp
index b4b8b9fe829..b2a1120c73f 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp
@@ -37,18 +37,26 @@ namespace rocisa
         return getGfxNameTuple(arch);
     }
 
-    std::string getGlcBitName(bool hasGLCModifier)
+    std::string getGlcBitName()
     {
+        auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+        auto hasSC0Modifier = rocIsa::getInstance().getAsmCaps()["HasSC0Modifier"];
         if(hasGLCModifier)
             return "glc";
-        return "sc0";
+        if(hasSC0Modifier)
+            return "sc0";
+        return "";
     }
 
-    std::string getSlcBitName(bool hasGLCModifier)
+    std::string getSlcBitName()
     {
+        auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+        auto hasSC0Modifier = rocIsa::getInstance().getAsmCaps()["HasSC0Modifier"];
         if(hasGLCModifier)
             return "slc";
-        return "sc1";
+        if(hasSC0Modifier)
+            return "sc1";
+        return "";
     }
 
     // Force init the instance
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
index a9384c22744..cf990539973 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
@@ -105,10 +105,12 @@ namespace rocisa
                                                  const std::string& name,
                                                  float              regNum  = 1.f,
                                                  bool               isMacro = false,
-                                                 bool               isAbs   = false)
+                                                 bool               isAbs   = false,
+                                                 bool               isOff   = false)
     {
         RegName regname = generateRegName(name);
-        return std::make_shared<RegisterContainer>(gprType, regname, isAbs, isMacro, -1, regNum);
+        return std::make_shared<RegisterContainer>(
+            gprType, regname, isAbs, isMacro, isOff, -1, regNum);
     }
 
     // Overloaded functions to create specific GPR containers with default regNum = 1.f
@@ -123,9 +125,9 @@ namespace rocisa
     }
 
     std::shared_ptr<RegisterContainer>
-        vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs)
+        vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs, bool isOff)
     {
-        return createGPR("v", name, regNum, isMacro, isAbs);
+        return createGPR("v", name, regNum, isMacro, isAbs, isOff);
     }
 
     std::shared_ptr<RegisterContainer> sgpr(const Holder& holder, float regNum)
@@ -187,11 +189,12 @@ void init_containers(nb::module_ m)
               nb::arg("idx"),
               nb::arg("regNum") = 1.f);
     m_con.def("vgpr",
-              nb::overload_cast<const std::string&, float, bool, bool>(&rocisa::vgpr),
+              nb::overload_cast<const std::string&, float, bool, bool, bool>(&rocisa::vgpr),
               nb::arg("name"),
               nb::arg("regNum")  = 1.f,
               nb::arg("isMacro") = false,
-              nb::arg("isAbs")   = false);
+              nb::arg("isAbs")   = false,
+              nb::arg("isOff")   = false);
 
     m_con.def("sgpr",
               nb::overload_cast<const rocisa::Holder&, float>(&rocisa::sgpr),
@@ -276,10 +279,12 @@ void init_containers(nb::module_ m)
                 std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
         });
     nb::class_<rocisa::FLATModifiers, rocisa::Container>(m_con, "FLATModifiers")
-        .def(nb::init<int, bool, bool, bool, bool>(),
+        .def(nb::init<int, bool, bool, bool, rocisa::CacheScope, bool, bool>(),
              nb::arg("offset12") = 0,
              nb::arg("glc")      = false,
              nb::arg("slc")      = false,
+             nb::arg("dlc")      = false,
+             nb::arg("scope")    = 0,
              nb::arg("lds")      = false,
              nb::arg("isStore")  = false)
         .def_rw("isStore", &rocisa::FLATModifiers::isStore)
@@ -290,14 +295,25 @@ void init_containers(nb::module_ m)
              })
         .def("__getstate__",
              [](const rocisa::FLATModifiers& self) {
-                 return std::make_tuple(self.offset12, self.glc, self.slc, self.lds, self.isStore);
+                 return std::make_tuple(self.offset12,
+                                        self.glc,
+                                        self.slc,
+                                        self.dlc,
+                                        self.scope,
+                                        self.lds,
+                                        self.isStore);
              })
-        .def(
-            "__setstate__",
-            [](rocisa::FLATModifiers& self, std::tuple<int, bool, bool, bool, bool> t) {
-                new(&self) rocisa::FLATModifiers(
-                    std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
-            });
+        .def("__setstate__",
+             [](rocisa::FLATModifiers&                                            self,
+                std::tuple<int, bool, bool, bool, rocisa::CacheScope, bool, bool> t) {
+                 new(&self) rocisa::FLATModifiers(std::get<0>(t),
+                                                  std::get<1>(t),
+                                                  std::get<2>(t),
+                                                  std::get<3>(t),
+                                                  std::get<4>(t),
+                                                  std::get<5>(t),
+                                                  std::get<6>(t));
+             });
 
     nb::class_<rocisa::GLOBALModifiers, rocisa::Container>(m_con, "GLOBALModifiers")
         .def(nb::init<int>(),
@@ -318,11 +334,13 @@ void init_containers(nb::module_ m)
              });
 
     nb::class_<rocisa::MUBUFModifiers, rocisa::Container>(m_con, "MUBUFModifiers")
-        .def(nb::init<bool, int, bool, bool, bool, bool, bool>(),
+        .def(nb::init<bool, int, bool, bool, bool, rocisa::CacheScope, bool, bool, bool>(),
              nb::arg("offen")    = false,
              nb::arg("offset12") = 0,
              nb::arg("glc")      = false,
              nb::arg("slc")      = false,
+             nb::arg("dlc")      = false,
+             nb::arg("scope")    = 0,
              nb::arg("nt")       = false,
              nb::arg("lds")      = false,
              nb::arg("isStore")  = false)
@@ -332,27 +350,37 @@ void init_containers(nb::module_ m)
              [](const rocisa::MUBUFModifiers& self, nb::dict&) {
                  return rocisa::MUBUFModifiers(self);
              })
-        .def(
-            "__getstate__",
-            [](const rocisa::MUBUFModifiers& self) {
-                return std::make_tuple(
-                    self.offen, self.offset12, self.glc, self.slc, self.nt, self.lds, self.isStore);
-            })
+        .def("__getstate__",
+             [](const rocisa::MUBUFModifiers& self) {
+                 return std::make_tuple(self.offen,
+                                        self.offset12,
+                                        self.glc,
+                                        self.slc,
+                                        self.dlc,
+                                        self.scope,
+                                        self.nt,
+                                        self.lds,
+                                        self.isStore);
+             })
         .def("__setstate__",
-             [](rocisa::MUBUFModifiers&                             self,
-                std::tuple<bool, int, bool, bool, bool, bool, bool> t) {
+             [](rocisa::MUBUFModifiers&                                                       self,
+                std::tuple<bool, int, bool, bool, bool, rocisa::CacheScope, bool, bool, bool> t) {
                  new(&self) rocisa::MUBUFModifiers(std::get<0>(t),
                                                    std::get<1>(t),
                                                    std::get<2>(t),
                                                    std::get<3>(t),
                                                    std::get<4>(t),
                                                    std::get<5>(t),
-                                                   std::get<6>(t));
+                                                   std::get<6>(t),
+                                                   std::get<7>(t),
+                                                   std::get<8>(t));
              });
 
     nb::class_<rocisa::SMEMModifiers, rocisa::Container>(m_con, "SMEMModifiers")
-        .def(nb::init<bool, bool, int>(),
+        .def(nb::init<bool, bool, rocisa::CacheScope, bool, int>(),
              nb::arg("glc")    = false,
+             nb::arg("dlc")    = false,
+             nb::arg("scope")  = 0,
              nb::arg("nv")     = false,
              nb::arg("offset") = 0)
         .def("__str__", &rocisa::SMEMModifiers::toString)
@@ -362,11 +390,15 @@ void init_containers(nb::module_ m)
              })
         .def("__getstate__",
              [](const rocisa::SMEMModifiers& self) {
-                 return std::make_tuple(self.glc, self.nv, self.offset);
+                 return std::make_tuple(self.glc, self.dlc, self.scope, self.nv, self.offset);
              })
-        .def("__setstate__", [](rocisa::SMEMModifiers& self, std::tuple<bool, bool, int> t) {
-            new(&self) rocisa::SMEMModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t));
-        });
+        .def(
+            "__setstate__",
+            [](rocisa::SMEMModifiers&                                self,
+               std::tuple<bool, bool, rocisa::CacheScope, bool, int> t) {
+                new(&self) rocisa::SMEMModifiers(
+                    std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
+            });
 
     nb::class_<rocisa::SDWAModifiers, rocisa::Container>(m_con, "SDWAModifiers")
         .def(nb::init<rocisa::SelectBit, rocisa::UnusedBit, rocisa::SelectBit, rocisa::SelectBit>(),
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
index 287771b3350..ecac17cd9b9 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
@@ -130,6 +130,14 @@ void init_enum(nb::module_ m)
         .value("UNUSED_PRESERVE", rocisa::UnusedBit::UNUSED_PRESERVE)
         .export_values();
 
+    nb::enum_<rocisa::CacheScope>(m_enum, "CacheScope")
+        .value("SCOPE_NONE", rocisa::CacheScope::SCOPE_NONE)
+        .value("SCOPE_CU", rocisa::CacheScope::SCOPE_CU)
+        .value("SCOPE_SE", rocisa::CacheScope::SCOPE_SE)
+        .value("SCOPE_DEV", rocisa::CacheScope::SCOPE_DEV)
+        .value("SCOPE_SYS", rocisa::CacheScope::SCOPE_SYS)
+        .export_values();
+
     nb::enum_<rocisa::CvtType>(m_enum, "CvtType")
         .value("CVT_F16_to_F32", rocisa::CvtType::CVT_F16_to_F32)
         .value("CVT_F32_to_F16", rocisa::CvtType::CVT_F32_to_F16)
diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_base.py b/projects/hipblaslt/tensilelite/rocisa/test/test_base.py
index e461d24e866..e675fd42976 100644
--- a/projects/hipblaslt/tensilelite/rocisa/test/test_base.py
+++ b/projects/hipblaslt/tensilelite/rocisa/test/test_base.py
@@ -111,8 +111,8 @@ def test_copy():
     print("This is a deepcopied function:", deepcopiedFunction(isa))
 
 def test_functions():
-    print("GLC:", rocisa.getGlcBitName(True))
-    print("SLC:", rocisa.getSlcBitName(False))
+    print("GLC:", rocisa.getGlcBitName())
+    print("SLC:", rocisa.getSlcBitName())
 
 test_rocisa()
 test_item()
diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
index c550419085a..9fcb7ce6d78 100644
--- a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
+++ b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
@@ -41,15 +41,23 @@ def test_containers():
     assert str(ds_modifiers) == " offset:2 gds"
 
     # Test FLATModifiers
-    flat_modifiers = rocisa.container.FLATModifiers(8, True, False, True, False)
+    flat_modifiers = rocisa.container.FLATModifiers(
+        offset12=8, glc=True, slc=False, dlc=False,
+        scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        lds=True, isStore=False)
     assert str(flat_modifiers) == " offset:8 glc lds"
 
     # Test MUBUFModifiers
-    mubuf_modifiers = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True)
-    assert str(mubuf_modifiers) == " offen offset:12, glc"
+    mubuf_modifiers = rocisa.container.MUBUFModifiers(
+        offen=True, offset12=12, glc=True, slc=False,
+        dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nt=True, lds=False, isStore=True)
+    assert str(mubuf_modifiers) == " offen offset:12 glc"
 
     # Test SMEMModifiers
-    smem_modifiers = rocisa.container.SMEMModifiers(True, False, 8)
+    smem_modifiers = rocisa.container.SMEMModifiers(
+        glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nv=False, offset=8)
     assert str(smem_modifiers) == " offset:8 glc"
 
     # Test SDWAModifiers
@@ -104,7 +112,7 @@ def test_containers():
 
     # Test Holder
     from rocisa.container import vgpr, sgpr, accvgpr, mgpr, Holder
-    holder = Holder("holder");
+    holder = Holder("holder")
     assert holder.idx == -1
     assert str(holder.name) == "holder"
     assert isinstance(holder, Holder)
@@ -120,6 +128,8 @@ def test_containers():
     assert str(testGpr) == "acc4"
     testGpr = mgpr(holder)
     assert str(testGpr) == "m[mgprholder]"
+    testGpr = vgpr("off", isOff=True)
+    assert str(testGpr) == "off"
 
 def test_containers_copy():
     def copy_test(name, obj):
@@ -133,9 +143,14 @@ def copy_test(name, obj):
     copy_test("DSModifiers", ds)
     flat = rocisa.container.FLATModifiers(lds=True)
     copy_test("FLATModifiers", flat)
-    mubuf = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True)
+    mubuf = rocisa.container.MUBUFModifiers(
+        offen=True, offset12=12, glc=True, slc=False,
+        dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nt=True, lds=False, isStore=True)
     copy_test("MUBUFModifiers", mubuf)
-    smem = rocisa.container.SMEMModifiers(True, False, 8)
+    smem = rocisa.container.SMEMModifiers(
+        glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nv=False, offset=8)
     copy_test("SMEMModifiers", smem)
     sdwa = rocisa.container.SDWAModifiers(
         dst_sel=rocisa.enum.SelectBit.WORD_0,
diff --git a/shared/origami/include/origami/hardware.hpp b/shared/origami/include/origami/hardware.hpp
index 2d81538a615..4648cfb5315 100644
--- a/shared/origami/include/origami/hardware.hpp
+++ b/shared/origami/include/origami/hardware.hpp
@@ -23,6 +23,7 @@ namespace origami
         Int32,
         BFloat16,
         Int8,
+        Int4,
         Int64,
         XFloat32,
         Float8_fnuz,
@@ -67,6 +68,8 @@ namespace origami
             return 16;
         case data_type_t::Int8:
             return 8;
+        case data_type_t::Int4:
+            return 4;
         case data_type_t::Int64:
             return 64;
         case data_type_t::XFloat32:
@@ -120,6 +123,8 @@ namespace origami
             return "BFloat16";
         case data_type_t::Int8:
             return "Int8";
+        case data_type_t::Int4:
+            return "Int4";
         case data_type_t::Int64:
             return "Int64";
         case data_type_t::XFloat32:
@@ -170,6 +175,8 @@ namespace origami
             return data_type_t::BFloat16;
         if (s == "i8")
             return data_type_t::Int8;
+        if (s == "i4")
+            return data_type_t::Int4;
         if (s == "xf32")
             return data_type_t::XFloat32;
         if (s == "f8")
@@ -259,6 +266,8 @@ namespace origami
             gfx90a,
             gfx942,
             gfx950,
+            gfx1201,
+            gfx1100,
             Count
         };
 
@@ -267,7 +276,9 @@ namespace origami
             static const std::unordered_map<std::string, architecture_t> str_to_enum_map
                 = {{"gfx90a", architecture_t::gfx90a},
                     {"gfx942", architecture_t::gfx942},
-                    {"gfx950", architecture_t::gfx950}};
+                    {"gfx950", architecture_t::gfx950},
+                    {"gfx1201", architecture_t::gfx1201},
+                    {"gfx1100", architecture_t::gfx1100}};
 
             auto it = str_to_enum_map.find(str);
             if(it != str_to_enum_map.end())
@@ -319,7 +330,13 @@ namespace origami
                 // hardware_t::architecture_constants(
                 //     8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(-0.000013, 0.007070, 0.027355), 1.5)}};
                 hardware_t::architecture_constants(
-                    8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(0, 0.008, 0), 1.5)}};
+                    8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(0, 0.008, 0), 1.5)},
+               {hardware_t::architecture_t::gfx1201,
+                hardware_t::architecture_constants(
+                    1, 5.74, 1.21875121875121875122 * 2.41, 0.464, 2, std::make_tuple(0, 0.17, 0), 1.5)},
+               {hardware_t::architecture_t::gfx1100,
+                hardware_t::architecture_constants(
+                    1, 7.12, 1.21875121875121875122 * 3.48, 0.732, 2, std::make_tuple(0, 0.11, 0), 1.5)}};
 
         inline static const std::unordered_map<architecture_t,
                                         std::unordered_map<matrix_instruction, size_t>> INSTRUCTION_MAP
@@ -473,6 +490,37 @@ namespace origami
                     // DOT2
                     {matrix_instruction( 1,  1,  64, data_type_t::Half), 16}, // V_DOT2_F32_F16
                     {matrix_instruction( 1,  1,  64, data_type_t::BFloat16), 16}, // V_DOT2_F32_BF16
+                }},
+               {hardware_t::architecture_t::gfx1201,
+                {
+                    // F16
+                    {matrix_instruction(16, 16, 16, data_type_t::Half), 16}, // v_wmma_f16_16x16x16_f16/v_wmma_f32_16x16x16_f16
+                    // BF16
+                    {matrix_instruction(16, 16, 16, data_type_t::BFloat16), 16}, // v_wmma_bf16_16x16x16_bf16/v_wmma_f32_16x16x16_bf16
+                    // F8
+                    {matrix_instruction(16, 16, 16, data_type_t::Float8), 8}, // v_wmma_f32_16x16x16_fp8_fp8
+                    // F8B8
+                    {matrix_instruction(16, 16, 16, data_type_t::Float8BFloat8), 8}, // v_wmma_f32_16x16x16_fp8_bf8
+                    // B8F8
+                    {matrix_instruction(16, 16, 16, data_type_t::BFloat8Float8), 8}, // v_wmma_f32_16x16x16_bf8_fp8
+                    // B8
+                    {matrix_instruction(16, 16, 16, data_type_t::BFloat8), 8}, // v_wmma_f32_16x16x16_bf8_bf8
+                    // I8
+                    {matrix_instruction(16, 16, 16, data_type_t::Int8), 8}, // v_wmma_i32_16x16x16_iu8
+                    // I4
+                    {matrix_instruction(16, 16, 16, data_type_t::Int4), 8}, // v_wmma_i32_16x16x16_iu4
+                    {matrix_instruction(16, 16, 32, data_type_t::Int4), 8}, // v_wmma_i32_16x16x32_iu4
+                }},
+               {hardware_t::architecture_t::gfx1100,
+                {
+                    // F16
+                    {matrix_instruction(16, 16, 16, data_type_t::Half), 16}, // v_wmma_f32_16x16x16_f16/v_wmma_f16_16x16x16_f16
+                    // BF16
+                    {matrix_instruction(16, 16, 16, data_type_t::BFloat16), 16}, // v_wmma_f32_16x16x16_bf16/v_wmma_bf16_16x16x16_bf16
+                    // I8
+                    {matrix_instruction(16, 16, 16, data_type_t::Int8), 8}, // v_wmma_i32_16x16x16_iu8
+                    // I4
+                    {matrix_instruction(16, 16, 16, data_type_t::Int4), 8}, // v_wmma_i32_16x16x16_iu4
                 }}};
 
         architecture_t                      arch;