diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
index ec43cdd0cb0..d9970c833f1 100644
--- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
+++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
@@ -20,15 +20,16 @@
 # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ################################################################################
 
+from rocisa.enum import CacheScope
 from rocisa.code import Module, Label
-from rocisa.container import vgpr, sgpr, SMEMModifiers, replaceHolder, EXEC,\
+from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\
     VOP3PModifiers, ContinuousRegister
 from rocisa.instruction import SAddCU32, SAddI32, SAddU32, SAndB32, SBarrier, \
     SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \
     SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \
     SMinU32, SMovB32, SMovB64, SMulI32, SNop, SSleep, SStoreB32, SSubU32, \
     SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \
-    VReadfirstlaneB32, VCvtBF16toFP32
+    VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32
 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \
     vectorStaticMultiply, BranchIfNotZero, scalarUInt32DivideAndRemainder
 
@@ -370,7 +371,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
 
             # Check flag
             module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index"))
-            module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag"))
+            module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
 
             module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
             if kernel["DebugStreamK"] & 2 == 0:
@@ -383,8 +384,12 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
             module.add(VReadfirstlaneB32(dst=sgpr(tmpSgpr+2), src=vgpr("Serial"), comment="Wave 0 updates flags"))
             module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=0, comment="Check for wave 0"))
             module.add(SCBranchSCC0(labelName=skipFlagReset.getLabelName(), comment="Skip flag reset"))
-            # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag
-            module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag"))
+            if writer.states.asmCaps["HasScalarStore"]:
+                # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag
+                module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag"))
+            else:
+                module.add(VMovB32(dst=vgpr(tmpVgpr), src=0, comment="move 0 to tmpVgpr"))
+                module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="reset flag"))
             module.add(skipFlagReset)
             writer.sgprPool.checkIn(tmpSgpr)
 
@@ -684,8 +689,12 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
                 module.add(VReadfirstlaneB32(dst=sgpr(flagSgpr), src=vgpr("Serial"), comment="Wave 0 updates flags"))
                 module.add(SCmpEQU32(src0=sgpr(flagSgpr), src1=0, comment="Check for wave 0"))
                 module.add(SCBranchSCC0(labelName=skipFlagSet.getLabelName(), comment="Skip flag set"))
-                module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data"))
-                module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag"))
+                if writer.states.asmCaps["HasScalarStore"]:
+                    module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data"))
+                    module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag"))
+                else:
+                    module.add(VMovB32(dst=vgpr(tmpVgpr), src=1, comment="move 1 to tmpVgpr"))
+                    module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="set flag"))
                 module.add(skipFlagSet)
             module.add(SWaitCnt(kmcnt=0, comment="wait for flag")) # TODO just for testing
 
@@ -696,6 +705,20 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
 
         return module
 
+    def setFlagValue(self, writer, src, soffset, comment=""):
+        module = Module("Buffer Store Flag Value")
+        tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
+        module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
+        module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
+        module.add(BufferStoreB32(src=src, vaddr=vgpr("off", isOff=True), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \
+                                  mubuf=MUBUFModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \
+                                  comment=comment))
+        module.add(SWaitCnt(vscnt=0, comment="wait for data store")) #TODO: See if this wait is necessery
+        writer.sgprPool.checkIn(tmpSgprBuffer)
+
+        return module
+
     def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \
             batchElements, addrD, addrC, \
             tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead):
@@ -1631,6 +1654,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Basic SK
         module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr("StreamKIdx"), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration"))
@@ -1709,6 +1738,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Two-tile SK (SK first)
         # iter count after all extra iters have been distributed
@@ -1825,6 +1860,12 @@ def preLoop(self, writer, kernel):
         xccMapping = Component.XCCMapping.find(writer)
         module.add(xccMapping(writer, kernel))
 
+        # Workaround for gfx12
+        if writer.states.archCaps["WorkGroupIdFromTTM"]:
+            module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround"))
+            module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround"))
+            module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround"))
+
         module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index"))
         # Two-tile SK (DP first)
         # Do DP tiles before SK
diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
index ff426f660c3..d68ac3a46c9 100644
--- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
+++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
@@ -38,7 +38,7 @@
                         scalarStaticRemainder, scalarUInt32DivideAndRemainder, sMagicDiv, vectorStaticMultiply, \
                         vectorStaticMultiplyAdd, scalarStaticMultiply64, BranchIfZero, BranchIfNotZero, DSInit, \
                         ArgumentLoader
-from rocisa.enum import InstType, SelectBit
+from rocisa.enum import InstType, SelectBit, CacheScope
 from rocisa.macro import MacroVMagicDiv, PseudoRandomGenerator
 from . import CUSTOM_KERNEL_PATH
 from rocisa.instruction import BranchInstruction, BufferLoadB128, BufferLoadB32, \
@@ -12123,7 +12123,9 @@ def bufferLoadImpl(soffset):
   ##############################################################################
   def chooseGlobalWrite(self, useBuffer, bps, srcVgpr, rpv, \
                         addr0, addr1, offset, soffset=0, \
-                        glc=False, slc=False, nt=False, hi16=0, comment="store"):
+                        glc=False, slc=False, nt=False, dlc=False, \
+                        scope=CacheScope.SCOPE_NONE, \
+                        hi16=0, comment="store"):
     """
     create the store instruction for requested vector width and other parms
     rpv = regs per vector
@@ -12166,7 +12168,7 @@ def bufferStoreImpl(tmpSgpr, mubuf):
           if offset2 >= 4096:
             module.add(SMovB32(dst=tmpSgpr, src=offset2, comment="large offset"))
             offset2 = 0
-          mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, nt=nt, isStore=True)
+          mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
           vgprOff = int(srcVgpr + shiftRpv * i) if isinstance(srcVgpr, int) else f"{srcVgpr}+{int(shiftRpv * i)}"
           module.add(BufferStoreB128(src=vgpr(vgprOff, shiftRpv), vaddr=addr0, \
                 saddr=addr1, soffset=tmpSgpr, mubuf=mubuf2, comment=comment))
@@ -12174,7 +12176,7 @@ def bufferStoreImpl(tmpSgpr, mubuf):
         assert 0, "bad bps"
 
     if useBuffer:
-      mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, nt=nt, isStore=True)
+      mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
       if soffset != 0:
         assert offset < 4096, "sgpr offset provided with large const offset"
       # buffer_load offset field is 12-bit.
@@ -12185,13 +12187,13 @@ def bufferStoreImpl(tmpSgpr, mubuf):
           tmpSgpr = sgpr(tmpSgprInfo.idx)
           if offset >= 4096:
             module.add(SMovB32(dst=tmpSgpr, src=offset, comment="large offset"))
-            mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, nt=nt, isStore=True)
+            mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True)
           bufferStoreImpl(tmpSgpr, mubuf)
       else:
         bufferStoreImpl(soffset, mubuf)
 
     else:
-      flat = FLATModifiers(glc=glc, slc=slc, isStore=True)
+      flat = FLATModifiers(glc=glc, slc=slc, dlc=dlc, scope=scope, isStore=True)
       if bps==2 and hi16:
         module.add(FlatStoreD16HIB16(vaddr=addr0, src=vgpr(srcVgpr*2), flat=flat, comment=comment))
       elif bps==2 and not hi16:
@@ -12321,6 +12323,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
       isGlc = False
       isSlc = False
       isNT = False
+      scope = CacheScope.SCOPE_NONE
+      isDlc = False
 
       if tc == 'D':
         isGlc = bool(kernel["NonTemporalD"] & 0x1)
@@ -12361,6 +12365,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
         isGlc = True
         isSlc = True
         isNT  = bool(kernel["NonTemporalD"] & 0x4)
+        isDlc = True
+        scope = CacheScope.SCOPE_DEV
 
         bps = self.states.bpeCinternal * ss.cfg.gwvw
         rpv = self.states.bpeCinternal * ss.cfg.gwvw / self.states.bpr
@@ -12410,36 +12416,36 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset
           if self.states.asmCaps["HasWMMA_V1"] and kernel["EnableMatrixInstruction"]:
             module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
                 addr0, addr1, globalOffset, soffset=wsOffset, \
-                glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment))
+                glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment))
           else:
             module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx//2, rpv, \
                 addr0, addr1, globalOffset, soffset=wsOffset, \
-                glc=isGlc, slc=isSlc, nt=isNT, hi16=sumIdx%2, comment=comment))
+                glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=sumIdx%2, comment=comment))
         else:
           # (B,B,B,B,S,S), internal S
           # (H,H,H,H,H,H), internal S
           # (H,H,H,H,S,S), internal S
           module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
               addr0, addr1, globalOffset, soffset=wsOffset, \
-              glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment))
+              glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment))
       elif dataType.isInt32() or dataType.isSingle():
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isDouble() or dataType.isSingleComplex():
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*2, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isDoubleComplex():
         rps = dataType.numRegisters()
         module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*rps, rpv, \
             addr0, addr1, globalOffset, soffset=wsOffset, \
-            glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+            glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
       elif dataType.isInt8() or dataType.isAnyFloat8() or dataType.isAnyBFloat8() or dataType.isAnyFloat8BFloat8() or dataType.isAnyBFloat8Float8():
         if kernel["ProblemType"]["HighPrecisionAccumulate"]:
           module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \
               addr0, addr1, globalOffset, soffset=wsOffset, \
-              glc=isGlc, slc=isSlc, nt=isNT, comment=comment))
+              glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment))
     return module
 
   ##############################################################################
diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
index e125227a01a..bb3f2bdf238 100644
--- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
+++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
@@ -954,8 +954,8 @@ def assignDerivedParameters(
       state["GlobalSplitUAlgorithm"] = "MultipleBuffer" # Set default Algorithm
       if not state["EnableMatrixInstruction"]:
         reject(state, printRejectionReason, "Stream-K requires MatrixInstruction")
-      if isaInfoMap[isa].asmCaps["HasWMMA"]:
-        reject(state, printRejectionReason, "Stream-K untested with WMMA")
+      # if isaInfoMap[isa].asmCaps["HasWMMA"]:
+      #   reject(state, printRejectionReason, "Stream-K untested with WMMA")
       # if state["PersistentKernel"]:
       #   reject(state, printRejectionReason, "Cannot enable both Stream-K and PersistentKernel")
       if not state["ProblemType"]["StridedBatched"]:
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml
new file mode 100644
index 00000000000..601f1637277
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml
@@ -0,0 +1,450 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1200, skip-gfx1201] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml
new file mode 100644
index 00000000000..f9113cf9908
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml
@@ -0,0 +1,450 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1200, skip-gfx1201] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        - LocalReadVectorWidth: [16]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 1, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 2, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 3, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 4, 1, 4, 1]
+          - [16, 16, 16, 1, 1, 1, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 2, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 3, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 5, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 6, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 7, 1, 2, 2]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 2, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 6, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 7, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 9, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 0, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 1, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 3, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 4, 1, 1, 4]
+          - [16, 16, 16, 1, 1, 5, 1, 1, 4]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml
new file mode 100644
index 00000000000..ec1755ba557
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml
@@ -0,0 +1,414 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: b
+      DestDataType: b
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
\ No newline at end of file
diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml
new file mode 100644
index 00000000000..c9aa0fe661c
--- /dev/null
+++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml
@@ -0,0 +1,414 @@
+TestParameters:
+  marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+    - # HGEMM NT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NN - Test MatrixInstruction variants
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [4]
+        # - VectorWidthB: [4]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+  - # HGEMM TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TN - Test DepthU, WGM, VW
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [1]
+        - DepthU: [ 32, 64 ]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [2, 4, 8]
+        - GlobalReadVectorWidthB: [2, 4, 8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [8]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        # - StoreVectorWidth: [4]
+        - StreamK: [3]
+        - TransposeLDS: [0]
+        # - VectorWidthA: [1, 2, 4]
+        # - VectorWidthB: [1, 2, 4]
+        - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
+
+  - # HGEMM TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM TT - Test tuning params
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - 1LDSBuffer: [0, 1]
+        - DepthU: [64]
+        - ExpandPointerSwap: [False]
+        - GlobalReadVectorWidthA: [8]
+        - GlobalReadVectorWidthB: [8]
+        - GlobalSplitU: [0]
+        # - LocalReadVectorWidth: [1]
+        - MatrixInstruction:
+          - [16, 16, 16, 1, 1, 8,8, 2,2]
+          - [16, 16, 16, 1, 1, 8,4, 2,2]
+          - [16, 16, 16, 1, 1, 8,2, 2,2]
+          - [16, 16, 16, 1, 1, 8,1, 2,2]
+          - [16, 16, 16, 1, 1, 4,8, 2,2]
+          - [16, 16, 16, 1, 1, 4,4, 2,2]
+          - [16, 16, 16, 1, 1, 4,2, 2,2]
+          - [16, 16, 16, 1, 1, 4,1, 2,2]
+          - [16, 16, 16, 1, 1, 2,8, 2,2]
+          - [16, 16, 16, 1, 1, 2,4, 2,2]
+          - [16, 16, 16, 1, 1, 2,2, 2,2]
+          - [16, 16, 16, 1, 1, 2,1, 2,2]
+          - [16, 16, 16, 1, 1, 1,8, 2,2]
+          - [16, 16, 16, 1, 1, 1,4, 2,2]
+          - [16, 16, 16, 1, 1, 1,2, 2,2]
+          - [16, 16, 16, 1, 1, 1,1, 2,2]
+        - MIArchVgpr: [0]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - StaggerU: [0, 32]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [False, True]
+        # - StoreVectorWidth: [1]
+        - StreamK: [3]
+        - PrefetchLocalRead: [1, 3]
+        - NonTemporalC: [0, 7]
+        - NonTemporalD: [0, 7]
+        - TransposeLDS: [0, 1]
+        # - VectorWidthA: [1]
+        # - VectorWidthB: [1]
+        - WorkGroupMapping: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          # - Exact: [4103, 4103, 1, 1031]
+          - Exact: [2055, 2055, 1, 1031]
\ No newline at end of file
diff --git a/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp b/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp
index c5c0bc26126..2177f316933 100644
--- a/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp
+++ b/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp
@@ -48,6 +48,7 @@ namespace TensileLite
             Int32,
             BFloat16,
             Int8,
+            Int4,
             Int64,
             XFloat32,
             Float8_fnuz,
@@ -92,6 +93,8 @@ namespace TensileLite
                 return 16;
             case DataType::Int8:
                 return 8;
+            case DataType::Int4:
+                return 4;
             case DataType::Int64:
                 return 64;
             case DataType::XFloat32:
@@ -145,6 +148,8 @@ namespace TensileLite
                 return "BFloat16";
             case DataType::Int8:
                 return "Int8";
+            case DataType::Int4:
+                return "Int4";
             case DataType::Int64:
                 return "Int64";
             case DataType::XFloat32:
@@ -195,6 +200,8 @@ namespace TensileLite
                 return DataType::BFloat16;
             if (s == "i8")
                 return DataType::Int8;
+            if (s == "i4")
+                return DataType::Int4;
             if (s == "xf32")
                 return DataType::XFloat32;
             if (s == "f8")
@@ -287,6 +294,8 @@ namespace TensileLite
                 gfx90a,
                 gfx942,
                 gfx950,
+                gfx1201,
+                gfx1100,
                 Count
             };
 
@@ -295,7 +304,9 @@ namespace TensileLite
                 static const std::unordered_map<std::string, Architecture> strToEnumMap
                     = {{"gfx90a", Architecture::gfx90a},
                        {"gfx942", Architecture::gfx942},
-                       {"gfx950", Architecture::gfx950}};
+                       {"gfx950", Architecture::gfx950},
+                       {"gfx1201", Architecture::gfx1201},
+                       {"gfx1100", Architecture::gfx1100}};
 
                 auto it = strToEnumMap.find(str);
                 if(it != strToEnumMap.end())
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
index cc8ed19824e..1b0a3c10508 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp
@@ -102,15 +102,19 @@ namespace rocisa
 
     struct FLATModifiers : public Container
     {
-        FLATModifiers(int  offset12 = 0,
-                      bool glc      = false,
-                      bool slc      = false,
-                      bool lds      = false,
-                      bool isStore  = false)
+        FLATModifiers(int        offset12 = 0,
+                      bool       glc      = false,
+                      bool       slc      = false,
+                      bool       dlc      = false,
+                      CacheScope scope    = CacheScope::SCOPE_NONE,
+                      bool       lds      = false,
+                      bool       isStore  = false)
             : Container()
             , offset12(offset12)
             , glc(glc)
             , slc(slc)
+            , dlc(dlc)
+            , scope(scope)
             , lds(lds)
             , isStore(isStore)
         {
@@ -121,6 +125,8 @@ namespace rocisa
             , offset12(other.offset12)
             , glc(other.glc)
             , slc(other.slc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , lds(other.lds)
             , isStore(other.isStore)
         {
@@ -133,20 +139,30 @@ namespace rocisa
 
         std::string toString() const override
         {
-            auto        hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+            auto        hasGLCModifier   = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
             std::string kStr;
             if(offset12 != 0)
             {
                 kStr += " offset:" + std::to_string(offset12);
             }
-            if(glc)
+            if(hasGLCModifier && glc)
             {
                 kStr += " " + getGlcBitName(hasGLCModifier);
             }
-            if(slc)
+            if(hasGLCModifier && slc)
             {
                 kStr += " " + getSlcBitName(hasGLCModifier);
             }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
+            }
             if(lds)
             {
                 kStr += " lds";
@@ -154,27 +170,33 @@ namespace rocisa
             return kStr;
         }
 
-        int  offset12;
-        bool glc;
-        bool slc;
-        bool lds;
-        bool isStore;
+        int        offset12;
+        bool       glc;
+        bool       slc;
+        bool       dlc;
+        CacheScope scope;
+        bool       lds;
+        bool       isStore;
     };
 
     struct MUBUFModifiers : public Container
     {
-        MUBUFModifiers(bool offen    = false,
-                       int  offset12 = 0,
-                       bool glc      = false,
-                       bool slc      = false,
-                       bool nt       = false,
-                       bool lds      = false,
-                       bool isStore  = false)
+        MUBUFModifiers(bool       offen    = false,
+                       int        offset12 = 0,
+                       bool       glc      = false,
+                       bool       slc      = false,
+                       bool       dlc      = false,
+                       CacheScope scope    = CacheScope::SCOPE_NONE,
+                       bool       nt       = false,
+                       bool       lds      = false,
+                       bool       isStore  = false)
             : Container()
             , offen(offen)
             , offset12(offset12)
             , glc(glc)
             , slc(slc)
+            , dlc(dlc)
+            , scope(scope)
             , nt(nt)
             , lds(lds)
             , isStore(isStore)
@@ -187,6 +209,8 @@ namespace rocisa
             , offset12(other.offset12)
             , glc(other.glc)
             , slc(other.slc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , nt(other.nt)
             , lds(other.lds)
             , isStore(other.isStore)
@@ -200,26 +224,32 @@ namespace rocisa
 
         std::string toString() const override
         {
-            auto        hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
-            auto        hasSLCModifier = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"];
-            auto        hasNTModifier  = rocIsa::getInstance().getAsmCaps()["HasNTModifier"];
+            auto        hasGLCModifier   = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+            auto        hasSLCModifier   = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"];
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
+            auto        hasNTModifier    = rocIsa::getInstance().getAsmCaps()["HasNTModifier"];
             std::string kStr;
             if(offen)
             {
                 kStr += " offen offset:" + std::to_string(offset12);
             }
-            if(glc || slc || lds)
-            {
-                kStr += ",";
-            }
-            if(glc)
+            if(hasGLCModifier && glc)
             {
                 kStr += " " + getGlcBitName(hasGLCModifier);
             }
-            if(slc)
+            if(hasGLCModifier && slc)
             {
                 kStr += " " + getSlcBitName(hasGLCModifier);
             }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
+            }
             if(hasNTModifier && nt)
             {
                 kStr += " nt";
@@ -231,20 +261,28 @@ namespace rocisa
             return kStr;
         }
 
-        bool offen;
-        int  offset12;
-        bool glc;
-        bool slc;
-        bool nt;
-        bool lds;
-        bool isStore;
+        bool       offen;
+        int        offset12;
+        bool       glc;
+        bool       slc;
+        bool       dlc;
+        CacheScope scope;
+        bool       nt;
+        bool       lds;
+        bool       isStore;
     };
 
     struct SMEMModifiers : public Container
     {
-        SMEMModifiers(bool glc = false, bool nv = false, int offset = 0)
+        SMEMModifiers(bool       glc    = false,
+                      bool       dlc    = false,
+                      CacheScope scope  = CacheScope::SCOPE_NONE,
+                      bool       nv     = false,
+                      int        offset = 0)
             : Container()
             , glc(glc)
+            , dlc(dlc)
+            , scope(scope)
             , nv(nv)
             , offset(offset) // 20u 21s shaes the same
         {
@@ -253,6 +291,8 @@ namespace rocisa
         SMEMModifiers(const SMEMModifiers& other)
             : Container()
             , glc(other.glc)
+            , dlc(other.dlc)
+            , scope(other.scope)
             , nv(other.nv)
             , offset(other.offset)
         {
@@ -265,14 +305,25 @@ namespace rocisa
 
         std::string toString() const override
         {
+            auto        hasGLCModifier   = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"];
+            auto        hasDLCModifier   = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"];
+            auto        hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"];
             std::string kStr;
             if(offset != 0)
             {
                 kStr += " offset:" + std::to_string(offset);
             }
-            if(glc)
+            if(hasGLCModifier && glc)
             {
-                kStr += " glc";
+                kStr += " " + getGlcBitName(hasGLCModifier);
+            }
+            if(hasDLCModifier && dlc)
+            {
+                kStr += " dlc";
+            }
+            if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE)
+            {
+                kStr += " scope:" + ::rocisa::toString(scope);
             }
             if(nv)
             {
@@ -281,9 +332,11 @@ namespace rocisa
             return kStr;
         }
 
-        bool glc;
-        bool nv;
-        int  offset;
+        bool       glc;
+        bool       dlc;
+        CacheScope scope;
+        bool       nv;
+        int        offset;
     };
 
     struct SDWAModifiers : public Container
@@ -619,6 +672,7 @@ namespace rocisa
         bool                   isMinus;
         bool                   isAbs;
         bool                   isMacro;
+        bool                   isOff;
 
         RegisterContainer(const std::string&            regType,
                           const std::optional<RegName>& regName,
@@ -633,6 +687,7 @@ namespace rocisa
             , isMinus(false)
             , isAbs(false)
             , isMacro(false)
+            , isOff(false)
         {
         }
 
@@ -640,6 +695,7 @@ namespace rocisa
                           const std::optional<RegName>& regName,
                           bool                          isAbs,
                           bool                          isMacro,
+                          bool                          isOff,
                           int                           regIdx = 0,
                           float                         regNum = 1)
             : Container()
@@ -651,6 +707,7 @@ namespace rocisa
             , isMinus(false)
             , isAbs(isAbs)
             , isMacro(isMacro)
+            , isOff(isOff)
         {
         }
 
@@ -664,6 +721,7 @@ namespace rocisa
             , isMinus(other.isMinus)
             , isAbs(other.isAbs)
             , isMacro(other.isMacro)
+            , isOff(other.isOff)
         {
         }
 
@@ -687,6 +745,7 @@ namespace rocisa
             , isMinus(other.isMinus)
             , isAbs(other.isAbs)
             , isMacro(other.isMacro)
+            , isOff(other.isOff)
         {
         }
 
@@ -702,6 +761,7 @@ namespace rocisa
                 isMinus     = other.isMinus;
                 isAbs       = other.isAbs;
                 isMacro     = other.isMacro;
+                isOff       = other.isOff;
             }
             return *this;
         }
@@ -718,6 +778,7 @@ namespace rocisa
                 isMinus     = other.isMinus;
                 isAbs       = other.isAbs;
                 isMacro     = other.isMacro;
+                isOff       = other.isOff;
             }
             return *this;
         }
@@ -829,6 +890,11 @@ namespace rocisa
 
         std::string toString() const override
         {
+            if(isOff)
+            {
+                return "off";
+            }
+
             std::string minusStr = isMinus ? "-" : "";
             minusStr             = isAbs ? "abs(" + minusStr : minusStr;
             auto absStr          = isAbs ? ")" : "";
@@ -1090,8 +1156,11 @@ namespace rocisa
     // Overloaded functions to create specific GPR containers with default regNum = 1.f
     std::shared_ptr<RegisterContainer> vgpr(const Holder& holder, float regNum = 1.f);
     std::shared_ptr<RegisterContainer> vgpr(int idx, float regNum = 1.f);
-    std::shared_ptr<RegisterContainer>
-        vgpr(const std::string& name, float regNum = 1.f, bool isMacro = false, bool isAbs = false);
+    std::shared_ptr<RegisterContainer> vgpr(const std::string& name,
+                                            float              regNum  = 1.f,
+                                            bool               isMacro = false,
+                                            bool               isAbs   = false,
+                                            bool               isOff   = false);
     std::shared_ptr<RegisterContainer> sgpr(const Holder& holder, float regNum = 1.f);
     std::shared_ptr<RegisterContainer> sgpr(int idx, float regNum = 1.f);
     std::shared_ptr<RegisterContainer>
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
index e1c748d4a2f..d5fa58d6c59 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp
@@ -222,6 +222,15 @@ namespace rocisa
         UNUSED_PRESERVE = 3
     };
 
+    enum class CacheScope : int
+    {
+        SCOPE_NONE = 0,
+        SCOPE_CU   = 1,
+        SCOPE_SE   = 2,
+        SCOPE_DEV  = 3,
+        SCOPE_SYS  = 4,
+    };
+
     enum class CvtType : int
     {
         CVT_F16_to_F32          = 1,
@@ -294,6 +303,23 @@ namespace rocisa
         }
     }
 
+    inline std::string toString(CacheScope scope)
+    {
+        switch(scope)
+        {
+        case CacheScope::SCOPE_CU:
+            return "SCOPE_CU";
+        case CacheScope::SCOPE_SE:
+            return "SCOPE_SE";
+        case CacheScope::SCOPE_DEV:
+            return "SCOPE_DEV";
+        case CacheScope::SCOPE_SYS:
+            return "SCOPE_SYS";
+        default:
+            return "";
+        }
+    }
+
     enum class SaturateCastType : int
     {
         NORMAL     = 1,
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
index cee81dbf654..fe2f26fc10d 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp
@@ -117,7 +117,8 @@ inline std::map<std::string, int>
         = tryAssembler(isaVersion, assemblerPath, "v_lshl_or_b32 v47, v36, 0x2, v34", isDebug);
     rv["HasSMulHi"]
         = tryAssembler(isaVersion, assemblerPath, "s_mul_hi_u32 s47, s36, s34", isDebug);
-
+    rv["HasScalarStore"]
+        = tryAssembler(isaVersion, assemblerPath, "s_store_dword s79, s[s70:s71], s77", isDebug);
     rv["HasMFMA_explictB"] = tryAssembler(
         isaVersion, assemblerPath, "v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]", isDebug);
     rv["HasMFMA"] = tryAssembler(isaVersion,
@@ -270,6 +271,26 @@ inline std::map<std::string, int>
                           assemblerPath,
                           "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, glc",
                           isDebug);
+    rv["HasDLCModifier"]
+        = tryAssembler(isaVersion,
+                       assemblerPath,
+                       "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, dlc",
+                       isDebug)
+          || tryAssembler(isaVersion,
+                          assemblerPath,
+                          "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, dlc",
+                          isDebug);
+    rv["HasSCOPEModifier"]
+        = tryAssembler(
+              isaVersion,
+              assemblerPath,
+              "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, scope:SCOPE_DEV",
+              isDebug)
+          || tryAssembler(
+              isaVersion,
+              assemblerPath,
+              "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, scope:SCOPE_DEV",
+              isDebug);
     rv["HasMUBUFConst"] = tryAssembler(isaVersion,
                                        assemblerPath,
                                        "buffer_load_dword v40, v36, s[24:27], 1 offen offset:0",
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
index 5c105cc63fe..8b21934b958 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp
@@ -775,7 +775,7 @@ namespace rocisa
                           std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                           const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -799,7 +799,7 @@ namespace rocisa
                         std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                         const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -823,7 +823,7 @@ namespace rocisa
                            std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                            const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -847,7 +847,7 @@ namespace rocisa
                          std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                          const std::string&                        comment = "")
             : MUBUFReadInstruction(
-                InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1110,7 +1110,7 @@ namespace rocisa
                            std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                            const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1134,7 +1134,7 @@ namespace rocisa
                          std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                          const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1158,7 +1158,7 @@ namespace rocisa
                             std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                             const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
@@ -1182,7 +1182,7 @@ namespace rocisa
                           std::optional<MUBUFModifiers>             mubuf   = std::nullopt,
                           const std::string&                        comment = "")
             : MUBUFStoreInstruction(
-                InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment)
+                  InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment)
         {
         }
 
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
index 966406f58f4..b9c8d246668 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp
@@ -105,10 +105,12 @@ namespace rocisa
                                                  const std::string& name,
                                                  float              regNum  = 1.f,
                                                  bool               isMacro = false,
-                                                 bool               isAbs   = false)
+                                                 bool               isAbs   = false,
+                                                 bool               isOff   = false)
     {
         RegName regname = generateRegName(name);
-        return std::make_shared<RegisterContainer>(gprType, regname, isAbs, isMacro, -1, regNum);
+        return std::make_shared<RegisterContainer>(
+            gprType, regname, isAbs, isMacro, isOff, -1, regNum);
     }
 
     // Overloaded functions to create specific GPR containers with default regNum = 1.f
@@ -123,9 +125,9 @@ namespace rocisa
     }
 
     std::shared_ptr<RegisterContainer>
-        vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs)
+        vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs, bool isOff)
     {
-        return createGPR("v", name, regNum, isMacro, isAbs);
+        return createGPR("v", name, regNum, isMacro, isAbs, isOff);
     }
 
     std::shared_ptr<RegisterContainer> sgpr(const Holder& holder, float regNum)
@@ -187,11 +189,12 @@ void init_containers(nb::module_ m)
               nb::arg("idx"),
               nb::arg("regNum") = 1.f);
     m_con.def("vgpr",
-              nb::overload_cast<const std::string&, float, bool, bool>(&rocisa::vgpr),
+              nb::overload_cast<const std::string&, float, bool, bool, bool>(&rocisa::vgpr),
               nb::arg("name"),
               nb::arg("regNum")  = 1.f,
               nb::arg("isMacro") = false,
-              nb::arg("isAbs")   = false);
+              nb::arg("isAbs")   = false,
+              nb::arg("isOff")   = false);
 
     m_con.def("sgpr",
               nb::overload_cast<const rocisa::Holder&, float>(&rocisa::sgpr),
@@ -276,10 +279,12 @@ void init_containers(nb::module_ m)
                 std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
         });
     nb::class_<rocisa::FLATModifiers, rocisa::Container>(m_con, "FLATModifiers")
-        .def(nb::init<int, bool, bool, bool, bool>(),
+        .def(nb::init<int, bool, bool, bool, rocisa::CacheScope, bool, bool>(),
              nb::arg("offset12") = 0,
              nb::arg("glc")      = false,
              nb::arg("slc")      = false,
+             nb::arg("dlc")      = false,
+             nb::arg("scope")    = 0,
              nb::arg("lds")      = false,
              nb::arg("isStore")  = false)
         .def_rw("isStore", &rocisa::FLATModifiers::isStore)
@@ -290,21 +295,34 @@ void init_containers(nb::module_ m)
              })
         .def("__getstate__",
              [](const rocisa::FLATModifiers& self) {
-                 return std::make_tuple(self.offset12, self.glc, self.slc, self.lds, self.isStore);
+                 return std::make_tuple(self.offset12,
+                                        self.glc,
+                                        self.slc,
+                                        self.dlc,
+                                        self.scope,
+                                        self.lds,
+                                        self.isStore);
              })
-        .def(
-            "__setstate__",
-            [](rocisa::FLATModifiers& self, std::tuple<int, bool, bool, bool, bool> t) {
-                new(&self) rocisa::FLATModifiers(
-                    std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
-            });
+        .def("__setstate__",
+             [](rocisa::FLATModifiers&                                            self,
+                std::tuple<int, bool, bool, bool, rocisa::CacheScope, bool, bool> t) {
+                 new(&self) rocisa::FLATModifiers(std::get<0>(t),
+                                                  std::get<1>(t),
+                                                  std::get<2>(t),
+                                                  std::get<3>(t),
+                                                  std::get<4>(t),
+                                                  std::get<5>(t),
+                                                  std::get<6>(t));
+             });
 
     nb::class_<rocisa::MUBUFModifiers, rocisa::Container>(m_con, "MUBUFModifiers")
-        .def(nb::init<bool, int, bool, bool, bool, bool, bool>(),
+        .def(nb::init<bool, int, bool, bool, bool, rocisa::CacheScope, bool, bool, bool>(),
              nb::arg("offen")    = false,
              nb::arg("offset12") = 0,
              nb::arg("glc")      = false,
              nb::arg("slc")      = false,
+             nb::arg("dlc")      = false,
+             nb::arg("scope")    = 0,
              nb::arg("nt")       = false,
              nb::arg("lds")      = false,
              nb::arg("isStore")  = false)
@@ -314,27 +332,37 @@ void init_containers(nb::module_ m)
              [](const rocisa::MUBUFModifiers& self, nb::dict&) {
                  return rocisa::MUBUFModifiers(self);
              })
-        .def(
-            "__getstate__",
-            [](const rocisa::MUBUFModifiers& self) {
-                return std::make_tuple(
-                    self.offen, self.offset12, self.glc, self.slc, self.nt, self.lds, self.isStore);
-            })
+        .def("__getstate__",
+             [](const rocisa::MUBUFModifiers& self) {
+                 return std::make_tuple(self.offen,
+                                        self.offset12,
+                                        self.glc,
+                                        self.slc,
+                                        self.dlc,
+                                        self.scope,
+                                        self.nt,
+                                        self.lds,
+                                        self.isStore);
+             })
         .def("__setstate__",
-             [](rocisa::MUBUFModifiers&                             self,
-                std::tuple<bool, int, bool, bool, bool, bool, bool> t) {
+             [](rocisa::MUBUFModifiers&                                                       self,
+                std::tuple<bool, int, bool, bool, bool, rocisa::CacheScope, bool, bool, bool> t) {
                  new(&self) rocisa::MUBUFModifiers(std::get<0>(t),
                                                    std::get<1>(t),
                                                    std::get<2>(t),
                                                    std::get<3>(t),
                                                    std::get<4>(t),
                                                    std::get<5>(t),
-                                                   std::get<6>(t));
+                                                   std::get<6>(t),
+                                                   std::get<7>(t),
+                                                   std::get<8>(t));
              });
 
     nb::class_<rocisa::SMEMModifiers, rocisa::Container>(m_con, "SMEMModifiers")
-        .def(nb::init<bool, bool, int>(),
+        .def(nb::init<bool, bool, rocisa::CacheScope, bool, int>(),
              nb::arg("glc")    = false,
+             nb::arg("dlc")    = false,
+             nb::arg("scope")  = 0,
              nb::arg("nv")     = false,
              nb::arg("offset") = 0)
         .def("__str__", &rocisa::SMEMModifiers::toString)
@@ -344,11 +372,15 @@ void init_containers(nb::module_ m)
              })
         .def("__getstate__",
              [](const rocisa::SMEMModifiers& self) {
-                 return std::make_tuple(self.glc, self.nv, self.offset);
+                 return std::make_tuple(self.glc, self.dlc, self.scope, self.nv, self.offset);
              })
-        .def("__setstate__", [](rocisa::SMEMModifiers& self, std::tuple<bool, bool, int> t) {
-            new(&self) rocisa::SMEMModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t));
-        });
+        .def(
+            "__setstate__",
+            [](rocisa::SMEMModifiers&                                self,
+               std::tuple<bool, bool, rocisa::CacheScope, bool, int> t) {
+                new(&self) rocisa::SMEMModifiers(
+                    std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t));
+            });
 
     nb::class_<rocisa::SDWAModifiers, rocisa::Container>(m_con, "SDWAModifiers")
         .def(nb::init<rocisa::SelectBit, rocisa::UnusedBit, rocisa::SelectBit, rocisa::SelectBit>(),
diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
index d09ed7c4108..be435f90255 100644
--- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
+++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp
@@ -128,6 +128,14 @@ void init_enum(nb::module_ m)
         .value("UNUSED_PRESERVE", rocisa::UnusedBit::UNUSED_PRESERVE)
         .export_values();
 
+    nb::enum_<rocisa::CacheScope>(m_enum, "CacheScope")
+        .value("SCOPE_NONE", rocisa::CacheScope::SCOPE_NONE)
+        .value("SCOPE_CU", rocisa::CacheScope::SCOPE_CU)
+        .value("SCOPE_SE", rocisa::CacheScope::SCOPE_SE)
+        .value("SCOPE_DEV", rocisa::CacheScope::SCOPE_DEV)
+        .value("SCOPE_SYS", rocisa::CacheScope::SCOPE_SYS)
+        .export_values();
+
     nb::enum_<rocisa::CvtType>(m_enum, "CvtType")
         .value("CVT_F16_to_F32", rocisa::CvtType::CVT_F16_to_F32)
         .value("CVT_F32_to_F16", rocisa::CvtType::CVT_F32_to_F16)
diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
index c550419085a..9fcb7ce6d78 100644
--- a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
+++ b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py
@@ -41,15 +41,23 @@ def test_containers():
     assert str(ds_modifiers) == " offset:2 gds"
 
     # Test FLATModifiers
-    flat_modifiers = rocisa.container.FLATModifiers(8, True, False, True, False)
+    flat_modifiers = rocisa.container.FLATModifiers(
+        offset12=8, glc=True, slc=False, dlc=False,
+        scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        lds=True, isStore=False)
     assert str(flat_modifiers) == " offset:8 glc lds"
 
     # Test MUBUFModifiers
-    mubuf_modifiers = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True)
-    assert str(mubuf_modifiers) == " offen offset:12, glc"
+    mubuf_modifiers = rocisa.container.MUBUFModifiers(
+        offen=True, offset12=12, glc=True, slc=False,
+        dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nt=True, lds=False, isStore=True)
+    assert str(mubuf_modifiers) == " offen offset:12 glc"
 
     # Test SMEMModifiers
-    smem_modifiers = rocisa.container.SMEMModifiers(True, False, 8)
+    smem_modifiers = rocisa.container.SMEMModifiers(
+        glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nv=False, offset=8)
     assert str(smem_modifiers) == " offset:8 glc"
 
     # Test SDWAModifiers
@@ -104,7 +112,7 @@ def test_containers():
 
     # Test Holder
     from rocisa.container import vgpr, sgpr, accvgpr, mgpr, Holder
-    holder = Holder("holder");
+    holder = Holder("holder")
     assert holder.idx == -1
     assert str(holder.name) == "holder"
     assert isinstance(holder, Holder)
@@ -120,6 +128,8 @@ def test_containers():
     assert str(testGpr) == "acc4"
     testGpr = mgpr(holder)
     assert str(testGpr) == "m[mgprholder]"
+    testGpr = vgpr("off", isOff=True)
+    assert str(testGpr) == "off"
 
 def test_containers_copy():
     def copy_test(name, obj):
@@ -133,9 +143,14 @@ def copy_test(name, obj):
     copy_test("DSModifiers", ds)
     flat = rocisa.container.FLATModifiers(lds=True)
     copy_test("FLATModifiers", flat)
-    mubuf = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True)
+    mubuf = rocisa.container.MUBUFModifiers(
+        offen=True, offset12=12, glc=True, slc=False,
+        dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nt=True, lds=False, isStore=True)
     copy_test("MUBUFModifiers", mubuf)
-    smem = rocisa.container.SMEMModifiers(True, False, 8)
+    smem = rocisa.container.SMEMModifiers(
+        glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE,
+        nv=False, offset=8)
     copy_test("SMEMModifiers", smem)
     sdwa = rocisa.container.SDWAModifiers(
         dst_sel=rocisa.enum.SelectBit.WORD_0,
diff --git a/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp b/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp
index c65a994e6bb..e355539cb66 100644
--- a/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp
+++ b/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp
@@ -40,7 +40,13 @@ namespace TensileLite
                     8, 17, 1.21875121875121875122 * 6, 4, 4, 1.5e-2, 1.5)},
                {Hardware::Architecture::gfx950,
                 Hardware::ArchitectureConstants(
-                    8, 17, 1.21875121875121875122 * 7, 6, 4, 0.008, 1.5)}};
+                    8, 17, 1.21875121875121875122 * 7, 6, 4, 0.008, 1.5)},
+               {Hardware::Architecture::gfx1201,
+                Hardware::ArchitectureConstants(
+                    1, 5.74, 1.21875121875121875122 * 2.41, 0.464, 2, 0.17, 1.5)},
+               {Hardware::Architecture::gfx1100,
+                Hardware::ArchitectureConstants(
+                    1, 7.12, 1.21875121875121875122 * 3.48, 0.732, 2, 0.11, 1.5)}};
 
         // Schema : (MI_M, MI_N, MI_K, DataType)
         const std::unordered_map<Hardware::Architecture,
@@ -196,6 +202,46 @@ namespace TensileLite
                     // DOT2
                     {MatrixInstruction( 1,  1,  64, DataType::Half), 16}, // V_DOT2_F32_F16
                     {MatrixInstruction( 1,  1,  64, DataType::BFloat16), 16}, // V_DOT2_F32_BF16
+                }},
+               {Hardware::Architecture::gfx1201,
+                {
+                    // F16
+                    {MatrixInstruction(16, 16, 16, DataType::Half), 16}, // v_wmma_f16_16x16x16_f16/v_wmma_f32_16x16x16_f16
+                    {MatrixInstruction(16, 16, 32, DataType::Half), 16}, // v_swmmac_f16_16x16x32_f16/v_swmmac_f32_16x16x32_f16
+                    // BF16
+                    {MatrixInstruction(16, 16, 16, DataType::BFloat16), 16}, // v_wmma_bf16_16x16x16_bf16/v_wmma_f32_16x16x16_bf16
+                    {MatrixInstruction(16, 16, 32, DataType::BFloat16), 16}, // v_swmmac_bf16_16x16x32_bf16/v_swmmac_f32_16x16x32_bf16
+                    // F8
+                    {MatrixInstruction(16, 16, 16, DataType::Float8), 8}, // v_wmma_f32_16x16x16_fp8_fp8
+                    {MatrixInstruction(16, 16, 32, DataType::Float8), 8}, // v_swmmac_f32_16x16x32_fp8_fp8
+                    // F8B8
+                    {MatrixInstruction(16, 16, 16, DataType::Float8BFloat8), 8}, // v_wmma_f32_16x16x16_fp8_bf8
+                    {MatrixInstruction(16, 16, 32, DataType::Float8BFloat8), 8}, // v_swmmac_f32_16x16x32_fp8_bf8
+                    // B8F8
+                    {MatrixInstruction(16, 16, 16, DataType::BFloat8Float8), 8}, // v_wmma_f32_16x16x16_bf8_fp8
+                    {MatrixInstruction(16, 16, 32, DataType::BFloat8Float8), 8}, // v_swmmac_f32_16x16x32_bf8_fp8
+                    // B8
+                    {MatrixInstruction(16, 16, 16, DataType::BFloat8), 8}, // v_wmma_f32_16x16x16_bf8_bf8
+                    {MatrixInstruction(16, 16, 32, DataType::BFloat8), 8}, // v_swmmac_f32_16x16x32_bf8_bf8
+                    // I8
+                    {MatrixInstruction(16, 16, 16, DataType::Int8), 8}, // v_wmma_i32_16x16x16_iu8
+                    {MatrixInstruction(16, 16, 32, DataType::Int8), 8}, // v_swmmac_i32_16x16x32_iu8
+                    // I4
+                    {MatrixInstruction(16, 16, 16, DataType::Int4), 8}, // v_wmma_i32_16x16x16_iu4
+                    {MatrixInstruction(16, 16, 32, DataType::Int4), 8}, // v_wmma_i32_16x16x32_iu4
+                    {MatrixInstruction(16, 16, 32, DataType::Int4), 8}, // v_swmmac_i32_16x16x32_iu4
+                    {MatrixInstruction(16, 16, 64, DataType::Int4), 8}, // v_swmmac_i32_16x16x64_iu4
+                }},
+               {Hardware::Architecture::gfx1100,
+                {
+                    // F16
+                    {MatrixInstruction(16, 16, 16, DataType::Half), 16}, // v_wmma_f32_16x16x16_f16/v_wmma_f16_16x16x16_f16
+                    // BF16
+                    {MatrixInstruction(16, 16, 16, DataType::BFloat16), 16}, // v_wmma_f32_16x16x16_bf16/v_wmma_bf16_16x16x16_bf16
+                    // I8
+                    {MatrixInstruction(16, 16, 16, DataType::Int8), 8}, // v_wmma_i32_16x16x16_iu8
+                    // I4
+                    {MatrixInstruction(16, 16, 16, DataType::Int4), 8}, // v_wmma_i32_16x16x16_iu4
                 }}};
 
     } // namespace analytical