ROCm · AlexBrownAMD · Feb 26, 2026 · Feb 6, 2026 · Feb 25, 2026 · Feb 26, 2026
@@ -307,9 +307,9 @@ def computeLoadSrd(self, writer, kernel, tP, stmp, tileStart):
         _DepthU = kernel["_DepthU%s" % tc]
         # swizzle
         if (tP["isSwizzled"] and tc == 'A'):
-            _DepthU = (_DepthU * 16) # MI_M = 16
+            _DepthU = (_DepthU * 16)
         elif (tP["isSwizzled"] and tc == 'B'):
-            _DepthU = (_DepthU * 16) # MI_N = 16
+            _DepthU = (_DepthU * 16)
 
         gsucLabel    = Label(label=writer.labels.getNameInc(f"GSUC_{tc}"), comment="")
         gsucLabelEnd = Label(label=writer.labels.getNameInc(f"GSUC_{tc}_End"), comment="")

@@ -176,15 +176,23 @@ def skExtraIters(self, writer, kernel, sSkExtraIters, sTmp):
         return module
 
     @abc.abstractmethod
-    def computeLoadSrd(self, writer, kernel, tc, sTmp):
+    def computeLoadSrd(self, writer, kernel, tP, sTmp):
         pass
 
-    def computeLoadSrdCommon(self, writer, kernel, tc, sTmp):
+    def computeLoadSrdCommon(self, writer, kernel, tP, sTmp):
         module = Module("StreamK Common computeLoadSrd")
 
+        tc = tP["tensorChar"]
+        _DepthU = kernel["_DepthU%s" % tc]
+        # swizzle
+        if (tP["isSwizzled"] and tc == 'A'):
+            _DepthU = (_DepthU * 16)
+        elif (tP["isSwizzled"] and tc == 'B'):
+            _DepthU = (_DepthU * 16)
+
         tileStart = sTmp + 2
         # StreamK partial tile - offset to tile start index
-        module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("StreamKLocalStart"), src1=kernel["DepthU"], comment="StreamK tile start offset"))
+        module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("StreamKLocalStart"), src1=_DepthU, comment="StreamK tile start offset"))
         strideL = writer.strideRef(tc, kernel["ProblemType"]["IndicesSummation"][0])
         module.add(writer.s_mul_u64_u32(sgpr(sTmp), sgpr(sTmp+1), sgpr(sTmp), strideL, comment="StreamK tile start offset"))
         # Overflow check removed
@@ -255,9 +263,16 @@ def graAddressesCommon(self, writer, kernel, tP, vTmp):
         module = Module("StreamK Common graAddresses")
 
         tc = tP["tensorChar"]
+        _DepthU = kernel["_DepthU%s" % tc]
+        # swizzle
+        if (tP["isSwizzled"] and tc == 'A'):
+            _DepthU = (_DepthU * 16)
+        elif (tP["isSwizzled"] and tc == 'B'):
+            _DepthU = (_DepthU * 16)
+
         # StreamK partial tile - offset to tile start index
         tmpOffset = writer.sgprPool.checkOut(2, "skStartOffset")
-        module.add(SMulI32(dst=sgpr(tmpOffset), src0=sgpr("StreamKLocalStart"), src1=int(kernel["DepthU"] * tP["bpe"]), comment="StreamK tile start offset"))
+        module.add(SMulI32(dst=sgpr(tmpOffset), src0=sgpr("StreamKLocalStart"), src1=int(_DepthU * tP["bpe"]), comment="StreamK tile start offset"))
         strideL = writer.strideRef(tc, kernel["ProblemType"]["IndicesSummation"][0])
         module.add(writer.s_mul_u64_u32(sgpr(tmpOffset), sgpr(tmpOffset+1), sgpr(tmpOffset), strideL, "StreamK tile start offset"))
         # Overflow check removed
@@ -1745,7 +1760,7 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):
         module = Module("StreamK Off graWorkGroup")
         return module
 
-    def computeLoadSrd(self, writer, kernel, tc, sTmp):
+    def computeLoadSrd(self, writer, kernel, tP, sTmp):
         module = Module("StreamK Off computeLoadSrd")
         return module
 
@@ -1861,9 +1876,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):
 
         return module
 
-    def computeLoadSrd(self, writer, kernel, tc, sTmp):
+    def computeLoadSrd(self, writer, kernel, tP, sTmp):
         module = Module("StreamK Basic computeLoadSrd")
-        module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
+        module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
         return module
 
     def computeStoreSrdStart(self, writer, kernel):
@@ -1986,9 +2001,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):
 
         return module
 
-    def computeLoadSrd(self, writer, kernel, tc, sTmp):
+    def computeLoadSrd(self, writer, kernel, tP, sTmp):
         module = Module("StreamK TwoTileOriginal computeLoadSrd")
-        module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
+        module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
         return module
 
     def computeStoreSrdStart(self, writer, kernel):
@@ -2256,9 +2271,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):
 
         return module
 
-    def computeLoadSrd(self, writer, kernel, tc, sTmp):
+    def computeLoadSrd(self, writer, kernel, tP, sTmp):
         module = Module("StreamK TwoTileDPFirst computeLoadSrd")
-        module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
+        module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
         return module
 
     def computeStoreSrdStart(self, writer, kernel):

@@ -3617,7 +3617,7 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe):
                     strideF, comment="tlu=0, scaled tile-offset by stride"))
 
         skComponent = Component.StreamK.find(self)
-        module.add(skComponent.computeLoadSrd(self, kernel, tc, stmp))
+        module.add(skComponent.computeLoadSrd(self, kernel, tP, stmp))
 
         gsuComponent = Component.GSU.find(self)
         module.add(gsuComponent.computeLoadSrd(self, kernel, tP, stmp, tileStart))

@@ -0,0 +1,109 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201]
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  MinimumRequiredVersion: 5.0.0
+  PrintLevel: 1
+  # PrintSolutionRejectionReason: True
+  Device: 0
+  CMakeBuildType: Debug
+#  MergeFiles: False
+  KernelTime: True
+  MaxWorkspaceSize: 13421772800
+  DataInitTypeA: 3
+  DataInitTypeB: 3
+  DataInitTypeC: 0
+  DataInitTypeMXSA: 3
+  DataInitTypeMXSB: 3
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 0
+  NumElementsToValidate: -1
+  BoundsCheck: 0
+  KeepBuildTmp: True
+  MaxFileName: 128
+  DeviceLDS: 163840
+  MaxLDS: 163840
+
+BenchmarkProblems:
+  ########################################
+  # FP4SS
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: F4
+      DestDataType: S
+      ComputeDataType: S
+      HighPrecisionAccumulate: True
+      MXBlockA: 32
+      MXBlockB: 32
+      TransposeA: 1
+      TransposeB: 0
+      UseBeta: True
+      Batched: True
+      Activation: True
+      ActivationType: hipblaslt_all
+#      UseScaleAB: "Scalar"
+#      UseScaleCD: True
+      UseScaleAlphaVec: 1
+      UseBias: 1
+      BiasDataTypeList: [s]
+
+    - # BenchmarkProblemSizeGroup - Standard
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - MatrixInstruction:
+          # - [16, 16, 128, 1, 1, 1,1, 1,1]
+          # - [16, 16, 128, 1, 1, 4,2, 2,2]
+          - [16, 16, 128, 1, 1, 2,4, 2,2] # 64x128
+          # - [16, 16, 128, 1, 1, 8,8, 2,2]
+          # - [32, 32,  64, 1, 1, 1,1, 1,1]
+          - [32, 32,  64, 1, 1, 2,2, 2,2] # 128x128
+          # - [32, 32,  64, 1, 1, 4,2, 2,2]
+          # - [32, 32,  64, 1, 1, 2,4, 2,2]
+        - ForceDisableShadowInit: [True, False]
+        # - UseSgprForGRO: [0,1]
+        - UseSgprForGRO: [0]
+        # - DepthU: [64, 128]
+        - DepthU: [128]
+        - AssertFree0ElementMultiple: [1]
+        - AssertFree1ElementMultiple: [1]
+        - AssertSummationElementMultiple: [64]
+        - LocalReadVectorWidth: [16]
+        # - PrefetchGlobalRead: [0,1,2]
+        - PrefetchGlobalRead: [2]
+        # - PrefetchLocalRead: [0,1]
+        - PrefetchLocalRead: [1]
+        # - PreloadKernArgs: [0,1]
+        - PreloadKernArgs: [1]
+        # - ClusterLocalRead: [0,1]
+        - ClusterLocalRead: [1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - GlobalReadVectorWidthA: [16]
+        - GlobalReadVectorWidthB: [16]
+        - ScheduleIterAlg: [3]
+        - InnerUnroll: [1]
+        - TransposeLDS: [1]
+        - WaveSeparateGlobalReadA: [0]
+        - WaveSeparateGlobalReadB: [0]
+        - 1LDSBuffer: [0]
+        - GlobalReadPerMfma: [1]
+        - LocalWritePerMfma: [-1]
+        - StoreVectorWidth: [-1]
+        - SourceSwap: [1]
+        - StreamK: [3]
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [1025, 513, 1, 2048]
+          - Exact: [1044, 532, 1, 2048]
+          - Exact: [127, 127, 1, 640] #special cleanup case
+          - Exact: [128, 128, 1, 128]
+          - Exact: [129, 129, 1, 640]
+        - BiasTypeArgs: ['s']
+        - ActivationArgs:
+          - [Enum: none]
@@ -0,0 +1,109 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201]
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  MinimumRequiredVersion: 5.0.0
+  PrintLevel: 1
+  # PrintSolutionRejectionReason: True
+  Device: 0
+  CMakeBuildType: Debug
+#  MergeFiles: False
+  KernelTime: True
+  MaxWorkspaceSize: 13421772800
+  DataInitTypeA: 3
+  DataInitTypeB: 3
+  DataInitTypeC: 0
+  DataInitTypeMXSA: 3
+  DataInitTypeMXSB: 3
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 0
+  NumElementsToValidate: -1
+  BoundsCheck: 0
+  KeepBuildTmp: True
+  MaxFileName: 128
+  DeviceLDS: 163840
+  MaxLDS: 163840
+
+BenchmarkProblems:
+  ########################################
+  # FP8SS
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: F8
+      DestDataType: S
+      ComputeDataType: S
+      HighPrecisionAccumulate: True
+      MXBlockA: 32
+      MXBlockB: 32
+      TransposeA: 1
+      TransposeB: 0
+      UseBeta: True
+      Batched: True
+      Activation: True
+      ActivationType: hipblaslt_all
+#      UseScaleAB: "Scalar"
+#      UseScaleCD: True
+      UseScaleAlphaVec: 1
+      UseBias: 1
+      BiasDataTypeList: [s]
+
+    - # BenchmarkProblemSizeGroup - Standard
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - MatrixInstruction:
+          # - [16, 16, 128, 1, 1, 1,1, 1,1]
+          # - [16, 16, 128, 1, 1, 4,2, 2,2]
+          - [16, 16, 128, 1, 1, 2,4, 2,2] # 64x128
+          # - [16, 16, 128, 1, 1, 8,8, 2,2]
+          # - [32, 32,  64, 1, 1, 1,1, 1,1]
+          - [32, 32,  64, 1, 1, 2,2, 2,2] # 128x128
+          # - [32, 32,  64, 1, 1, 4,2, 2,2]
+          # - [32, 32,  64, 1, 1, 2,4, 2,2]
+        - ForceDisableShadowInit: [True, False]
+        # - UseSgprForGRO: [0,1]
+        - UseSgprForGRO: [0]
+        # - DepthU: [64, 128]
+        - DepthU: [128]
+        - AssertFree0ElementMultiple: [1]
+        - AssertFree1ElementMultiple: [1]
+        - AssertSummationElementMultiple: [64]
+        - LocalReadVectorWidth: [16]
+        # - PrefetchGlobalRead: [0,1,2]
+        - PrefetchGlobalRead: [2]
+        # - PrefetchLocalRead: [0,1]
+        - PrefetchLocalRead: [1]
+        # - PreloadKernArgs: [0,1]
+        - PreloadKernArgs: [1]
+        # - ClusterLocalRead: [0,1]
+        - ClusterLocalRead: [1]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+        - GlobalReadVectorWidthA: [16]
+        - GlobalReadVectorWidthB: [16]
+        - ScheduleIterAlg: [3]
+        - InnerUnroll: [1]
+        - TransposeLDS: [1]
+        - WaveSeparateGlobalReadA: [0]
+        - WaveSeparateGlobalReadB: [0]
+        - 1LDSBuffer: [0]
+        - GlobalReadPerMfma: [1]
+        - LocalWritePerMfma: [-1]
+        - StoreVectorWidth: [-1]
+        - SourceSwap: [1]
+        - StreamK: [3]
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [1025, 513, 1, 2048]
+          - Exact: [1044, 532, 1, 2048]
+          - Exact: [127, 127, 1, 640] #special cleanup case
+          - Exact: [128, 128, 1, 128]
+          - Exact: [129, 129, 1, 640]
+        - BiasTypeArgs: ['s']
+        - ActivationArgs:
+          - [Enum: none]