Merged
71 commits
20f3726
Use correct init method for fp8
ex-rzr Dec 12, 2025
aaa6777
Use smaller tolerance for LSE, check if before OUT
ex-rzr Dec 12, 2025
c7f724b
Create a base for MX FMHA: FP8 with 16x16x128 & 32x32x64 tiles withou…
ex-rzr Dec 12, 2025
5f1eb84
Extend enums, structs and args with mx-related values
ex-rzr Dec 15, 2025
026d1a2
Implement host side: generation of scales, validation with mx gemm
ex-rzr Dec 17, 2025
a0037ae
Pass dram windows for scales from kernel to pipeline
ex-rzr Dec 18, 2025
a58354c
Clone BlockGemmARegBSmemCRegV2 as BlockGemmMxARegBSmemCRegV1
ex-rzr Dec 19, 2025
cad0d3d
Support scales in WarpGemmAttributeMfma...
ex-rzr Dec 19, 2025
7a86e3d
Add scales for Q to the first GEMM
ex-rzr Dec 19, 2025
330b3c9
Add scales for K to the first GEMM
ex-rzr Dec 19, 2025
e26f744
Add scales for V to the second GEMM
ex-rzr Jan 6, 2026
ea3e97e
Shuffle K during loading from DRAM instead of LDS
ex-rzr Jan 8, 2026
fea9090
Implement simple calculation of scales for P
ex-rzr Jan 12, 2026
3a6d38a
Use cvt_scalef32_pk_fp8_f32 for P
ex-rzr Jan 12, 2026
c66c8f5
Use full fp8 range for P (448.0 instead of 1.0)
ex-rzr Jan 12, 2026
61db443
Fix cases when N0 != K1 (k1_loops > 1)
ex-rzr Jan 14, 2026
ab7f751
Support hdim=128 and 32x32x64 MFMA
ex-rzr Jan 14, 2026
c97cb10
Support bf8
ex-rzr Jan 15, 2026
dfc82f4
Fix get_y_sliced_thread_data (and hence get_slice_tile) for pk_fp4_t
ex-rzr Jan 15, 2026
a8ace9b
Support fp4
ex-rzr Jan 15, 2026
28ba17f
Fix bias initialization range with init_method=3
ex-rzr Jan 20, 2026
6c82bdc
Move K and K scales shuffling into BlockGemmMx
ex-rzr Jan 20, 2026
c0c3c05
Extract P mx casting into a separate function
ex-rzr Jan 21, 2026
d526919
Add fp4 traits, instances and tests
ex-rzr Jan 21, 2026
e9437d9
Fix errors after rebasing onto recent blockscale changes
ex-rzr Jan 22, 2026
4625670
Fix alignment of Q, K, V for fp4
ex-rzr Jan 22, 2026
0ac81de
Replace NaN e8m0_t with 0 for invalid (padded) scale values
ex-rzr Jan 23, 2026
1b78f66
Implement group mode
ex-rzr Jan 23, 2026
496c1a5
Use PackedSize for pointers modified with head/batch offsets
ex-rzr Jan 23, 2026
4aecdc4
Ensure that hdim_q and seqlen_k are even for fp4
ex-rzr Jan 23, 2026
c361f15
Fix compilation of pipelines and types without MX support
ex-rzr Jan 26, 2026
0571980
Ignore seqlen_kpads as it is not supported yet
ex-rzr Jan 27, 2026
96ca0e8
Fix V scale loading with windowed masks (like t:64,64)
ex-rzr Jan 27, 2026
2238668
Enable padded pipelines
ex-rzr Jan 27, 2026
957801a
Use PackedSize for fp4 alignment and LDS size calculations
ex-rzr Jan 27, 2026
12b8955
Add kScaleGranularity to WarpGemmAttributeMfmaImpl
ex-rzr Feb 3, 2026
e6710df
Update changelog
ex-rzr Feb 3, 2026
cb0d423
Fix MakePScaleRegTileDistribution
ex-rzr Feb 3, 2026
a04d839
Use faster tile size for hdim=256
ex-rzr Feb 3, 2026
e64cb22
Ensure that kv_eff_lens_per_batch values are even for fp4
ex-rzr Feb 3, 2026
a2fdd9c
Do not build mx tests on old archs without mx support
ex-rzr Feb 3, 2026
1fb0d96
Disable PaddingCases tests for mx types (like for fp8)
ex-rzr Feb 3, 2026
709862d
Refactor common constants in BlockGemmMxARegBSmemCRegV1
ex-rzr Feb 4, 2026
b3f3190
Move TargetCMPerLane out of block gemm
ex-rzr Feb 4, 2026
16b60ec
Fix reference_batched_mx.hpp file name
ex-rzr Feb 4, 2026
0e82f1a
Add new mx tests to REGRESSION_TESTS
ex-rzr Feb 6, 2026
b4d3985
Update help message for -qscale
ex-rzr Feb 6, 2026
3f88131
Extend static checks for Q and Q scale DRAM windows in K dim
ex-rzr Feb 9, 2026
c655e47
Improve comments with cast_tile_mx's implementation details
ex-rzr Feb 9, 2026
01cc47c
Format with clang-format-18
ex-rzr Feb 9, 2026
8c6095e
Fix compilation for c++17
ex-rzr Feb 9, 2026
3093308
Merge branch 'develop'
ex-rzr Feb 11, 2026
123422d
Update after merging develop
ex-rzr Feb 11, 2026
654a5ef
Build FMHA tests per type based on available instances
ex-rzr Feb 11, 2026
147c500
Merge branch 'develop'
ex-rzr Feb 12, 2026
5698038
Make cast_tile_mx more generic
ex-rzr Feb 12, 2026
f175504
Replace exp2 and log2 in cast_tile_mx with faster arithmetic
ex-rzr Feb 12, 2026
821d892
Merge branch 'develop'
ex-rzr Feb 19, 2026
f560eae
Merge branch 'develop'
ex-rzr Feb 20, 2026
e7cc973
Replace muls of P and O acc with adjusting P scales
ex-rzr Feb 20, 2026
d796f65
Fix rounding seqlen to even for mxfp4 when seqlen_k = -1
ex-rzr Feb 20, 2026
4a6456c
Merge branch 'develop'
ex-rzr Feb 21, 2026
06b1368
Merge branch 'develop'
ex-rzr Feb 23, 2026
0054db8
Use more precise algorithm of MX scale calculation
ex-rzr Feb 23, 2026
ebbadc4
Merge branch 'develop'
ex-rzr Feb 24, 2026
63aebd6
Merge branch 'develop'
ex-rzr Feb 27, 2026
2a378cd
Fix ambiguity in oob loading with customized_value for e8m0_t
ex-rzr Feb 27, 2026
591df34
Merge branch 'develop'
ex-rzr Mar 5, 2026
6be97d6
Merge branch 'develop'
ex-rzr Mar 10, 2026
911dd5d
Remove duplicated code
ex-rzr Mar 10, 2026
477ebee
Merge branch 'develop'
ex-rzr Mar 11, 2026
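Several of the commits above deal with computing per-block MX scales for P: a simple scale calculation, use of the full fp8 range (448.0 instead of 1.0), replacing exp2/log2 with faster arithmetic, and a more precise scale algorithm. The sketch below illustrates the general idea of picking a shared power-of-two e8m0 scale per 32-element block. It is a hypothetical reference, not the kernel's `cast_tile_mx` implementation, and the ceil-based clamping strategy is an assumption.

```python
import math

FP8_E4M3_MAX = 448.0  # full fp8 e4m3 range, as in the "448.0 instead of 1.0" commit
E8M0_BIAS = 127       # e8m0 is exponent-only: byte b encodes 2**(b - 127)

def mx_quantize_block(block):
    """Pick a shared power-of-two scale for one block of values.

    Returns (scale_byte, scaled_values); the scaled values would then be
    cast to fp8. Hypothetical reference, not the kernel code.
    """
    amax = max(abs(v) for v in block)
    if amax == 0.0:
        return E8M0_BIAS, [0.0] * len(block)  # scale = 2**0 = 1
    # smallest power-of-two scale that maps amax into the fp8 range
    exp = math.ceil(math.log2(amax / FP8_E4M3_MAX))
    exp = max(-E8M0_BIAS, min(E8M0_BIAS, exp))  # representable e8m0 exponents
    scale = 2.0 ** exp
    return exp + E8M0_BIAS, [v / scale for v in block]
```

A faster variant would derive `exp` from the float's raw exponent bits rather than calling `log2`, which appears to be the point of the "faster arithmetic" commit.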
1 change: 1 addition & 0 deletions projects/composablekernel/CHANGELOG.md
@@ -21,6 +21,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
* Added persistent async input scheduler for CK Tile universal GEMM kernels to support asynchronous input streaming.
* Added FP8 block scale quantization for FMHA forward kernel.
* Added gfx11 support for FMHA.
* Added microscaling (MX) FP8/FP4 support on gfx950 for FMHA forward kernel ("qr" pipeline only).

### Changed

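For reference, the e8m0 scale type used by the new MX types is an 8-bit, exponent-only encoding; one commit in this PR replaces NaN e8m0 values with 0 for padded scales. A minimal decode sketch, assuming the standard OCP microscaling encoding with bias 127 and 0xFF reserved for NaN:

```python
import math

E8M0_NAN = 0xFF  # the only non-finite e8m0 encoding

def e8m0_decode(byte):
    """Decode an e8m0 scale byte to a float: 2**(byte - 127); 0xFF -> NaN."""
    if byte == E8M0_NAN:
        return float("nan")
    return 2.0 ** (byte - 127)
```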
@@ -9,6 +9,8 @@
"fp8fp16": "FmhaFwdFp8Fp16",
"fp8bf16": "FmhaFwdFp8Bf16",
"fp8fp32": "FmhaFwdFp8Fp32",
"mxfp8": "FmhaFwdMxFp8",
"mxfp4": "FmhaFwdMxFp4",
}

BWD_DTYPE_MAP = {"fp32": "FmhaBwdFp32", "fp16": "FmhaBwdFp16", "bf16": "FmhaBwdBf16"}
@@ -79,13 +81,15 @@ def get_mask_cpp_check_expr(mask: str) -> str:
"pertensor": "ck_tile::BlockAttentionQuantScaleEnum::PERTENSOR",
"blockscale": "ck_tile::BlockAttentionQuantScaleEnum::BLOCKSCALE",
"kv_blockscale": "ck_tile::BlockAttentionQuantScaleEnum::KV_BLOCKSCALE",
"mx": "ck_tile::BlockAttentionQuantScaleEnum::MX",
}

QSCALE_CHECK_MAP = {
"no": "quant_scale_enum::no_scale",
"pertensor": "quant_scale_enum::pertensor",
"blockscale": "quant_scale_enum::blockscale",
"kv_blockscale": "quant_scale_enum::kv_blockscale",
"mx": "quant_scale_enum::mx",
}

BIAS_MAP = {
@@ -38,6 +38,8 @@
"fp8bf16": 8,
"fp8fp32": 8,
"bf8": 8,
"mxfp8": 8,
"mxfp4": 4,
}

K0_MAX_SUBMAX_MAP = {
@@ -836,7 +838,8 @@ def get_rules(cls) -> List[CompatibilityRule]:
def check_hdim_tile(
problem_ctx: ProblemContext, kernel_ctx: KernelContext
) -> bool:
if problem_ctx.dtype != "fp32":
# FIX: too confusing that it has to know about mx types
if problem_ctx.dtype not in ("fp32", "mxfp8", "mxfp4"):
# TODO: update if >=gfx11 archs get qr_async and qr_async_trload support
if kernel_ctx.pipeline.tag in cls._AVAILABLE_PIPELINES and (
(
@@ -966,8 +969,6 @@ def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]:
return {
(128, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1)],
} # fmt: skip
else:
raise ValueError(f"unsupported dtype={dtype}")

# TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
# support this in future
@@ -1035,9 +1036,6 @@ def get_pipelines(
else:
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
elif dtype in ["fp8", "fp8fp16", "bf8"]:
# TODO
pass
return pipelines


@@ -1046,6 +1044,17 @@ class KernelComponentFactoryGfx950(
):
arch = ArchTrait("gfx950")

_DT_MXFP8 = ("mxfp8",)
_DT_MXFP4 = ("mxfp4",)

@classmethod
def supported_dtypes(cls) -> Tuple[str]:
return (
KernelComponentFactoryGfx9.supported_dtypes()
+ cls._DT_MXFP8
+ cls._DT_MXFP4
)

@classmethod
def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]:
result = KernelComponentFactoryGfx9.get_hdim_tile_size_dict(dtype)
@@ -1054,6 +1063,18 @@ def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]:
if (128, 128) in result.keys():
result[(128, 128)].append(
FmhaFwdTileSize(256, 32, 128, 128, 32, 128, 8, 1, 1, 8, 1, 1, 32, 32, 16, 32, 32, 16, -1)) # fmt: skip
elif dtype in cls._DT_MXFP8:
return {
# bm0, bn0, bk0, bn1, bk1,
(128, 128) : [FmhaFwdTileSize(128, 128, 64, 128, 64, 128, 4, 1, 1, 4, 1, 1, 32, 32, 64, 32, 32, 64, -1)],
(256, 256) : [FmhaFwdTileSize(128, 128, 128, 256, 128, 256, 4, 1, 1, 4, 1, 1, 16, 16, 128, 16, 16, 128, -1)],
} # fmt: skip
elif dtype in cls._DT_MXFP4:
return {
# bm0, bn0, bk0, bn1, bk1,
(128, 128) : [FmhaFwdTileSize(128, 128, 64, 128, 64, 128, 4, 1, 1, 4, 1, 1, 32, 32, 64, 32, 32, 64, -1)],
(256, 256) : [FmhaFwdTileSize(128, 128, 128, 256, 128, 256, 4, 1, 1, 4, 1, 1, 16, 16, 128, 16, 16, 128, -1)],
} # fmt: skip
return result

@classmethod
@@ -1091,6 +1112,19 @@ def get_pipelines(
pipelines.append(FmhaFwdPipeline("qr_async_trload_v3", "row", "t", "t", "f", "f",
F_logits=logits, F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t", F_sink="f")) # fmt: skip

elif dtype in cls._DT_MXFP8 or dtype in cls._DT_MXFP4:
# no need dropout kernels
lse = "t"
dropout = "f"
for logits, qscale, mask, bias, sink in itertools.product(
["f"],
["mx"],
get_mask_map(mask_impl).keys(),
["no"],
["f", "t"],
):
pipelines.append(FmhaFwdPipeline("qr", "col", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, "f", "f", sink)) # fmt: skip
pipelines.append(FmhaFwdPipeline("qr", "col", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, "f", "f", sink)) # fmt: skip
return pipelines


@@ -48,8 +48,12 @@ auto create_args(int argc, char* argv[])
.insert("scale_s", "0", "scale factor of S. 0 means equal to 1/sqrt(hdim)")
.insert("qscale",
"n",
"n or 0, no scale\n"
"pt or 1, per-tensor scale\n")
"quant scale:\n"
" n or 0, no scale\n"
" pt or 1, per-tensor scale\n"
" bs or 2, block scale\n"
" kvbs or 3, Q per-tensor, K/V per-page block scale\n"
" mx or 4, microscaling (exclusively for data types like mxfp8 and mxfp4)")
.insert("logits_soft_cap", "0", "attention logits soft capping value.")
.insert("iperm",
"1",
@@ -61,7 +65,7 @@
"n or 0, no bias\n"
"e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
"a(libi) or 2, alibi with 1*h. a:1, b*h")
.insert("prec", "fp16", "data type. fp32/fp16/bf16/fp8/bf8")
.insert("prec", "fp16", "data type: fp32/fp16/bf16/fp8/fp8bf16/fp8fp32/mxfp8/mxfp4")
.insert("mask",
"0",
"0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
@@ -231,6 +235,10 @@ int main(int argc, char* argv[])
{
return run<FmhaFwdBf16>(arg_parser) == fwd_result::success ? 0 : -2;
}
else if(data_type == "fp8")
{
return run<FmhaFwdFp8>(arg_parser) == fwd_result::success ? 0 : -2;
}
else if(data_type == "fp8bf16")
{
return run<FmhaFwdFp8Bf16>(arg_parser) == fwd_result::success ? 0 : -2;
@@ -239,6 +247,14 @@
{
return run<FmhaFwdFp8Fp32>(arg_parser) == fwd_result::success ? 0 : -2;
}
else if(data_type == "mxfp8")
{
return run<FmhaFwdMxFp8>(arg_parser) == fwd_result::success ? 0 : -2;
}
else if(data_type == "mxfp4")
{
return run<FmhaFwdMxFp4>(arg_parser) == fwd_result::success ? 0 : -2;
}
std::cerr << "Unsupported precision: " << data_type << std::endl;
return -1;
}
67 changes: 67 additions & 0 deletions projects/composablekernel/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -50,6 +50,14 @@ struct FmhaFwdFp8Fp32
{
};

struct FmhaFwdMxFp8
{
};

struct FmhaFwdMxFp4
{
};

template <typename DataType>
struct FmhaFwdTypeConfig;

@@ -165,6 +173,54 @@ struct FmhaFwdTypeConfig<FmhaFwdFp8Fp32>
using ODataType = float;
};

template <>
struct FmhaFwdTypeConfig<FmhaFwdMxFp8>
{
using QDataType = ck_tile::fp8_t;
using KDataType = ck_tile::fp8_t;
using VDataType = ck_tile::fp8_t;
using BiasDataType = float;
using RandValOutputDataType = uint8_t;
using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
using SaccDataType = float; // data type for first gemm accumulation
using SMPLComputeDataType = float; // data type for reduction, softmax
using PDataType = ck_tile::fp8_t; // data type for A matrix of second gemm
using OaccDataType = float; // data type for second gemm accumulation
using ODataType = float;

using QScaleDataType = ck_tile::e8m0_t;
using KScaleDataType = ck_tile::e8m0_t;
using VScaleDataType = ck_tile::e8m0_t;
using PScaleDataType = ck_tile::e8m0_t;

static constexpr ck_tile::index_t kQKScaleGranularity = 32;
static constexpr ck_tile::index_t kVScaleGranularity = 32;
};

template <>
struct FmhaFwdTypeConfig<FmhaFwdMxFp4>
{
using QDataType = ck_tile::pk_fp4_t;
using KDataType = ck_tile::pk_fp4_t;
using VDataType = ck_tile::pk_fp4_t;
using BiasDataType = float;
using RandValOutputDataType = uint8_t;
using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
using SaccDataType = float; // data type for first gemm accumulation
using SMPLComputeDataType = float; // data type for reduction, softmax
using PDataType = ck_tile::pk_fp4_t; // data type for A matrix of second gemm
using OaccDataType = float; // data type for second gemm accumulation
using ODataType = float;

using QScaleDataType = ck_tile::e8m0_t;
using KScaleDataType = ck_tile::e8m0_t;
using VScaleDataType = ck_tile::e8m0_t;
using PScaleDataType = ck_tile::e8m0_t;

static constexpr ck_tile::index_t kQKScaleGranularity = 32;
static constexpr ck_tile::index_t kVScaleGranularity = 32;
};

struct FmhaMasks
{
using NoMask = ck_tile::GenericAttentionMask<false>;
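The scale types and the 32-element `kQKScaleGranularity`/`kVScaleGranularity` constants above suggest a host-side reference along the lines of the "validation with mx gemm" commit. The sketch below is hypothetical: `mx_matmul_ref`, its argument shapes, and the per-row scale layout are assumptions, not the layouts the kernel actually uses.

```python
def mx_matmul_ref(a, a_scales, b, b_scales, granularity=32):
    """C = A @ B where every `granularity`-wide chunk of the K dimension
    carries a raw e8m0 scale byte (decoded as 2**(byte - 127)).

    a: M x K rows, b: K x N rows,
    a_scales: M x ceil(K/granularity), b_scales: N x ceil(K/granularity).
    """
    m, k_dim, n = len(a), len(b), len(b[0])
    c = [[0.0] * n for _ in range(m)]
    for i in range(m):
        for j in range(n):
            acc = 0.0
            for k in range(k_dim):
                g = k // granularity
                sa = 2.0 ** (a_scales[i][g] - 127)  # de-scale the A element
                sb = 2.0 ** (b_scales[j][g] - 127)  # de-scale the B element
                acc += (a[i][k] * sa) * (b[k][j] * sb)
            c[i][j] = acc
    return c
```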
@@ -232,6 +288,7 @@ struct fmha_fwd_args
// array [batch + 1]. (Used with padding)
const void* block_scale_seqstart_q_ptr;
const void* block_scale_seqstart_k_ptr;
const void* seqstart_v_scale_ptr;
const void* sink_ptr;

ck_tile::index_t seqlen_q;
@@ -252,6 +309,9 @@
ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
ck_tile::index_t stride_randval;
ck_tile::index_t stride_o;
ck_tile::index_t stride_q_descale;
ck_tile::index_t stride_k_descale;
ck_tile::index_t stride_v_descale;
ck_tile::index_t nhead_stride_q;
ck_tile::index_t nhead_stride_k;
ck_tile::index_t nhead_stride_v;
@@ -635,6 +695,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
args.seqlen_k_ptr,
args.block_scale_seqstart_q_ptr,
args.block_scale_seqstart_k_ptr,
args.seqstart_v_scale_ptr,
args.hdim_q,
args.hdim_v,
args.nhead_q,
@@ -647,6 +708,9 @@
args.stride_bias,
args.stride_randval,
args.stride_o,
args.stride_q_descale,
args.stride_k_descale,
args.stride_v_descale,
args.nhead_stride_q,
args.nhead_stride_k,
args.nhead_stride_v,
@@ -697,6 +761,9 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
args.stride_bias,
args.stride_randval,
args.stride_o,
args.stride_q_descale,
args.stride_k_descale,
args.stride_v_descale,
args.nhead_stride_q,
args.nhead_stride_k,
args.nhead_stride_v,