Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
64890fb
refactor trtllm-gen enums and gemm runner
IwakuraRein Feb 26, 2026
a730b0c
update gemm headers
IwakuraRein Feb 27, 2026
7ff5f21
add mxfp8 gemm (WIP)
IwakuraRein Feb 27, 2026
9e03b40
wip
IwakuraRein Feb 28, 2026
30c40fe
temporary fix for coexistence of two majorness
IwakuraRein Feb 28, 2026
6eda259
update unit test
IwakuraRein Feb 28, 2026
616f493
add layout A to options
IwakuraRein Mar 2, 2026
ef9726c
update test_mm_mxfp8.py
IwakuraRein Mar 2, 2026
e5009cf
update benchmark
IwakuraRein Mar 2, 2026
66a93be
update cubins
IwakuraRein Mar 3, 2026
e50ad06
update mxfp8 test
IwakuraRein Mar 3, 2026
ca102c4
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 5, 2026
5944694
update artifact
IwakuraRein Mar 5, 2026
aa63322
move sflayout to tllm_enums; expose sf swizzle layout in the mxfp8 qu…
IwakuraRein Mar 9, 2026
9392c1e
update test_mm_mxfp8; update comments in mm_mxfp8
IwakuraRein Mar 9, 2026
ed9620d
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 9, 2026
39335cf
fix merge
IwakuraRein Mar 9, 2026
b0eb943
fix mxfp8 benchmark
IwakuraRein Mar 9, 2026
4750dab
fix benchmark 128x4 layout
IwakuraRein Mar 9, 2026
a59e22e
address comments
IwakuraRein Mar 9, 2026
e3293f6
update comments
IwakuraRein Mar 9, 2026
80356a0
default to 128x4
IwakuraRein Mar 10, 2026
8316e83
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 10, 2026
2a38fde
fix typo in artifacts.py
IwakuraRein Mar 10, 2026
8e5d4e9
update checksum hash
IwakuraRein Mar 11, 2026
305ae95
fix csrc/trtllm_low_latency_gemm_runner.cu
IwakuraRein Mar 13, 2026
f338e78
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 13, 2026
612b98a
fix ci error and typo
IwakuraRein Mar 16, 2026
4ed50cf
add block-major-k
IwakuraRein Mar 16, 2026
e38922c
fix low latency gemm
IwakuraRein Mar 17, 2026
db85eb6
remove silent exit
IwakuraRein Mar 17, 2026
ea91fb1
update benchmark
IwakuraRein Mar 17, 2026
60c828e
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 17, 2026
21b097c
Merge remote-tracking branch 'upstream/main' into mxfp8-gemm
IwakuraRein Mar 19, 2026
eb08363
fix merge
IwakuraRein Mar 19, 2026
0be4849
fix merge
IwakuraRein Mar 19, 2026
5b2f490
fix merge
IwakuraRein Mar 19, 2026
7ca7788
fix typo
IwakuraRein Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/routines/flashinfer_benchmark_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,8 @@ def dtype_str_to_torch_dtype(dtype_str):
"8.6": [],
"8.9": [],
"9.0": [],
"10.0": ["cutlass", "cute-dsl"],
"10.3": ["cutlass", "cute-dsl"],
"10.0": ["cutlass", "cute-dsl", "trtllm"],
"10.3": ["cutlass", "cute-dsl", "trtllm"],
"11.0": ["cutlass"],
"12.0": [],
},
Expand Down
109 changes: 71 additions & 38 deletions benchmarks/routines/gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,12 @@ def testMmMxfp8(args):
res_dtype = args.out_dtype
is_cuda_graph_compatible = not args.no_cuda_graph
run_refcheck = args.refcheck
autotune_supported_backends = ["cutlass", "cute-dsl", "auto"]
autotune_supported_backends = [
"cutlass",
"cute-dsl",
"trtllm",
"auto",
]
res = []

backends = filter_backends_by_compute_capability(backends, args.routine, device)
Expand Down Expand Up @@ -1336,42 +1341,73 @@ def testMmMxfp8(args):
print("[ERROR] No backends to test. Exiting.")
return res

## Prepare input tensors
# Use swizzled layout for optimal performance
is_sf_swizzled_layout = True

inputs = {}
input = torch.randn([m, k], device=device, dtype=torch.bfloat16)
input_mxfp8, input_scale = mxfp8_quantize(
input, is_sf_swizzled_layout=is_sf_swizzled_layout
)

mat2 = torch.randn([n, k], device=device, dtype=torch.bfloat16)
mat2_mxfp8, mat2_scale = mxfp8_quantize(
mat2, is_sf_swizzled_layout=is_sf_swizzled_layout
)
for backend in backends:
## Prepare input tensors
# Use swizzled layout for optimal performance
is_sf_swizzled_layout = backend in ["cutlass", "trtllm"]

if not is_sf_swizzled_layout:
sf_layout_input = flashinfer.SfLayout.layout_linear
elif backend == "cutlass" or args.use_128x4_sf_layout:
sf_layout_input = flashinfer.SfLayout.layout_128x4
elif backend == "trtllm":
if not args.use_128x4_sf_layout:
sf_layout_input = flashinfer.SfLayout.layout_8x4
else:
sf_layout_input = flashinfer.SfLayout.layout_128x4
input_mxfp8, input_scale = mxfp8_quantize(
input, sf_swizzle_layout=sf_layout_input
)
Comment on lines +1350 to +1363
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

"cute-dsl" and "auto" backends receive incorrect input quantization.

Per the mm_mxfp8 documentation (context snippet 1), the "cute-dsl" backend "currently requires swizzled 1D scales" and "auto" "selects the CUTLASS backend." However, is_sf_swizzled_layout is False for both since they're not in ["cutlass", "trtllm"].

This causes mxfp8_quantize to produce non-swizzled scales for backends that expect swizzled layouts, likely resulting in incorrect results or failures.

🔧 Suggested fix
-        is_sf_swizzled_layout = backend in ["cutlass", "trtllm"]
+        is_sf_swizzled_layout = backend in ["cutlass", "trtllm", "cute-dsl", "auto"]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
is_sf_swizzled_layout = backend in ["cutlass", "trtllm"]
input = torch.randn([m, k], device=device, dtype=torch.bfloat16)
input_mxfp8, input_scale = mxfp8_quantize(
input, is_sf_swizzled_layout=is_sf_swizzled_layout
)
input = torch.randn([m, k], device=device, dtype=torch.bfloat16)
input_mxfp8, input_scale = mxfp8_quantize(
input, is_sf_swizzled_layout=is_sf_swizzled_layout
)
is_sf_swizzled_layout = backend in ["cutlass", "trtllm", "cute-dsl", "auto"]
input = torch.randn([m, k], device=device, dtype=torch.bfloat16)
input_mxfp8, input_scale = mxfp8_quantize(
input, is_sf_swizzled_layout=is_sf_swizzled_layout
)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@benchmarks/routines/gemm.py` around lines 1335 - 1340, The backend selection
sets is_sf_swizzled_layout only for ["cutlass","trtllm"], but "cute-dsl"
requires swizzled 1D scales and "auto" maps to CUTLASS, so mxfp8_quantize is
given the wrong layout; update the logic that computes is_sf_swizzled_layout
(used when calling mxfp8_quantize) to also treat backend == "cute-dsl" and
backend == "auto" as swizzled (i.e., set is_sf_swizzled_layout = True for those
values) so the scales passed to mxfp8_quantize match the backend expectations.

# when using trtllm, the shuffle_matrix_sf_a will swizzle the layout.
mat2_mxfp8, mat2_scale = mxfp8_quantize(
mat2,
is_sf_swizzled_layout=False
if backend == "trtllm"
else is_sf_swizzled_layout,
)

if args.verbose >= 2:
print(f"[VVERBOSE] {input_mxfp8.shape = }")
print(f"[VVERBOSE] {input_mxfp8.dtype = }")
print(f"[VVERBOSE] {mat2_mxfp8.shape = }")
print(f"[VVERBOSE] {mat2_mxfp8.dtype = }")
print(f"[VVERBOSE] {input_scale.shape = }")
print(f"[VVERBOSE] {input_scale.dtype = }")
print(f"[VVERBOSE] {mat2_scale.shape = }")
print(f"[VVERBOSE] {mat2_scale.dtype = }")
if backend == "trtllm":
from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a

def run_backend(backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale):
if backend in ["cutlass", "cute-dsl", "auto"]:
return flashinfer.gemm.mm_mxfp8(
a=input_mxfp8,
b=mat2_mxfp8.t(), # mm_mxfp8 expects b.t()
a_descale=input_scale,
b_descale=mat2_scale, # mm_mxfp8 handles swizzled 1D internally
out_dtype=res_dtype,
backend=backend,
mat2_mxfp8 = shuffle_matrix_a(mat2_mxfp8, 128).reshape(n, k)
mat2_scale = shuffle_matrix_sf_a(
mat2_scale.reshape(n, k // 32),
128,
num_elts_per_sf=32,
)
else:
raise ValueError(f"Unsupported backend: {backend}")
mat2_scale = mat2_scale.t()

if args.verbose >= 2:
print(f"[VERBOSE] {backend}: {input_mxfp8.shape = }")
print(f"[VERBOSE] {backend}: {input_mxfp8.dtype = }")
print(f"[VERBOSE] {backend}: {mat2_mxfp8.shape = }")
print(f"[VERBOSE] {backend}: {mat2_mxfp8.dtype = }")
print(f"[VERBOSE] {backend}: {input_scale.shape = }")
print(f"[VERBOSE] {backend}: {input_scale.dtype = }")
print(f"[VERBOSE] {backend}: {mat2_scale.shape = }")
print(f"[VERBOSE] {backend}: {mat2_scale.dtype = }")
inputs[backend] = (input_mxfp8, mat2_mxfp8, input_scale, mat2_scale)

def run_backend(
backend: str,
inputs: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
) -> torch.Tensor:
assert backend in ["cutlass", "trtllm", "cute-dsl", "auto"], (
f"Unsupported backend: {backend}"
)
input_mxfp8, mat2_mxfp8, input_scale, mat2_scale = inputs
return flashinfer.gemm.mm_mxfp8(
a=input_mxfp8,
b=mat2_mxfp8.t(), # mm_mxfp8 expects b.t()
a_descale=input_scale,
b_descale=mat2_scale,
out_dtype=res_dtype,
backend=backend,
use_8x4_sf_layout=backend == "trtllm" and not args.use_128x4_sf_layout,
)

has_reference_output = False
if run_refcheck:
Expand All @@ -1391,10 +1427,7 @@ def run_backend(backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale):
for _ in range(warmup_iters):
run_backend(
cur_backend,
input_mxfp8,
mat2_mxfp8,
input_scale,
mat2_scale,
inputs[cur_backend],
)
elif cache_path:
with autotune(False, cache=cache_path):
Expand All @@ -1406,7 +1439,7 @@ def run_backend(backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale):
for cur_backend in backends:
if run_refcheck:
outputs[cur_backend] = run_backend(
cur_backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale
cur_backend, inputs[cur_backend]
).detach()
backend_times[cur_backend] = bench_gpu_time(
fn=run_backend,
Expand All @@ -1416,7 +1449,7 @@ def run_backend(backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale):
enable_cupti=args.use_cupti,
use_cuda_graph=is_cuda_graph_compatible,
cold_l2_cache=True,
input_args=(cur_backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale),
input_args=(cur_backend, inputs[cur_backend]),
)

# Minimum cosine similarity for swizzled layout
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/routines/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def testNvfp4Quantize(args):
Returns:
dict: List of dictionaries containing performance results
"""
from flashinfer.fp4_quantization import SfLayout
from flashinfer import SfLayout

if args.verbose >= 1:
print("[INFO] Running testNvfp4Quantize")
Expand Down
17 changes: 6 additions & 11 deletions csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
// linear layout. See QuantizationSFLayout enum for more details about the supported layouts.
// returns
void mxfp8_quantize(TensorView input, TensorView valMxFP8, TensorView scaleFP8SF,
bool isSfSwizzledLayout, int64_t alignment, bool enable_pdl) {
int64_t sfSwizzleLayout, int64_t alignment, bool enable_pdl) {
CHECK_CUDA(input);
CHECK_CONTIGUOUS(input);

Expand All @@ -50,8 +50,7 @@ void mxfp8_quantize(TensorView input, TensorView valMxFP8, TensorView scaleFP8SF

const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();

auto const layout = isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED_128x4
: tensorrt_llm::QuantizationSFLayout::LINEAR;
auto const layout = static_cast<tensorrt_llm::QuantizationSFLayout>(sfSwizzleLayout);

#define LAUNCH_MXFP8_QUANTIZE_KERNEL(T) \
tensorrt_llm::kernels::invokeMxFP8Quantization( \
Expand Down Expand Up @@ -94,7 +93,7 @@ inline uint8_t float_to_ue8m0(float value) {

// Used in tests to quantize mxe4m3 tensors on host.
void mxfp8_quantize_host(TensorView x_fp32, TensorView fp8_tensor, TensorView scale_tensor,
bool is_sf_swizzled_layout) {
int64_t sfSwizzleLayout) {
int32_t const sf_vec_size = 32;
auto fp32_dtype = DLDataType{kDLFloat, 32, 1};
CHECK_INPUT_TYPE(x_fp32, fp32_dtype);
Expand All @@ -104,9 +103,7 @@ void mxfp8_quantize_host(TensorView x_fp32, TensorView fp8_tensor, TensorView sc
int hidden_dim = data_shape[1];
int groups_per_hidden_dim = hidden_dim / sf_vec_size;

tensorrt_llm::QuantizationSFLayout layout =
is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED_128x4
: tensorrt_llm::QuantizationSFLayout::LINEAR;
auto const layout = static_cast<tensorrt_llm::QuantizationSFLayout>(sfSwizzleLayout);

for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
for (int group = 0; group < groups_per_hidden_dim; ++group) {
Expand Down Expand Up @@ -141,7 +138,7 @@ void mxfp8_quantize_host(TensorView x_fp32, TensorView fp8_tensor, TensorView sc

// Used in tests to dequantize mxe4m3 tensors on host.
void mxfp8_dequantize_host(TensorView value_e4m3, TensorView scale_ue8m08sf,
TensorView float_tensor, bool is_sf_swizzled_layout) {
TensorView float_tensor, int64_t sfSwizzleLayout) {
int32_t const sf_vec_size = 32;
CHECK_INPUT_TYPE(value_e4m3, dl_uint8);
CHECK_INPUT_TYPE(scale_ue8m08sf, dl_uint8);
Expand All @@ -153,9 +150,7 @@ void mxfp8_dequantize_host(TensorView value_e4m3, TensorView scale_ue8m08sf,
int hidden_dim = data_shape[1];
int groups_per_hidden_dim = hidden_dim / sf_vec_size;

tensorrt_llm::QuantizationSFLayout layout =
is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED_128x4
: tensorrt_llm::QuantizationSFLayout::LINEAR;
auto const layout = static_cast<tensorrt_llm::QuantizationSFLayout>(sfSwizzleLayout);
for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
for (int group = 0; group < groups_per_hidden_dim; ++group) {
float* float_ptr =
Expand Down
6 changes: 3 additions & 3 deletions csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ inline int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
// alignment: sfVecSize
// returns fp8_quantized and block_scale_factors.
void mxfp8_quantize(TensorView input, TensorView valMxFP8, TensorView scaleFP8SF,
bool is_sf_swizzled_layout, int64_t alignment, bool enable_pdl);
int64_t sfSwizzleLayout, int64_t alignment, bool enable_pdl);

// x_fp32: [M, K], fp32_quantized (on the host)
// sfSwizzleLayout: int64_t value of the tensorrt_llm::QuantizationSFLayout enum selecting how the
// scale factors are stored (swizzled or linear). See QuantizationSFLayout for the available layouts.
// returns fp8_quantized and block_scale_factors (on the host).
void mxfp8_quantize_host(TensorView x_fp32, TensorView fp8_tensor, TensorView scale_tensor,
bool is_sf_swizzled_layout = true);
int64_t sfSwizzleLayout = 2);

void mxfp8_dequantize_host(TensorView value_e4m3, TensorView scale_ue8m08sf,
TensorView float_tensor, bool is_sf_swizzled_layout = true);
TensorView float_tensor, int64_t sfSwizzleLayout = 2);
Loading
Loading