diff --git a/benchmarks/bench_moe_deepseek.py b/benchmarks/bench_moe_deepseek.py new file mode 100644 index 0000000000..149394a614 --- /dev/null +++ b/benchmarks/bench_moe_deepseek.py @@ -0,0 +1,1074 @@ +#!/usr/bin/env python3 +"""DeepSeek-V3 MoE Performance Benchmark - CuteDSL vs CUTLASS vs TRTLLM. + +Compares three NVFP4 MoE backends on DeepSeek-V3 configuration: +- CuteDSL: FlashInfer's CuteDSL-based implementation +- CUTLASS: NVIDIA CUTLASS-based implementation +- TRTLLM: TensorRT-LLM's implementation + +Usage: + # Throughput benchmark (large batches: 128-4096 tokens) + python bench_moe_deepseek.py + + # Generation phase benchmark (small batches: 1-128 tokens) + python bench_moe_deepseek.py --gen-phase + + # With Expert Parallelism simulation + python bench_moe_deepseek.py --ep 1 # 256 local experts (no parallelism) + python bench_moe_deepseek.py --ep 8 # 32 local experts (8-way EP) + python bench_moe_deepseek.py --ep 16 # 16 local experts (16-way EP) + + # Custom token counts + python bench_moe_deepseek.py --num-tokens 64,128,256 + + # Disable CUDA graph (useful for debugging or profiling) + python bench_moe_deepseek.py --no-cuda-graph + + # Disable CUPTI (use CUDA events for timing instead) + python bench_moe_deepseek.py --no-cupti + +Metrics: + - ms: Latency in milliseconds + - TFLOPS: Computational throughput + - Speedup: CuteDSL latency / other backend latency (>1 = CuteDSL faster) +""" + +import argparse +from dataclasses import dataclass +import numpy as np +import torch + + +@dataclass +class DeepSeekConfig: + hidden_size: int = 7168 + intermediate_size: int = 2048 + num_experts: int = 256 + n_group: int = 8 + topk_group: int = 4 + top_k: int = 8 + routed_scaling_factor: float = 2.5 + + +CFG = DeepSeekConfig() +TOKEN_COUNTS = [128, 256, 512, 1024, 2048, 4096] + +# Generation phase token counts (small batches typical in decode) +GEN_PHASE_TOKENS = [1, 2, 4, 8, 16, 32, 64, 128] + +# Expert Parallelism configurations +# EP=1: all 256 experts on single GPU 
+# EP=8: 32 experts per GPU (256/8) +# EP=16: 16 experts per GPU (256/16) +EP_CONFIGS = { + 1: {"num_local_experts": 256, "local_expert_offset": 0}, + 8: {"num_local_experts": 32, "local_expert_offset": 0}, + 16: {"num_local_experts": 16, "local_expert_offset": 0}, +} + + +def is_sm100_family(): + """Check for SM100 family (Blackwell: SM100, SM103, SM110). + + CuteDSL MoE NVFP4 kernels are optimized for SM100 architecture. + SM120+ (Rubin) may have different shared memory/TMEM configurations. + """ + if not torch.cuda.is_available(): + return False + props = torch.cuda.get_device_properties(0) + return props.major == 10 + + +def calc_tflops(n, ms, num_local_experts=None): + """Calculate TFLOPS for MoE computation. + + With EP, only tokens routed to local experts are computed. + Assumes uniform routing distribution across experts. + """ + if num_local_experts is None: + num_local_experts = CFG.num_experts + + # Fraction of work done locally (assuming uniform distribution) + local_fraction = num_local_experts / CFG.num_experts + + flops = ( + n + * CFG.top_k + * local_fraction # Only local expert pairs are computed + * ( + 2 * CFG.hidden_size * 2 * CFG.intermediate_size + + 2 * CFG.intermediate_size * CFG.hidden_size + ) + ) + return flops / (ms * 1e-3) / 1e12 + + +def interleave(x, gs=64): + M, K = x.shape[-2], x.shape[-1] + return ( + x.view(*x.shape[:-2], 2, M // (gs * 2), gs, K) + .transpose(-4, -3) + .contiguous() + .view(*x.shape) + ) + + +def create_inputs(n, dev="cuda"): + """Create inputs for all backends (CuteDSL, CUTLASS, TRTLLM).""" + from flashinfer.fp4_quantization import fp4_quantize + + torch.manual_seed(42) + sv = 16 + FP8M = torch.finfo(torch.float8_e4m3fn).max + FP4M = 6.0 + + # Router logits and bias + rl = torch.randn(n, CFG.num_experts, device=dev, dtype=torch.float32) + rb = torch.randn(CFG.num_experts, device=dev, dtype=torch.bfloat16) + + # Hidden states + hb = torch.randn(n, CFG.hidden_size, device=dev, dtype=torch.bfloat16) / 10 + hg = FP8M 
* FP4M / hb.abs().max().float() + + # Weights (BF16) + w1b = ( + torch.randn( + CFG.num_experts, + 2 * CFG.intermediate_size, + CFG.hidden_size, + device=dev, + dtype=torch.bfloat16, + ) + / 10 + ) + w2b = ( + torch.randn( + CFG.num_experts, + CFG.hidden_size, + CFG.intermediate_size, + device=dev, + dtype=torch.bfloat16, + ) + / 10 + ) + + # Compute per-expert global scales + w1_gs_list, w2_gs_list = [], [] + for e in range(CFG.num_experts): + w1_gs_list.append(FP8M * FP4M / w1b[e].abs().max().float()) + w2_gs_list.append(FP8M * FP4M / w2b[e].abs().max().float()) + w1_gs = torch.tensor(w1_gs_list, device=dev) + w2_gs = torch.tensor(w2_gs_list, device=dev) + + # CUTLASS format: quantize with swizzled scale factors + w1_fp4_list, w1_sf_list = [], [] + w2_fp4_list, w2_sf_list = [], [] + for e in range(CFG.num_experts): + q1, s1 = fp4_quantize(w1b[e], w1_gs[e], sv, False, True) # swizzled + w1_fp4_list.append(q1) + w1_sf_list.append(s1) + q2, s2 = fp4_quantize(w2b[e], w2_gs[e], sv, False, True) # swizzled + w2_fp4_list.append(q2) + w2_sf_list.append(s2) + + return { + "router_logits": rl, + "routing_bias": rb, + "hidden_bf16": hb, + "hidden_gs": hg, + "w1_bf16": w1b, + "w1_gs": w1_gs, + "w2_bf16": w2b, + "w2_gs": w2_gs, + # CUTLASS specific + "w1_fp4": torch.stack(w1_fp4_list), + "w1_sf": torch.stack(w1_sf_list), + "w2_fp4": torch.stack(w2_fp4_list), + "w2_sf": torch.stack(w2_sf_list), + } + + +# ============================================================================= +# Benchmark Functions +# ============================================================================= + + +def bench_cute_dsl( + inputs, + warmup=10, + iters=100, + num_local_experts=None, + local_expert_offset=0, + use_cuda_graph=True, + use_cupti=True, + use_wrapper=False, +): + """Benchmark CuteDSL MoE. + + Args: + use_wrapper: If True, use CuteDslMoEWrapper API (recommended for CUDA graph). + If False, use cute_dsl_fused_moe_nvfp4 functional API. 
+ """ + from flashinfer.fused_moe import fused_topk_deepseek + from flashinfer.cute_dsl.utils import convert_sf_to_mma_layout + from flashinfer.fp4_quantization import fp4_quantize + from flashinfer.testing.utils import bench_gpu_time + + if num_local_experts is None: + num_local_experts = CFG.num_experts + + n, sv, dev = inputs["router_logits"].shape[0], 16, "cuda" + gs1 = torch.tensor([1.0], device=dev) + + tv = torch.empty(n, CFG.top_k, dtype=torch.float32, device=dev) + ti = torch.empty(n, CFG.top_k, dtype=torch.int32, device=dev) + + xf, xs = fp4_quantize(inputs["hidden_bf16"], gs1, sv, False, False) + xs = xs.unsqueeze(-1) + + # Expert range for this EP partition + expert_start = local_expert_offset + expert_end = local_expert_offset + num_local_experts + + # Slice weights to LOCAL experts only + w1_local = inputs["w1_bf16"][expert_start:expert_end] + w2_local = inputs["w2_bf16"][expert_start:expert_end] + + w1i = interleave(w1_local, 64) + w1f = w1i.view(num_local_experts * 2 * CFG.intermediate_size, CFG.hidden_size) + w1q, w1s = fp4_quantize(w1f, gs1, sv, False, True) + w1q = w1q.view(num_local_experts, 2 * CFG.intermediate_size, CFG.hidden_size // 2) + w1s = convert_sf_to_mma_layout( + w1s, 2 * CFG.intermediate_size, CFG.hidden_size, num_local_experts, sv + ) + + w2f = w2_local.view(num_local_experts * CFG.hidden_size, CFG.intermediate_size) + w2q, w2s = fp4_quantize(w2f, gs1, sv, False, True) + w2q = w2q.view(num_local_experts, CFG.hidden_size, CFG.intermediate_size // 2) + w2s = convert_sf_to_mma_layout( + w2s, CFG.hidden_size, CFG.intermediate_size, num_local_experts, sv + ) + + # Alpha sized for LOCAL experts only + alpha, fc2sc = ( + torch.ones(num_local_experts, device=dev), + torch.tensor([1.0], device=dev), + ) + + # Pre-convert routing bias to float32 + routing_bias_f32 = inputs["routing_bias"].float() + + if use_wrapper: + # Use CuteDslMoEWrapper (recommended for CUDA graph) + from flashinfer import CuteDslMoEWrapper + + moe = CuteDslMoEWrapper( 
+ num_experts=CFG.num_experts, + top_k=CFG.top_k, + hidden_size=CFG.hidden_size, + intermediate_size=CFG.intermediate_size, + use_cuda_graph=use_cuda_graph, + max_num_tokens=n, + num_local_experts=num_local_experts, + local_expert_offset=local_expert_offset, + ) + + def run(x, x_sf, router_logits, routing_bias, topk_values, topk_indices): + fused_topk_deepseek( + scores=router_logits, + bias=routing_bias, + n_group=CFG.n_group, + topk_group=CFG.topk_group, + topk=CFG.top_k, + routed_scaling_factor=CFG.routed_scaling_factor, + topk_values=topk_values, + topk_indices=topk_indices, + ) + return moe.run( + x=x, + x_sf=x_sf, + token_selected_experts=topk_indices, + token_final_scales=topk_values, + w1_weight=w1q, + w1_weight_sf=w1s, + w1_alpha=alpha, + fc2_input_scale=fc2sc, + w2_weight=w2q, + w2_weight_sf=w2s, + w2_alpha=alpha, + ) + else: + # Use functional API + from flashinfer import cute_dsl_fused_moe_nvfp4 + + def run(x, x_sf, router_logits, routing_bias, topk_values, topk_indices): + fused_topk_deepseek( + scores=router_logits, + bias=routing_bias, + n_group=CFG.n_group, + topk_group=CFG.topk_group, + topk=CFG.top_k, + routed_scaling_factor=CFG.routed_scaling_factor, + topk_values=topk_values, + topk_indices=topk_indices, + ) + return cute_dsl_fused_moe_nvfp4( + x=x, + x_sf=x_sf, + token_selected_experts=topk_indices, + token_final_scales=topk_values, + w1_weight=w1q, + w1_weight_sf=w1s, + w1_alpha=alpha, + fc2_input_scale=fc2sc, + w2_weight=w2q, + w2_weight_sf=w2s, + w2_alpha=alpha, + num_experts=CFG.num_experts, + top_k=CFG.top_k, + num_local_experts=num_local_experts, + local_expert_offset=local_expert_offset, + ) + + # Pass input tensors via input_kwargs for cold L2 cache rotation + input_kwargs = { + "x": xf, + "x_sf": xs, + "router_logits": inputs["router_logits"], + "routing_bias": routing_bias_f32, + "topk_values": tv, + "topk_indices": ti, + } + + times = bench_gpu_time( + run, + dry_run_iters=warmup, + repeat_iters=iters, + cold_l2_cache=True, + 
enable_cupti=use_cupti, + use_cuda_graph=use_cuda_graph, + input_kwargs=input_kwargs, + ) + return np.median(times) + + +def bench_cutlass( + inputs, + warmup=10, + iters=100, + num_local_experts=None, + local_expert_offset=0, + use_cuda_graph=True, + use_cupti=True, +): + from flashinfer.fused_moe import fused_topk_deepseek, cutlass_fused_moe + from flashinfer.fp4_quantization import fp4_quantize + from flashinfer.testing.utils import bench_gpu_time + + if num_local_experts is None: + num_local_experts = CFG.num_experts + + n, sv, dev = inputs["router_logits"].shape[0], 16, "cuda" + + tv = torch.empty(n, CFG.top_k, dtype=torch.float32, device=dev) + ti = torch.empty(n, CFG.top_k, dtype=torch.int32, device=dev) + + # Expert range for this EP partition + expert_start = local_expert_offset + expert_end = local_expert_offset + num_local_experts + + # Slice weights to LOCAL experts only (for fair EP comparison) + w1_fp4_local = inputs["w1_fp4"][expert_start:expert_end] + w1_sf_local = inputs["w1_sf"][expert_start:expert_end] + w1_gs_local = inputs["w1_gs"][expert_start:expert_end] + w2_fp4_local = inputs["w2_fp4"][expert_start:expert_end] + w2_sf_local = inputs["w2_sf"][expert_start:expert_end] + w2_gs_local = inputs["w2_gs"][expert_start:expert_end] + + # Prepare CUTLASS inputs + a1_gs = torch.tensor(1.0, device=dev, dtype=torch.float32) + a2_gs = torch.tensor(1.0, device=dev, dtype=torch.float32) + + quant_scales = [ + a1_gs, + w1_sf_local.view(torch.int32), + 1.0 / (a1_gs * w1_gs_local), + a2_gs, + w2_sf_local.view(torch.int32), + 1.0 / (a2_gs * w2_gs_local), + ] + + hidden_fp4, input_sf = fp4_quantize(inputs["hidden_bf16"], a1_gs, sv, False, True) + output = torch.empty(n, CFG.hidden_size, dtype=torch.bfloat16, device=dev) + + # Pre-convert routing bias to float32 + routing_bias_f32 = inputs["routing_bias"].float() + + # Pre-compute values that need conversion + w1_fp4_view = w1_fp4_local.contiguous().view(torch.long) + w2_fp4_view = 
w2_fp4_local.contiguous().view(torch.long) + + # Compute EP size from config + ep_size = CFG.num_experts // num_local_experts + + def run(hidden, sf, router_logits, routing_bias, topk_values, topk_indices): + # Routing (included in timing for fair comparison with TRTLLM) + fused_topk_deepseek( + scores=router_logits, + bias=routing_bias, + n_group=CFG.n_group, + topk_group=CFG.topk_group, + topk=CFG.top_k, + routed_scaling_factor=CFG.routed_scaling_factor, + topk_values=topk_values, + topk_indices=topk_indices, + ) + cutlass_fused_moe( + hidden, + topk_indices.to(torch.int), + topk_values, + w1_fp4_view, + w2_fp4_view, + torch.bfloat16, + quant_scales=quant_scales, + input_sf=sf, + output=output, + ep_size=ep_size, + ep_rank=0, # Simulating rank 0 of EP + ) + return output + + input_kwargs = { + "hidden": hidden_fp4, + "sf": input_sf, + "router_logits": inputs["router_logits"], + "routing_bias": routing_bias_f32, + "topk_values": tv, + "topk_indices": ti, + } + + times = bench_gpu_time( + run, + dry_run_iters=warmup, + repeat_iters=iters, + cold_l2_cache=True, + enable_cupti=use_cupti, + use_cuda_graph=use_cuda_graph, + input_kwargs=input_kwargs, + ) + return np.median(times) + + +def bench_trtllm( + inputs, + warmup=10, + iters=100, + num_local_experts=None, + local_expert_offset=0, + use_cuda_graph=True, + use_cupti=True, +): + from flashinfer.fused_moe import trtllm_fp4_block_scale_moe + from flashinfer.fused_moe.core import ( + RoutingMethodType, + _maybe_get_cached_w3_w1_permute_indices, + get_w2_permute_indices_with_cache, + ) + from flashinfer.fp4_quantization import fp4_quantize, block_scale_interleave + from flashinfer.testing.utils import bench_gpu_time + + if num_local_experts is None: + num_local_experts = CFG.num_experts + + n, dev = inputs["router_logits"].shape[0], inputs["router_logits"].device + sv, etm, cache = 16, 128, {} + + # Expert range for this EP partition + expert_start = local_expert_offset + expert_end = local_expert_offset + 
num_local_experts + + hg = inputs["hidden_gs"] + hfp, hsf = fp4_quantize(inputs["hidden_bf16"], hg, sv, False, True) + hfp = hfp.view(torch.uint8).reshape(n, CFG.hidden_size // 2) + hsc = ( + hsf.view(torch.float8_e4m3fn) + .flatten()[: n * CFG.hidden_size // sv] + .reshape(n, CFG.hidden_size // sv) + ) + + def prep(bf16, gs, M, K): + """Prepare weights for LOCAL experts only.""" + fl, sl = [], [] + for e in range(expert_start, expert_end): + q, s = fp4_quantize(bf16[e], gs[e], sv, False, False) + fl.append(q.view(torch.uint8).reshape(M, K // 2)) + sl.append(s.view(torch.float8_e4m3fn).reshape(M, K // sv)) + return torch.stack(fl), torch.stack(sl) + + w1f, w1s = prep( + inputs["w1_bf16"], inputs["w1_gs"], 2 * CFG.intermediate_size, CFG.hidden_size + ) + w2f, w2s = prep( + inputs["w2_bf16"], inputs["w2_gs"], CFG.hidden_size, CFG.intermediate_size + ) + + def shuf(fp4, sf, perm_fn): + """Shuffle weights for LOCAL experts only.""" + fsh, ssh = [], [] + for i in range(num_local_experts): + p = perm_fn(cache, fp4[i], etm) + fsh.append(fp4[i][p.to(dev)].contiguous()) + ps = perm_fn(cache, sf[i].view(torch.uint8), etm, sv) + ssh.append( + block_scale_interleave(sf[i].view(torch.uint8)[ps.to(dev)].contiguous()) + ) + return torch.stack(fsh), torch.stack(ssh) + + w1f, w1s = shuf(w1f, w1s, _maybe_get_cached_w3_w1_permute_indices) + w2f, w2s = shuf(w2f, w2s, get_w2_permute_indices_with_cache) + w1s = w1s.view(torch.float8_e4m3fn).reshape( + num_local_experts, 2 * CFG.intermediate_size, CFG.hidden_size // sv + ) + w2s = w2s.view(torch.float8_e4m3fn).reshape( + num_local_experts, CFG.hidden_size, CFG.intermediate_size // sv + ) + + # Scale tensors sized for LOCAL experts only + sc = torch.ones(num_local_experts, device=dev, dtype=torch.float32) + + def run(routing_logits, routing_bias, hidden_states, hidden_states_scale): + return trtllm_fp4_block_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=hidden_states, + 
hidden_states_scale=hidden_states_scale, + gemm1_weights=w1f, + gemm1_weights_scale=w1s, + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=w2f, + gemm2_weights_scale=w2s, + gemm2_bias=None, + output1_scale_scalar=sc, + output1_scale_gate_scalar=sc, + output2_scale_scalar=sc, + num_experts=CFG.num_experts, + top_k=CFG.top_k, + n_group=CFG.n_group, + topk_group=CFG.topk_group, + intermediate_size=CFG.intermediate_size, + local_expert_offset=local_expert_offset, + local_num_experts=num_local_experts, + routed_scaling_factor=CFG.routed_scaling_factor, + routing_method_type=RoutingMethodType.DeepSeekV3, + do_finalize=True, + ) + + input_kwargs = { + "routing_logits": inputs["router_logits"], + "routing_bias": inputs["routing_bias"], + "hidden_states": hfp, + "hidden_states_scale": hsc, + } + + times = bench_gpu_time( + run, + dry_run_iters=warmup, + repeat_iters=iters, + cold_l2_cache=True, + enable_cupti=use_cupti, + use_cuda_graph=use_cuda_graph, + input_kwargs=input_kwargs, + ) + return np.median(times) + + +# ============================================================================= +# Autotune +# ============================================================================= + + +def run_autotune(inputs, verbose=True): + from flashinfer.fused_moe import ( + fused_topk_deepseek, + cutlass_fused_moe, + trtllm_fp4_block_scale_moe, + ) + from flashinfer.fused_moe.core import ( + RoutingMethodType, + _maybe_get_cached_w3_w1_permute_indices, + get_w2_permute_indices_with_cache, + ) + from flashinfer import cute_dsl_fused_moe_nvfp4 + from flashinfer.cute_dsl.utils import convert_sf_to_mma_layout + from flashinfer.fp4_quantization import fp4_quantize, block_scale_interleave + from flashinfer.autotuner import autotune + + if verbose: + print("\nRunning autotune warmup for all backends...") + print("-" * 80) + + n, sv, dev = inputs["router_logits"].shape[0], 16, "cuda" + gs1 = torch.tensor([1.0], device=dev) + + tv = 
torch.empty(n, CFG.top_k, dtype=torch.float32, device=dev) + ti = torch.empty(n, CFG.top_k, dtype=torch.int32, device=dev) + fused_topk_deepseek( + scores=inputs["router_logits"], + bias=inputs["routing_bias"].float(), + n_group=CFG.n_group, + topk_group=CFG.topk_group, + topk=CFG.top_k, + routed_scaling_factor=CFG.routed_scaling_factor, + topk_values=tv, + topk_indices=ti, + ) + + # ------------------------------------------------------------------------- + # CuteDSL autotune + # ------------------------------------------------------------------------- + if verbose: + print("Autotuning CuteDSL...") + + xf, xs = fp4_quantize(inputs["hidden_bf16"], gs1, sv, False, False) + xs = xs.unsqueeze(-1) + + w1i = interleave(inputs["w1_bf16"], 64) + w1f = w1i.view(CFG.num_experts * 2 * CFG.intermediate_size, CFG.hidden_size) + w1q, w1s = fp4_quantize(w1f, gs1, sv, False, True) + w1q = w1q.view(CFG.num_experts, 2 * CFG.intermediate_size, CFG.hidden_size // 2) + w1s = convert_sf_to_mma_layout( + w1s, 2 * CFG.intermediate_size, CFG.hidden_size, CFG.num_experts, sv + ) + + w2f = inputs["w2_bf16"].view( + CFG.num_experts * CFG.hidden_size, CFG.intermediate_size + ) + w2q, w2s = fp4_quantize(w2f, gs1, sv, False, True) + w2q = w2q.view(CFG.num_experts, CFG.hidden_size, CFG.intermediate_size // 2) + w2s = convert_sf_to_mma_layout( + w2s, CFG.hidden_size, CFG.intermediate_size, CFG.num_experts, sv + ) + + alpha, fc2sc = ( + torch.ones(CFG.num_experts, device=dev), + torch.tensor([1.0], device=dev), + ) + + with autotune(True): + for _ in range(10): + cute_dsl_fused_moe_nvfp4( + x=xf, + x_sf=xs, + token_selected_experts=ti, + token_final_scales=tv, + w1_weight=w1q, + w1_weight_sf=w1s, + w1_alpha=alpha, + fc2_input_scale=fc2sc, + w2_weight=w2q, + w2_weight_sf=w2s, + w2_alpha=alpha, + num_experts=CFG.num_experts, + top_k=CFG.top_k, + num_local_experts=CFG.num_experts, + local_expert_offset=0, + ) + torch.cuda.synchronize() + + # 
------------------------------------------------------------------------- + # CUTLASS autotune + # ------------------------------------------------------------------------- + if verbose: + print("Autotuning CUTLASS...") + + a1_gs = torch.tensor(1.0, device=dev, dtype=torch.float32) + a2_gs = torch.tensor(1.0, device=dev, dtype=torch.float32) + quant_scales = [ + a1_gs, + inputs["w1_sf"].view(torch.int32), + 1.0 / (a1_gs * inputs["w1_gs"]), + a2_gs, + inputs["w2_sf"].view(torch.int32), + 1.0 / (a2_gs * inputs["w2_gs"]), + ] + hidden_fp4, input_sf = fp4_quantize(inputs["hidden_bf16"], a1_gs, sv, False, True) + output_cutlass = torch.empty(n, CFG.hidden_size, dtype=torch.bfloat16, device=dev) + + with autotune(True): + for _ in range(10): + cutlass_fused_moe( + hidden_fp4, + ti.to(torch.int), + tv, + inputs["w1_fp4"].contiguous().view(torch.long), + inputs["w2_fp4"].contiguous().view(torch.long), + torch.bfloat16, + quant_scales=quant_scales, + input_sf=input_sf, + output=output_cutlass, + ) + torch.cuda.synchronize() + + # ------------------------------------------------------------------------- + # TRTLLM Gen autotune + # ------------------------------------------------------------------------- + if verbose: + print("Autotuning TRTLLM Gen...") + + etm, cache = 128, {} + hg = inputs["hidden_gs"] + hfp, hsf = fp4_quantize(inputs["hidden_bf16"], hg, sv, False, True) + hfp = hfp.view(torch.uint8).reshape(n, CFG.hidden_size // 2) + hsc = ( + hsf.view(torch.float8_e4m3fn) + .flatten()[: n * CFG.hidden_size // sv] + .reshape(n, CFG.hidden_size // sv) + ) + + def prep(bf16, gs, M, K): + fl, sl = [], [] + for e in range(CFG.num_experts): + q, s = fp4_quantize(bf16[e], gs[e], sv, False, False) + fl.append(q.view(torch.uint8).reshape(M, K // 2)) + sl.append(s.view(torch.float8_e4m3fn).reshape(M, K // sv)) + return torch.stack(fl), torch.stack(sl) + + w1f_trt, w1s_trt = prep( + inputs["w1_bf16"], inputs["w1_gs"], 2 * CFG.intermediate_size, CFG.hidden_size + ) + w2f_trt, w2s_trt 
= prep( + inputs["w2_bf16"], inputs["w2_gs"], CFG.hidden_size, CFG.intermediate_size + ) + + def shuf(fp4, sf, perm_fn): + fsh, ssh = [], [] + for i in range(CFG.num_experts): + p = perm_fn(cache, fp4[i], etm) + fsh.append(fp4[i][p.to(dev)].contiguous()) + ps = perm_fn(cache, sf[i].view(torch.uint8), etm, sv) + ssh.append( + block_scale_interleave(sf[i].view(torch.uint8)[ps.to(dev)].contiguous()) + ) + return torch.stack(fsh), torch.stack(ssh) + + w1f_trt, w1s_trt = shuf(w1f_trt, w1s_trt, _maybe_get_cached_w3_w1_permute_indices) + w2f_trt, w2s_trt = shuf(w2f_trt, w2s_trt, get_w2_permute_indices_with_cache) + w1s_trt = w1s_trt.view(torch.float8_e4m3fn).reshape( + CFG.num_experts, 2 * CFG.intermediate_size, CFG.hidden_size // sv + ) + w2s_trt = w2s_trt.view(torch.float8_e4m3fn).reshape( + CFG.num_experts, CFG.hidden_size, CFG.intermediate_size // sv + ) + + sc = torch.ones(CFG.num_experts, device=dev, dtype=torch.float32) + + with autotune(True): + for _ in range(10): + trtllm_fp4_block_scale_moe( + routing_logits=inputs["router_logits"], + routing_bias=inputs["routing_bias"], + hidden_states=hfp, + hidden_states_scale=hsc, + gemm1_weights=w1f_trt, + gemm1_weights_scale=w1s_trt, + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=w2f_trt, + gemm2_weights_scale=w2s_trt, + gemm2_bias=None, + output1_scale_scalar=sc, + output1_scale_gate_scalar=sc, + output2_scale_scalar=sc, + num_experts=CFG.num_experts, + top_k=CFG.top_k, + n_group=CFG.n_group, + topk_group=CFG.topk_group, + intermediate_size=CFG.intermediate_size, + local_expert_offset=0, + local_num_experts=CFG.num_experts, + routed_scaling_factor=CFG.routed_scaling_factor, + routing_method_type=RoutingMethodType.DeepSeekV3, + do_finalize=True, + ) + torch.cuda.synchronize() + + if verbose: + print("-" * 80) + print("Autotune complete for all backends.\n") + + +# ============================================================================= +# Main Benchmark +# 
============================================================================= + + +@dataclass +class BenchResult: + """Single benchmark result for one backend at one token count.""" + + backend: str + tokens: int + latency_ms: float + tflops: float + + +def run_benchmark( + token_counts, + warmup=10, + iters=100, + ep_config=1, + do_autotune=True, + verbose=True, + use_cuda_graph=True, + use_cupti=True, + use_wrapper=True, +): + """ + Unified benchmark for DeepSeek-V3 MoE backends. + + Args: + token_counts: List of token counts to benchmark + warmup: Warmup iterations + iters: Benchmark iterations + ep_config: Expert Parallelism config (1, 8, or 16) + do_autotune: Whether to run autotune before benchmarking + verbose: Print results to stdout + use_cuda_graph: Whether to use CUDA graph for benchmarking + use_cupti: Whether to use CUPTI for accurate GPU timing + use_wrapper: Whether to use CuteDslMoEWrapper API (recommended) + + Returns: + List of BenchResult objects + """ + # Get EP configuration + ep_cfg = EP_CONFIGS.get(ep_config, EP_CONFIGS[1]) + num_local = ep_cfg["num_local_experts"] + local_offset = ep_cfg["local_expert_offset"] + + # Run autotune if requested (BEFORE printing header to avoid interleaved output) + if do_autotune: + run_autotune(create_inputs(max(token_counts)), verbose=verbose) + + # Print header AFTER autotune completes + if verbose: + _print_header(ep_config, num_local, use_cuda_graph, use_cupti) + + # Run benchmarks + results = [] + for n in token_counts: + row = _benchmark_single( + n, + warmup, + iters, + num_local, + local_offset, + use_cuda_graph, + use_cupti, + use_wrapper=use_wrapper, + ) + results.extend(row) + if verbose: + _print_row(row) + + # Print footer + if verbose: + _print_footer(ep_config, num_local) + + return results + + +def _benchmark_single( + n, + warmup, + iters, + num_local, + local_offset, + use_cuda_graph, + use_cupti, + use_wrapper=True, +): + """Benchmark all backends for a single token count. 
+ + Args: + use_wrapper: If True, use CuteDslMoEWrapper API for CuteDSL. + """ + inputs = create_inputs(n) + + # Run all three backends + lat = { + "CuteDSL": bench_cute_dsl( + inputs, + warmup, + iters, + num_local, + local_offset, + use_cuda_graph, + use_cupti, + use_wrapper=use_wrapper, + ), + "CUTLASS": bench_cutlass( + inputs, warmup, iters, num_local, local_offset, use_cuda_graph, use_cupti + ), + "TRTLLM": bench_trtllm( + inputs, warmup, iters, num_local, local_offset, use_cuda_graph, use_cupti + ), + } + + # Build results + results = [] + for backend, latency in lat.items(): + results.append( + BenchResult( + backend=backend, + tokens=n, + latency_ms=latency, + tflops=calc_tflops(n, latency, num_local), + ) + ) + return results + + +def _print_header(ep_config, num_local, use_cuda_graph, use_cupti): + """Print benchmark header.""" + print("\n" + "=" * 100) + print(f"DeepSeek-V3 MoE Benchmark: CuteDSL vs CUTLASS vs TRTLLM (EP={ep_config})") + print("=" * 100) + print( + f"Model: hidden={CFG.hidden_size}, intermediate={CFG.intermediate_size}, " + f"experts={CFG.num_experts}, top_k={CFG.top_k}" + ) + print( + f"EP Config: {num_local} local experts (simulating {CFG.num_experts // num_local}-way parallelism)" + ) + print( + f"CUDA Graph: {'enabled' if use_cuda_graph else 'disabled'}, CUPTI: {'enabled' if use_cupti else 'disabled'}" + ) + print("-" * 100) + print( + f"{'Tokens':>6} | " + f"{'CuteDSL':^15} | " + f"{'CUTLASS':^15} | " + f"{'TRTLLM':^15} | " + f"{'Speedup (CuteDSL/X)':^18} | " + f"{'Winner':^8}" + ) + print( + f"{'':>6} | " + f"{'ms':>7} {'TFLOPS':>7} | " + f"{'ms':>7} {'TFLOPS':>7} | " + f"{'ms':>7} {'TFLOPS':>7} | " + f"{'CUTLASS':>8} {'TRTLLM':>8} |" + ) + print("-" * 100) + + +def _print_row(results): + """Print a single row of benchmark results.""" + # Extract values by backend + r = {r.backend: r for r in results} + cute, cutlass, trtllm = r["CuteDSL"], r["CUTLASS"], r["TRTLLM"] + + # Calculate speedups (> 1.0 means CuteDSL is faster) + 
speedup_cutlass = cutlass.latency_ms / cute.latency_ms + speedup_trtllm = trtllm.latency_ms / cute.latency_ms + + # Find winner + winner = min(r.values(), key=lambda x: x.latency_ms).backend + + print( + f"{cute.tokens:>6} | " + f"{cute.latency_ms:>7.3f} {cute.tflops:>7.1f} | " + f"{cutlass.latency_ms:>7.3f} {cutlass.tflops:>7.1f} | " + f"{trtllm.latency_ms:>7.3f} {trtllm.tflops:>7.1f} | " + f"{speedup_cutlass:>7.2f}x {speedup_trtllm:>7.2f}x | " + f"{winner:^8}" + ) + + +def _print_footer(ep_config, num_local): + """Print benchmark footer.""" + print("-" * 100) + print("Speedup > 1.0 means CuteDSL is faster than that backend") + + +def main(): + parser = argparse.ArgumentParser( + description="DeepSeek-V3 MoE Performance Benchmark" + ) + parser.add_argument( + "--num-tokens", + type=str, + default=None, + help="Comma-separated token counts (default: 128-4096 for throughput, 1-128 for gen-phase)", + ) + parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations") + parser.add_argument("--iters", type=int, default=100, help="Benchmark iterations") + parser.add_argument("--no-autotune", action="store_true", help="Disable autotune") + parser.add_argument("--quiet", action="store_true", help="Minimal output") + parser.add_argument( + "--gen-phase", + action="store_true", + help="Use generation phase token counts (1-128 instead of 128-4096)", + ) + parser.add_argument( + "--ep", + type=int, + default=1, + choices=[1, 8, 16], + help="Expert Parallelism: 1 (256 local), 8 (32 local), 16 (16 local)", + ) + parser.add_argument( + "--no-cuda-graph", + action="store_true", + help="Disable CUDA graph for benchmarking (enabled by default)", + ) + parser.add_argument( + "--no-cupti", + action="store_true", + help="Disable CUPTI for GPU timing (enabled by default)", + ) + parser.add_argument( + "--functional-api", + action="store_true", + help="Use functional API instead of CuteDslMoEWrapper for CuteDSL benchmark", + ) + args = parser.parse_args() + + if not 
is_sm100_family(): + print("ERROR: Requires SM100 family GPU (Blackwell: SM100, SM103, SM110)") + return 1 + + # Determine token counts + if args.num_tokens: + tokens = [int(x) for x in args.num_tokens.split(",")] + elif args.gen_phase: + tokens = GEN_PHASE_TOKENS # [1, 2, 4, 8, 16, 32, 64, 128] + else: + tokens = TOKEN_COUNTS # [128, 256, 512, 1024, 2048, 4096] + + print("\nDeepSeek-V3 MoE Performance Benchmark") + print(f"GPU: {torch.cuda.get_device_name(0)}") + print(f"CuteDSL API: {'Functional' if args.functional_api else 'Wrapper'}") + + run_benchmark( + token_counts=tokens, + warmup=args.warmup, + iters=args.iters, + ep_config=args.ep, + do_autotune=not args.no_autotune, + verbose=not args.quiet, + use_cuda_graph=not args.no_cuda_graph, + use_cupti=not args.no_cupti, + use_wrapper=not args.functional_api, + ) + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/csrc/moe_utils_binding.cu b/csrc/moe_utils_binding.cu new file mode 100644 index 0000000000..ced63cb71b --- /dev/null +++ b/csrc/moe_utils_binding.cu @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#ifdef ENABLE_FP8 +#include +#endif +#ifdef ENABLE_FP4 +#include +#endif + +#include "flashinfer/trtllm/fused_moe/RoutingKernel.h" +#include "tensorrt_llm/kernels/cuteDslKernels/moeUtils.h" +#include "tvm_ffi_utils.h" + +using namespace tensorrt_llm::kernels::cute_dsl; + +namespace { +// Helper function to compute log2 of a value (returns -1 if not power of 2) +inline int32_t computeLog2(int32_t val) { + int32_t n = val; + int32_t out = 0; + while (n >>= 1) { + ++out; + } + if ((1 << out) != val) { + out = -1; + } + return out; +} +} // namespace + +// ============================ moePermute bindings ============================ + +void moe_permute_fp16(int64_t input_ptr, int64_t permuted_output_ptr, int64_t input_sf_ptr, + int64_t permuted_sf_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, int64_t num_non_exiting_tiles_ptr, + int32_t max_num_permuted_tokens, int32_t hidden_size, int32_t top_k, + int32_t tile_size, bool enable_pdl) { + moePermute( + reinterpret_cast(input_ptr), reinterpret_cast(permuted_output_ptr), + reinterpret_cast(input_sf_ptr), reinterpret_cast(permuted_sf_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), max_num_permuted_tokens, + hidden_size, top_k, tile_size, enable_pdl, get_current_stream()); +} + +#ifdef ENABLE_BF16 +void moe_permute_bf16(int64_t input_ptr, int64_t permuted_output_ptr, int64_t input_sf_ptr, + int64_t permuted_sf_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, int64_t num_non_exiting_tiles_ptr, + int32_t max_num_permuted_tokens, int32_t hidden_size, int32_t top_k, + int32_t tile_size, bool enable_pdl) { + moePermute<__nv_bfloat16, uint8_t>( + reinterpret_cast<__nv_bfloat16 const*>(input_ptr), + reinterpret_cast<__nv_bfloat16*>(permuted_output_ptr), + reinterpret_cast(input_sf_ptr), reinterpret_cast(permuted_sf_ptr), 
+ reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), max_num_permuted_tokens, + hidden_size, top_k, tile_size, enable_pdl, get_current_stream()); +} +#endif + +#ifdef ENABLE_FP8 +void moe_permute_fp8(int64_t input_ptr, int64_t permuted_output_ptr, int64_t input_sf_ptr, + int64_t permuted_sf_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, int64_t num_non_exiting_tiles_ptr, + int32_t max_num_permuted_tokens, int32_t hidden_size, int32_t top_k, + int32_t tile_size, bool enable_pdl) { + moePermute<__nv_fp8_e4m3, uint8_t>( + reinterpret_cast<__nv_fp8_e4m3 const*>(input_ptr), + reinterpret_cast<__nv_fp8_e4m3*>(permuted_output_ptr), + reinterpret_cast(input_sf_ptr), reinterpret_cast(permuted_sf_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), max_num_permuted_tokens, + hidden_size, top_k, tile_size, enable_pdl, get_current_stream()); +} +#endif + +#ifdef ENABLE_FP4 +void moe_permute_fp4(int64_t input_ptr, int64_t permuted_output_ptr, int64_t input_sf_ptr, + int64_t permuted_sf_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, int64_t num_non_exiting_tiles_ptr, + int32_t max_num_permuted_tokens, int32_t hidden_size, int32_t top_k, + int32_t tile_size, bool enable_pdl) { + moePermute<__nv_fp4_e2m1, uint8_t>( + reinterpret_cast<__nv_fp4_e2m1 const*>(input_ptr), + reinterpret_cast<__nv_fp4_e2m1*>(permuted_output_ptr), + reinterpret_cast(input_sf_ptr), reinterpret_cast(permuted_sf_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), max_num_permuted_tokens, + hidden_size, top_k, tile_size, enable_pdl, get_current_stream()); +} +#endif + +// ============================ moeUnpermute bindings 
============================ + +void moe_unpermute_fp16_float_scale(int64_t permuted_input_ptr, int64_t output_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t topk_scales_ptr, int32_t num_tokens, + int32_t hidden_size, int32_t top_k, bool enable_pdl) { + moeUnpermute(reinterpret_cast(permuted_input_ptr), + reinterpret_cast(output_ptr), + reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast(topk_scales_ptr), num_tokens, + hidden_size, top_k, enable_pdl, get_current_stream()); +} + +void moe_unpermute_fp16_half_scale(int64_t permuted_input_ptr, int64_t output_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t topk_scales_ptr, int32_t num_tokens, int32_t hidden_size, + int32_t top_k, bool enable_pdl) { + moeUnpermute(reinterpret_cast(permuted_input_ptr), + reinterpret_cast(output_ptr), + reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast(topk_scales_ptr), num_tokens, hidden_size, + top_k, enable_pdl, get_current_stream()); +} + +#ifdef ENABLE_BF16 +void moe_unpermute_bf16_float_scale(int64_t permuted_input_ptr, int64_t output_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t topk_scales_ptr, int32_t num_tokens, + int32_t hidden_size, int32_t top_k, bool enable_pdl) { + moeUnpermute<__nv_bfloat16, float>( + reinterpret_cast<__nv_bfloat16 const*>(permuted_input_ptr), + reinterpret_cast<__nv_bfloat16*>(output_ptr), + reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast(topk_scales_ptr), num_tokens, hidden_size, top_k, enable_pdl, + get_current_stream()); +} + +void moe_unpermute_bf16_bf16_scale(int64_t permuted_input_ptr, int64_t output_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t topk_scales_ptr, int32_t num_tokens, int32_t hidden_size, + int32_t top_k, bool enable_pdl) { + moeUnpermute<__nv_bfloat16, __nv_bfloat16>( + reinterpret_cast<__nv_bfloat16 const*>(permuted_input_ptr), + reinterpret_cast<__nv_bfloat16*>(output_ptr), + 
reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast<__nv_bfloat16 const*>(topk_scales_ptr), num_tokens, hidden_size, top_k, + enable_pdl, get_current_stream()); +} +#endif + +// ============================ moeOutputMemset bindings ============================ + +void moe_output_memset_fp16(int64_t input_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, + int64_t num_non_exiting_tiles_ptr, int32_t max_num_permuted_tokens, + int32_t hidden_size, int32_t top_k, int32_t tile_size, + bool enable_pdl) { + moeOutputMemset(reinterpret_cast(input_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), + max_num_permuted_tokens, hidden_size, top_k, tile_size, enable_pdl, + get_current_stream()); +} + +#ifdef ENABLE_BF16 +void moe_output_memset_bf16(int64_t input_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, + int64_t permuted_idx_to_expanded_idx_ptr, + int64_t num_non_exiting_tiles_ptr, int32_t max_num_permuted_tokens, + int32_t hidden_size, int32_t top_k, int32_t tile_size, + bool enable_pdl) { + moeOutputMemset<__nv_bfloat16>(reinterpret_cast<__nv_bfloat16*>(input_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(expanded_idx_to_permuted_idx_ptr), + reinterpret_cast(permuted_idx_to_expanded_idx_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), + max_num_permuted_tokens, hidden_size, top_k, tile_size, enable_pdl, + get_current_stream()); +} +#endif + +// ============================ moeActivation bindings ============================ + +void moe_activation_fp16(int64_t input_ptr, int64_t output_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t num_non_exiting_tiles_ptr, int32_t activation_type, + int32_t max_num_permuted_tokens, int32_t interm_size, int32_t tile_size, + 
bool enable_pdl) { + moeActivation(reinterpret_cast(input_ptr), reinterpret_cast(output_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), + static_cast(activation_type), max_num_permuted_tokens, + interm_size, tile_size, enable_pdl, get_current_stream()); +} + +#ifdef ENABLE_BF16 +void moe_activation_bf16(int64_t input_ptr, int64_t output_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t num_non_exiting_tiles_ptr, int32_t activation_type, + int32_t max_num_permuted_tokens, int32_t interm_size, int32_t tile_size, + bool enable_pdl) { + moeActivation<__nv_bfloat16>(reinterpret_cast<__nv_bfloat16 const*>(input_ptr), + reinterpret_cast<__nv_bfloat16*>(output_ptr), + reinterpret_cast(tile_idx_to_mn_limit_ptr), + reinterpret_cast(num_non_exiting_tiles_ptr), + static_cast(activation_type), + max_num_permuted_tokens, interm_size, tile_size, enable_pdl, + get_current_stream()); +} +#endif + +// ============================ TVM FFI Registration ============================ + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_permute_fp16, moe_permute_fp16); +#ifdef ENABLE_BF16 +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_permute_bf16, moe_permute_bf16); +#endif +#ifdef ENABLE_FP8 +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_permute_fp8, moe_permute_fp8); +#endif +#ifdef ENABLE_FP4 +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_permute_fp4, moe_permute_fp4); +#endif + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_unpermute_fp16_float_scale, + moe_unpermute_fp16_float_scale); +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_unpermute_fp16_half_scale, + moe_unpermute_fp16_half_scale); +#ifdef ENABLE_BF16 +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_unpermute_bf16_float_scale, + moe_unpermute_bf16_float_scale); +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_unpermute_bf16_bf16_scale, + moe_unpermute_bf16_bf16_scale); +#endif + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_output_memset_fp16, moe_output_memset_fp16); +#ifdef ENABLE_BF16 
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_output_memset_bf16, moe_output_memset_bf16); +#endif + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_activation_fp16, moe_activation_fp16); +#ifdef ENABLE_BF16 +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_activation_bf16, moe_activation_bf16); +#endif + +// ============================ moeSort bindings ============================ +// moe_sort - Sort tokens by expert assignment and generate mapping tensors +// This uses DeepSeekV3 routing method with pre-computed expert selections +// +// Returns via output pointers: +// - tile_idx_to_expert_idx: [max_num_tiles], mapping from tile to local expert index +// - tile_idx_to_mn_limit: [max_num_tiles], M/N limit for each tile +// - expanded_idx_to_permuted_idx: [num_tokens, top_k], mapping from expanded to permuted index +// - permuted_idx_to_expanded_idx: [max_num_permuted_tokens], mapping from permuted to expanded +// - total_num_padded_tokens: [1], total number of padded tokens +// - num_non_exiting_tiles: [1], number of non-exiting tiles + +void moe_sort( + // Inputs + int64_t token_selected_experts_ptr, // [num_tokens, top_k], int32 + int64_t token_final_scales_ptr, // [num_tokens, top_k], float32 or bf16 + int32_t num_tokens, int32_t num_experts, int32_t top_k, int32_t local_expert_offset, + int32_t num_local_experts, int32_t tile_tokens_dim, bool use_pdl, + // Outputs (pre-allocated buffers) + int64_t tile_idx_to_expert_idx_ptr, int64_t tile_idx_to_mn_limit_ptr, + int64_t expanded_idx_to_permuted_idx_ptr, int64_t permuted_idx_to_expanded_idx_ptr, + int64_t total_num_padded_tokens_ptr, int64_t num_non_exiting_tiles_ptr, + // Optional: expert counts buffer for large token counts (>1024) + // Should be size 2 * num_experts, int32 + int64_t expert_counts_ptr, + // Optional: explicit CUDA stream pointer for CUDA graph compatibility + // If 0, uses TVM FFI's current stream + int64_t cuda_stream_ptr) { + // Set up the routing data structure + 
moe::dev::routing::routingDeepSeek::Data routingData; + + // Configure dtypes + routingData.mDtypeExpW = batchedGemm::trtllm::gen::Dtype::Bfloat16; + routingData.mDtypeBias = batchedGemm::trtllm::gen::Dtype::Bfloat16; + routingData.mDtypeScore = batchedGemm::trtllm::gen::Dtype::Fp32; + routingData.mUsePdl = use_pdl; + + // Input tensors (pre-computed expert selections) + routingData.mPtrTopKIds = reinterpret_cast(token_selected_experts_ptr); + routingData.mPtrTopKWeights = reinterpret_cast(token_final_scales_ptr); + routingData.mPtrScores = nullptr; // Not using routing logits + routingData.mPtrRoutingBias = nullptr; // Not using bias + + // Output tensors + routingData.mPtrCtaIdxXyToBatchIdx = reinterpret_cast(tile_idx_to_expert_idx_ptr); + routingData.mPtrCtaIdxXyToMnLimit = reinterpret_cast(tile_idx_to_mn_limit_ptr); + routingData.mPtrExpandedIdxToPermutedIdx = + reinterpret_cast(expanded_idx_to_permuted_idx_ptr); + routingData.mPtrPermutedIdxToTokenIdx = + reinterpret_cast(permuted_idx_to_expanded_idx_ptr); + routingData.mPtrPermutedIdxSize = reinterpret_cast(total_num_padded_tokens_ptr); + routingData.mPtrNumNonExitingCtas = reinterpret_cast(num_non_exiting_tiles_ptr); + + // Not using packed format since we have explicit TopK IDs + routingData.mPtrTopKPacked = nullptr; + + // Expert counts buffer: required when num_tokens > 1024 + // The kernel will set this to nullptr internally for small token counts + routingData.mPtrExpertCounts = reinterpret_cast(expert_counts_ptr); + + // Metadata + routingData.mNumTokens = num_tokens; + routingData.mNumExperts = num_experts; + routingData.mTopK = top_k; + routingData.mPaddingLog2 = computeLog2(tile_tokens_dim); + routingData.mTileTokensDim = tile_tokens_dim; + routingData.mLocalExpertsStartIdx = local_expert_offset; + routingData.mLocalExpertsStrideLog2 = 0; + routingData.mNumLocalExperts = num_local_experts; + + // DeepSeekV3 specific parameters + // For moe_sort, we use n_group=1, topk_group=1 since experts are 
already selected + routingData.mNumExpertGroups = 1; + routingData.mNumLimitedGroups = 1; + routingData.mRouteScale = 1.0f; + routingData.mUseRoutingSoftmax = false; + + // Run the routing kernel + // Use explicit stream if provided (for CUDA graph compatibility), otherwise fall back to TVM FFI + // stream + cudaStream_t stream = + cuda_stream_ptr != 0 ? reinterpret_cast(cuda_stream_ptr) : get_current_stream(); + moe::dev::routing::routingDeepSeek::run(routingData, stream); +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(flashinfer_moe_sort, moe_sort); diff --git a/csrc/nv_internal/include/tensorrt_llm/common/config.h b/csrc/nv_internal/include/tensorrt_llm/common/config.h new file mode 100644 index 0000000000..cb157f6140 --- /dev/null +++ b/csrc/nv_internal/include/tensorrt_llm/common/config.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#ifndef TRTLLM_CONFIG_H +#define TRTLLM_CONFIG_H + +/** + * \def TRTLLM_ABI_NAMESPACE + * This macro is used to open an implicitly inline namespace block for the ABI version. + * This macro can be overridden to change the ABI version. + * The default ABI version is _v1. 
+ */ +#ifndef TRTLLM_ABI_NAMESPACE +#define TRTLLM_ABI_NAMESPACE _v1 +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_BEGIN +#define TRTLLM_ABI_NAMESPACE_BEGIN inline namespace TRTLLM_ABI_NAMESPACE { +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_END +#define TRTLLM_ABI_NAMESPACE_END } +#endif + +/** + * \def TRTLLM_NAMESPACE_BEGIN + * This macro is used to open a `tensorrt_llm::` namespace block, along with any + * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_BEGIN \ + namespace tensorrt_llm { \ + TRTLLM_ABI_NAMESPACE_BEGIN + +/** + * \def TRTLLM_NAMESPACE_END + * This macro is used to close a `tensorrt_llm::` namespace block, along with any + * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_END \ + TRTLLM_ABI_NAMESPACE_END \ + } /* end namespace tensorrt_llm */ + +#endif // TRTLLM_CONFIG_H diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu b/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu new file mode 100644 index 0000000000..0144d0885c --- /dev/null +++ b/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/cuteDslKernels/moeUtils.h" +#include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh" + +#ifdef ENABLE_FP4 +#include +#endif +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { +namespace { +using ElemCopyType = uint4; +using SFCopyType = uint32_t; + +template +auto constexpr bitsPerElem() { +#ifdef ENABLE_FP4 + return std::is_same_v ? 4 : cute::sizeof_bits_v; +#else + return cute::sizeof_bits_v; +#endif +} + +template +auto constexpr elemPerCopy() { + return bitsPerElem() / bitsPerElem(); +} + +template +auto constexpr sfElemPerCopy() { + return bitsPerElem() / bitsPerElem(); +} + +// Helper to get max active blocks per SM +template +int32_t getMaxActiveBlocksPerSM(KernelFunc kernel, int32_t threadsPerBlock, + size_t dynamicSmemSize) { + int numBlocks = 0; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, threadsPerBlock, + dynamicSmemSize); + return numBlocks; +} + +} // namespace + +template +__global__ void moePermuteKernel(InputType const* input, InputType* permuted_output, + SFType const* input_sf, SFType* permuted_sf, + int32_t const* tile_idx_to_mn_limit, + int32_t const* permuted_idx_to_expanded_idx, + int32_t const* num_non_exiting_tiles, int32_t const hidden_size, + int32_t const top_k, int32_t const tile_size) { + int32_t constexpr kElemPerCopy = elemPerCopy(); + [[maybe_unused]] int32_t constexpr kSFElemPerCopy = sfElemPerCopy(); + // Need int64_t to prevent overflow when computing pointer offsets. 
+  int64_t const kCopyPerToken = hidden_size / kElemPerCopy;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaGridDependencySynchronize();
+#endif
+
+  int32_t const num_tokens = num_non_exiting_tiles[0] * tile_size;
+  for (int32_t permuted_idx = blockIdx.x; permuted_idx < num_tokens; permuted_idx += gridDim.x) {
+    int32_t const tile_idx = permuted_idx / tile_size;
+    if (permuted_idx >= tile_idx_to_mn_limit[tile_idx]) {
+      continue;
+    }
+    int32_t const expanded_idx = permuted_idx_to_expanded_idx[permuted_idx];
+    int32_t const token_idx = expanded_idx / top_k;
+
+    auto const* src_ptr = reinterpret_cast<ElemCopyType const*>(input) + token_idx * kCopyPerToken;
+    auto* dst_ptr = reinterpret_cast<ElemCopyType*>(permuted_output) + permuted_idx * kCopyPerToken;
+    for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock) {
+      dst_ptr[i] = src_ptr[i];
+    }
+
+    // Note: FP4 scale factor handling is deferred to Phase 3
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
+}
+
+template <typename InputType, typename SFType>
+void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf,
+                SFType* permuted_sf, int32_t const* tile_idx_to_mn_limit,
+                int32_t const* permuted_idx_to_expanded_idx, int32_t const* num_non_exiting_tiles,
+                int32_t const max_num_permuted_tokens, int32_t const hidden_size,
+                int32_t const top_k, int32_t const tile_size, bool enable_pdl,
+                cudaStream_t stream) {
+  int32_t constexpr kThreadsPerBlock = 256;
+  int32_t constexpr kSFVecSize = 16;
+  int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+  TLLM_CHECK_WITH_INFO(hidden_size % kElemPerCopy == 0, "hidden_size must be divisible by %d.",
+                       kElemPerCopy);
+
+  auto kernel = &moePermuteKernel<InputType, SFType, kThreadsPerBlock>;
+  static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
+  int32_t const maxBlocksPerSM = getMaxActiveBlocksPerSM(kernel, kThreadsPerBlock, 0);
+  int32_t const blocks = std::min(smCount * maxBlocksPerSM, max_num_permuted_tokens);
+  int32_t const threads =
kThreadsPerBlock; + + cudaLaunchConfig_t config; + config.gridDim = blocks; + config.blockDim = threads; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel, input, permuted_output, input_sf, permuted_sf, + tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, num_non_exiting_tiles, + hidden_size, top_k, tile_size); +} + +#define INSTANTIATE_MOE_PERMUTE(InputType, SFType) \ + template void moePermute( \ + InputType const* input, InputType* permuted_output, SFType const* input_sf, \ + SFType* permuted_sf, int32_t const* tile_idx_to_mn_limit, \ + int32_t const* permuted_idx_to_expanded_idx, int32_t const* num_non_exiting_tiles, \ + int32_t const max_num_permuted_tokens, int32_t const hidden_size, int32_t const top_k, \ + int32_t const tile_size, bool enable_pdl, cudaStream_t stream) + +INSTANTIATE_MOE_PERMUTE(half, uint8_t); +#ifdef ENABLE_BF16 +INSTANTIATE_MOE_PERMUTE(__nv_bfloat16, uint8_t); +#endif +#ifdef ENABLE_FP8 +INSTANTIATE_MOE_PERMUTE(__nv_fp8_e4m3, uint8_t); +#endif +#ifdef ENABLE_FP4 +INSTANTIATE_MOE_PERMUTE(__nv_fp4_e2m1, uint8_t); +#endif +#undef INSTANTIATE_MOE_PERMUTE + +template +__global__ void moeUnpermuteKernel(InputType const* permuted_input, InputType* output, + int32_t const* expanded_idx_to_permuted_idx, + TopKScaleType const* topk_scales, int32_t const hidden_size, + int32_t const top_k) { + using AccumType = float; + int32_t constexpr kElemPerCopy = elemPerCopy(); + // Need int64_t to prevent overflow when computing pointer offsets. 
+  int64_t const kCopyPerToken = hidden_size / kElemPerCopy;
+  InputType rmem[kElemPerCopy];
+  AccumType rmemAccum[kElemPerCopy];
+
+  int32_t const token_idx = blockIdx.x;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaGridDependencySynchronize();
+#endif
+
+  auto* dst_ptr = reinterpret_cast<ElemCopyType*>(output) + token_idx * kCopyPerToken;
+  for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock) {
+#pragma unroll
+    for (int32_t j = 0; j < kElemPerCopy; j++) {
+      rmemAccum[j] = 0;
+    }
+    for (int32_t k = 0; k < top_k; k++) {
+      int32_t const permuted_idx = expanded_idx_to_permuted_idx[token_idx * top_k + k];
+      if (permuted_idx < 0) {
+        continue;
+      }
+      auto const* src_ptr =
+          reinterpret_cast<ElemCopyType const*>(permuted_input) + permuted_idx * kCopyPerToken;
+      *reinterpret_cast<ElemCopyType*>(rmem) = src_ptr[i];
+      TopKScaleType const scale = topk_scales[token_idx * top_k + k];
+
+#pragma unroll
+      for (int32_t j = 0; j < kElemPerCopy; j++) {
+        rmemAccum[j] += static_cast<AccumType>(rmem[j]) * static_cast<AccumType>(scale);
+      }
+    }
+#pragma unroll
+    for (int32_t j = 0; j < kElemPerCopy; j++) {
+      rmem[j] = static_cast<InputType>(rmemAccum[j]);
+    }
+    dst_ptr[i] = *reinterpret_cast<ElemCopyType*>(rmem);
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
+}
+
+template <typename InputType, typename TopKScaleType>
+void moeUnpermute(InputType const* permuted_input, InputType* output,
+                  int32_t const* expanded_idx_to_permuted_idx, TopKScaleType const* topk_scales,
+                  int32_t const num_tokens, int32_t const hidden_size, int32_t const top_k,
+                  bool enable_pdl, cudaStream_t stream) {
+  int32_t constexpr kThreadsPerBlock = 256;
+  int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+  TLLM_CHECK_WITH_INFO(hidden_size % kElemPerCopy == 0, "hidden_size must be divisible by %d.",
+                       kElemPerCopy);
+
+  int32_t const blocks = num_tokens;
+  int32_t const threads = kThreadsPerBlock;
+
+  auto kernel = &moeUnpermuteKernel<InputType, TopKScaleType, kThreadsPerBlock>;
+
+  cudaLaunchConfig_t config;
+  config.gridDim = blocks;
+  config.blockDim = threads;
+  config.dynamicSmemBytes = 0;
+ config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel, permuted_input, output, expanded_idx_to_permuted_idx, + topk_scales, hidden_size, top_k); +} + +#define INSTANTIATE_MOE_UNPERMUTE(InputType, TopKScaleType) \ + template void moeUnpermute(InputType const* permuted_input, InputType* output, \ + int32_t const* expanded_idx_to_permuted_idx, \ + TopKScaleType const* topk_scales, \ + int32_t const num_tokens, int32_t const hidden_size, \ + int32_t const top_k, bool enable_pdl, cudaStream_t stream) + +INSTANTIATE_MOE_UNPERMUTE(half, float); +INSTANTIATE_MOE_UNPERMUTE(half, half); +#ifdef ENABLE_BF16 +INSTANTIATE_MOE_UNPERMUTE(__nv_bfloat16, float); +INSTANTIATE_MOE_UNPERMUTE(__nv_bfloat16, __nv_bfloat16); +#endif +#undef INSTANTIATE_MOE_UNPERMUTE + +template +__global__ void moeOutputMemsetKernel(InputType* input, int32_t const* tile_idx_to_mn_limit, + int32_t const* expanded_idx_to_permuted_idx, + int32_t const* permuted_idx_to_expanded_idx, + int32_t const* num_non_exiting_tiles, + int32_t const hidden_size, int32_t const top_k, + int32_t const tile_size) { + int32_t constexpr kElemPerCopy = elemPerCopy(); + int64_t const kCopyPerToken = hidden_size / kElemPerCopy; + + InputType rmem[kElemPerCopy]; +#pragma unroll + for (int32_t j = 0; j < kElemPerCopy; j++) { + rmem[j] = 0; + } + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + + int32_t const num_tokens = num_non_exiting_tiles[0] * tile_size; + for (int32_t permuted_idx = blockIdx.x; permuted_idx < num_tokens; permuted_idx += gridDim.x) { + int32_t const tile_idx = permuted_idx / tile_size; + if (permuted_idx >= tile_idx_to_mn_limit[tile_idx]) { + continue; + } + int32_t const expanded_idx = permuted_idx_to_expanded_idx[permuted_idx]; + 
int32_t const token_idx = expanded_idx / top_k;
+    int32_t const topk_idx = expanded_idx % top_k;
+
+    bool is_first_in_topk = true;
+    for (int32_t k = 0; k < topk_idx; k++) {
+      if (expanded_idx_to_permuted_idx[token_idx * top_k + k] >= 0) {
+        is_first_in_topk = false;
+        break;
+      }
+    }
+    if (!is_first_in_topk) {
+      continue;
+    }
+
+    auto* dst_ptr = reinterpret_cast<ElemCopyType*>(input) + token_idx * kCopyPerToken;
+    for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock) {
+      dst_ptr[i] = *reinterpret_cast<ElemCopyType const*>(rmem);
+    }
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
+}
+
+template <typename InputType>
+void moeOutputMemset(InputType* input, int32_t const* tile_idx_to_mn_limit,
+                     int32_t const* expanded_idx_to_permuted_idx,
+                     int32_t const* permuted_idx_to_expanded_idx,
+                     int32_t const* num_non_exiting_tiles, int32_t const max_num_permuted_tokens,
+                     int32_t const hidden_size, int32_t const top_k, int32_t const tile_size,
+                     bool enable_pdl, cudaStream_t stream) {
+  int32_t constexpr kThreadsPerBlock = 256;
+  int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+  TLLM_CHECK_WITH_INFO(hidden_size % kElemPerCopy == 0, "hidden_size must be divisible by %d.",
+                       kElemPerCopy);
+
+  auto kernel = &moeOutputMemsetKernel<InputType, kThreadsPerBlock>;
+  static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
+  int32_t const maxBlocksPerSM = getMaxActiveBlocksPerSM(kernel, kThreadsPerBlock, 0);
+  int32_t const blocks = std::min(smCount * maxBlocksPerSM, max_num_permuted_tokens);
+  int32_t const threads = kThreadsPerBlock;
+
+  cudaLaunchConfig_t config;
+  config.gridDim = blocks;
+  config.blockDim = threads;
+  config.dynamicSmemBytes = 0;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+  config.numAttrs = 1;
+  config.attrs = attrs;
+  cudaLaunchKernelEx(&config, kernel, input, tile_idx_to_mn_limit,
expanded_idx_to_permuted_idx, + permuted_idx_to_expanded_idx, num_non_exiting_tiles, hidden_size, top_k, + tile_size); +} + +#define INSTANTIATE_MOE_OUTPUT_MEMSET(InputType) \ + template void moeOutputMemset( \ + InputType * input, int32_t const* tile_idx_to_mn_limit, \ + int32_t const* expanded_idx_to_permuted_idx, int32_t const* permuted_idx_to_expanded_idx, \ + int32_t const* num_non_exiting_tiles, int32_t const max_num_permuted_tokens, \ + int32_t const hidden_size, int32_t const top_k, int32_t const tile_size, bool enable_pdl, \ + cudaStream_t stream) + +INSTANTIATE_MOE_OUTPUT_MEMSET(half); +#ifdef ENABLE_BF16 +INSTANTIATE_MOE_OUTPUT_MEMSET(__nv_bfloat16); +#endif +#undef INSTANTIATE_MOE_OUTPUT_MEMSET + +// ============================== Activation Kernels ============================== + +template +__global__ void moeActivationKernel(InputType const* input, InputType* output, + int32_t const* tile_idx_to_mn_limit, + int32_t const* num_non_exiting_tiles, int32_t const interm_size, + int32_t const tile_size) { + using ComputeType = float; + int32_t constexpr kElemPerCopy = elemPerCopy(); + // Need int64_t to prevent overflow when computing pointer offsets. + int64_t const kCopyPerToken = interm_size / kElemPerCopy; + InputType rmem[kElemPerCopy]; + InputType rmemGate[kElemPerCopy]; + ActFn act{}; + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + + int32_t const num_tokens = num_non_exiting_tiles[0] * tile_size; + for (int32_t permuted_idx = blockIdx.x; permuted_idx < num_tokens; permuted_idx += gridDim.x) { + int32_t const tile_idx = permuted_idx / tile_size; + if (permuted_idx >= tile_idx_to_mn_limit[tile_idx]) { + continue; + } + auto const* src_ptr = reinterpret_cast(input) + + permuted_idx * kCopyPerToken * (ActFn::IS_GLU ? 
2 : 1); + auto* dst_ptr = reinterpret_cast(output) + permuted_idx * kCopyPerToken; + for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock) { + *reinterpret_cast(rmem) = src_ptr[i]; + if constexpr (ActFn::IS_GLU) { + *reinterpret_cast(rmemGate) = src_ptr[i + kCopyPerToken]; +#pragma unroll + for (int32_t j = 0; j < kElemPerCopy; j++) { + rmem[j] = static_cast( + act(static_cast(rmemGate[j]), static_cast(rmem[j]))); + } + } else { +#pragma unroll + for (int32_t j = 0; j < kElemPerCopy; j++) { + rmem[j] = static_cast(act(static_cast(rmem[j]))); + } + } + + dst_ptr[i] = *reinterpret_cast(rmem); + } + } + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif +} + +template +void moeActivation(InputType const* input, InputType* output, int32_t const* tile_idx_to_mn_limit, + int32_t const* num_non_exiting_tiles, MoeActivationType activation_type, + int32_t const max_num_permuted_tokens, int32_t const interm_size, + int32_t const tile_size, bool enable_pdl, cudaStream_t stream) { + int32_t constexpr kThreadsPerBlock = 256; + int32_t constexpr kElemPerCopy = elemPerCopy(); + TLLM_CHECK_WITH_INFO(interm_size % kElemPerCopy == 0, "interm_size must be divisible by %d.", + kElemPerCopy); + + using namespace cutlass_kernels; + + auto get_act_kernel = [](MoeActivationType act_type) -> void (*)(InputType const*, InputType*, + int32_t const*, int32_t const*, + int32_t const, int32_t const) { + switch (act_type) { + case MoeActivationType::Identity: + return &moeActivationKernel, + kThreadsPerBlock>; + case MoeActivationType::Gelu: + return &moeActivationKernel, + kThreadsPerBlock>; + case MoeActivationType::Geglu: + return &moeActivationKernel, + kThreadsPerBlock>; + case MoeActivationType::Relu: + return &moeActivationKernel, + kThreadsPerBlock>; + case MoeActivationType::Silu: + return &moeActivationKernel, + kThreadsPerBlock>; + case MoeActivationType::Swiglu: + return &moeActivationKernel, + 
kThreadsPerBlock>; + default: + TLLM_CHECK_WITH_INFO(false, "Unsupported activation type: %d", static_cast(act_type)); + return nullptr; + } + }; + + auto kernel = get_act_kernel(activation_type); + + static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount(); + int32_t const maxBlocksPerSM = getMaxActiveBlocksPerSM(kernel, kThreadsPerBlock, 0); + int32_t const blocks = std::min(smCount * maxBlocksPerSM, max_num_permuted_tokens); + int32_t const threads = kThreadsPerBlock; + + cudaLaunchConfig_t config; + config.gridDim = blocks; + config.blockDim = threads; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel, input, output, tile_idx_to_mn_limit, num_non_exiting_tiles, + interm_size, tile_size); +} + +#define INSTANTIATE_MOE_ACTIVATION(InputType) \ + template void moeActivation( \ + InputType const* input, InputType* output, int32_t const* tile_idx_to_mn_limit, \ + int32_t const* num_non_exiting_tiles, MoeActivationType activation_type, \ + int32_t const max_num_permuted_tokens, int32_t const interm_size, int32_t const tile_size, \ + bool enable_pdl, cudaStream_t stream) + +INSTANTIATE_MOE_ACTIVATION(half); +#ifdef ENABLE_BF16 +INSTANTIATE_MOE_ACTIVATION(__nv_bfloat16); +#endif +#undef INSTANTIATE_MOE_ACTIVATION + +// Note: moeActivationQuantize (fused activation + FP4 quantization) will be added later +// when NVFP4 output support is needed. 
+ +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h b/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h new file mode 100644 index 0000000000..0ea18a5bf7 --- /dev/null +++ b/csrc/nv_internal/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include <cuda_runtime.h> + +#include <cstdint> + +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { + +// Activation type enum for standalone moeActivation kernel +// Note: Matches ActivationType in cutlass_kernels/include/common.h +enum class MoeActivationType { + Gelu = 0, + Relu = 1, + Silu = 2, + Swiglu = 3, + Geglu = 4, + Identity = 5, +}; + +template <typename InputType, typename SFType> +void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf, + SFType* permuted_sf, int32_t const* tile_idx_to_mn_limit, + int32_t const* permuted_idx_to_expanded_idx, int32_t const* num_non_exiting_tiles, + int32_t const max_num_permuted_tokens, int32_t const hidden_size, + int32_t const top_k, int32_t const tile_size, bool enable_pdl, cudaStream_t stream); + +template <typename InputType, typename TopKScaleType> +void moeUnpermute(InputType const* permuted_input, InputType* output, + int32_t const* expanded_idx_to_permuted_idx, TopKScaleType const* topk_scales, + int32_t const num_tokens, int32_t const
hidden_size, int32_t const top_k, + bool enable_pdl, cudaStream_t stream); + +template <typename InputType> +void moeOutputMemset(InputType* input, int32_t const* tile_idx_to_mn_limit, + int32_t const* expanded_idx_to_permuted_idx, + int32_t const* permuted_idx_to_expanded_idx, + int32_t const* num_non_exiting_tiles, int32_t const max_num_permuted_tokens, + int32_t const hidden_size, int32_t const top_k, int32_t const tile_size, + bool enable_pdl, cudaStream_t stream); + +// ============================== Activation Kernels ============================== + +/** + * @brief Apply activation function to MoE intermediate outputs. + * + * For GLU activations (Swiglu, Geglu), input shape is (num_tokens, 2 * interm_size) + * where first half is linear projection and second half is gate. + * Output shape is (num_tokens, interm_size). + * + * For non-GLU activations (Gelu, Relu, Silu, Identity), input and output shape + * are both (num_tokens, interm_size). + * + * @param input Input tensor + * @param output Output tensor (same dtype as input for non-FP4 output) + * @param tile_idx_to_mn_limit Valid token count per tile + * @param num_non_exiting_tiles Number of valid tiles (scalar on device) + * @param activation_type Type of activation to apply + * @param max_num_permuted_tokens Maximum number of permuted tokens + * @param interm_size Intermediate size (output hidden dimension) + * @param tile_size Tile size for scheduling + * @param enable_pdl Enable Programmatic Dependent Launch + * @param stream CUDA stream + */ +template <typename InputType> +void moeActivation(InputType const* input, InputType* output, int32_t const* tile_idx_to_mn_limit, + int32_t const* num_non_exiting_tiles, MoeActivationType activation_type, + int32_t const max_num_permuted_tokens, int32_t const interm_size, + int32_t const tile_size, bool enable_pdl, cudaStream_t stream); + +/** + * @brief Fused activation with NVFP4 dynamic quantization. + * + * Combines activation function with per-block NVFP4 quantization in a single kernel pass.
+ * Output is packed FP4 with swizzled scale factors. + * + * @param input Input tensor (bf16/fp16) + * @param output Output tensor (packed FP4, uint8) + * @param global_sf Global scale factor for quantization + * @param output_sf Per-block scale factors (FP8 E4M3, swizzled layout) + * @param tile_idx_to_mn_limit Valid token count per tile + * @param num_non_exiting_tiles Number of valid tiles + * @param activation_type Type of activation to apply + * @param max_num_permuted_tokens Maximum number of permuted tokens + * @param interm_size Intermediate size + * @param tile_size Tile size for scheduling + * @param enable_pdl Enable Programmatic Dependent Launch + * @param stream CUDA stream + */ +template <typename InputType, typename OutputType, typename SFType> +void moeActivationQuantize(InputType const* input, OutputType* output, float const* global_sf, + SFType* output_sf, int32_t const* tile_idx_to_mn_limit, + int32_t const* num_non_exiting_tiles, MoeActivationType activation_type, + int32_t const max_num_permuted_tokens, int32_t const interm_size, + int32_t const tile_size, bool enable_pdl, cudaStream_t stream); + +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh new file mode 100644 index 0000000000..8c96d64808 --- /dev/null +++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#include "cutlass/epilogue/thread/activation.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { +// ============================== Activation Adaptors ================================= + +template