Merged

213 commits
e5c50db
cleanup process weights after loading
Dec 22, 2025
b1dddfd
removing spurious aiter stuff
Dec 22, 2025
78e9289
removing spurious aiter stuff
Dec 22, 2025
f1ae727
good codex bot
Dec 22, 2025
1a576b8
revert spurious aiter stuff
Dec 22, 2025
70367ac
reduce LOC changes
Dec 22, 2025
8425200
reduce LOC changes
Dec 22, 2025
f414f6c
further simplification
Dec 22, 2025
b31a8cb
updated
Dec 22, 2025
f2c70e1
cleanup comment
Dec 22, 2025
bd2046b
fix custom routing function for flashinfer
Dec 22, 2025
6655381
invalid checks in FP8MoE
Dec 22, 2025
a8820d8
stashing ... mixtral via flashinfer is not working properly'
Dec 22, 2025
844093c
merged aiter
Dec 23, 2025
cc2df79
updated
Dec 23, 2025
6db374b
updated
Dec 23, 2025
c2cc8e1
stash
Dec 23, 2025
1cf3b88
fix bad merge and bad qdq for per-tensor
Dec 23, 2025
985a0ab
weight rotation should only happen for per-tensor
Dec 23, 2025
7defa14
improve error message
Dec 23, 2025
078fdc4
updated
Dec 23, 2025
b55d8e3
update marlin ordering
Dec 23, 2025
8fbae90
improve comments
Dec 23, 2025
4bbb70f
add helper functions to share between online and offline quantization
Dec 23, 2025
bd72e61
fix up condition
Dec 23, 2025
24d219b
fix up condition
Dec 23, 2025
c290ebb
fix up condition
Dec 23, 2025
550e763
update to revert cleanup
Dec 23, 2025
d1fba0e
Merge branch 'main' into clean-up-fp8-process-after-loading
robertgshaw2-redhat Dec 23, 2025
7822c4d
Merge remote-tracking branch 'origin/main' into clean-up-fp8-process-…
Dec 27, 2025
32ef76a
updated
Dec 27, 2025
a8c5927
updated with proper impots
Dec 27, 2025
80039a0
added missing file
Dec 27, 2025
d9dfa7b
abstract the kernel swapping
Dec 27, 2025
bbcd012
abstract the kernel swapping
Dec 27, 2025
5822325
start applying to ct_moe_fp8
Dec 27, 2025
3f2322a
further progress
Dec 27, 2025
a7b6550
further progress
Dec 27, 2025
1737ed6
nit logging
Dec 27, 2025
cba5504
factor out the process after loading for requantize
Dec 27, 2025
b2bc870
rename setup_kernel -> make_kernel
Dec 27, 2025
14247f0
make convert_weights_to_runtime_format a pure function
Dec 27, 2025
570c359
factor out process weights after load
Dec 27, 2025
50d0bea
make names shorter
Dec 27, 2025
1e750fc
remove marlin_input_dtype
Dec 27, 2025
d4c5c4d
force reset
Dec 27, 2025
d626763
remove unneeded change
Dec 27, 2025
f7aaa16
remove imports
Dec 27, 2025
cdcc9f5
guard against TRT-LLM
Dec 27, 2025
ad6dc86
rename get_fp8_moe_backend -> select_fp8_moe_backend
Dec 27, 2025
dab20c6
add todo
Dec 27, 2025
5128d77
initial refactoring
Dec 29, 2025
2bc0a42
apply refactor to fp8 and ct
Dec 29, 2025
8a3b303
apply refactor globally
Dec 29, 2025
47a58a2
updated fused moe quant config for weight scale name
Dec 29, 2025
4aa3a64
Merge branch 'main' into fix-up-marlin-prepare-layer
robertgshaw2-redhat Dec 29, 2025
3eaa036
update comment
Dec 29, 2025
7ce4ef8
fix marlin tensor
Dec 29, 2025
5a55df8
Merge remote-tracking branch 'origin/main' into apply-refactor-to-ct
Dec 29, 2025
9d49046
merged main
Dec 29, 2025
8369d49
in process of refactoring cutlass fp8
Dec 29, 2025
4c3b253
convert to use mk for compressed-tensors
Dec 29, 2025
34cdaaa
updated
Dec 29, 2025
9ad1a09
fix online quantization
Dec 29, 2025
dad15e2
fix online quantization
Dec 29, 2025
542d2de
reduce name lenght for less newlines
Dec 29, 2025
758fae9
update name to convert_to_fp8_moe_kernel_format
Dec 29, 2025
35a0426
w13_weight -> w13 etc to reduce line breaks
Dec 29, 2025
33898b4
try to reduce loc change
Dec 29, 2025
7e8db4f
try to reduce loc change
Dec 29, 2025
0e9629d
rename make_kernel -> make_fp8_moe_kernel
Dec 29, 2025
2a32bac
update commentary about disable expert map
Dec 29, 2025
864379e
cleanup
Dec 29, 2025
b7091e3
removing run_cutlass_moe_fp8
Dec 29, 2025
da47fe0
add back ops import
Dec 29, 2025
3658b48
remove strides construction
Dec 29, 2025
464bb50
remove run_cutlass_fp8
Dec 29, 2025
b411440
apply to test cutlass moe
Dec 29, 2025
b17cb0e
remove import
Dec 29, 2025
e045766
fixed failing test
Dec 29, 2025
1d134d5
attempt to fix cutlass moe unit test
Dec 29, 2025
8e864f0
init workspace manager
Dec 29, 2025
b3f9a10
pre-commit on marlin input dtype
Dec 29, 2025
f915373
some more tweaks
Dec 29, 2025
f30ae4b
revert change to stray file
Dec 29, 2025
5f99481
clean up select_gemm_impl
Dec 29, 2025
8bdedc2
we are now passing for fp8.py triton block!
Dec 29, 2025
face1ce
,merged main
Dec 30, 2025
9f31d65
reduce loc change
Dec 30, 2025
47ca569
update comment
Dec 30, 2025
df49c3a
update tritonordeepgemmexperts
Dec 30, 2025
4828063
updated
Dec 30, 2025
6cefc23
split into separate situation
Dec 30, 2025
4880aef
add small batch fallback for cutlass
Dec 30, 2025
b89f68f
add small batch fallback for cutlass
Dec 30, 2025
d43dbb5
fix fallback
Dec 30, 2025
19ae34f
revert changes to marlin utils file
Dec 30, 2025
3845ee8
revert changes to get_marlin_input_dtype
Dec 30, 2025
3196cb1
nits
Dec 30, 2025
461cd8c
the ordered on ABC, Class matters for some reason in python --- duck
Dec 30, 2025
16aa74e
stash
Dec 30, 2025
be0abe2
apply changes to modelopt
robertgshaw2-redhat Dec 31, 2025
4eed452
remove unneeded cruft
robertgshaw2-redhat Dec 31, 2025
450f035
cleanup initialization
robertgshaw2-redhat Dec 31, 2025
d6a1f64
initial commit
Dec 31, 2025
9a28683
move to cuda before the reshapes for r&d
Jan 1, 2026
e84eaa2
clean
Jan 1, 2026
058a998
clean
Jan 1, 2026
d53b6ff
clean
Jan 1, 2026
9a7cf4d
clean
Jan 1, 2026
1182e1d
clean
Jan 1, 2026
2126f98
clean
Jan 1, 2026
33741a8
clean
Jan 1, 2026
31c4e22
stash
Jan 1, 2026
e0129dd
working end to end
Jan 1, 2026
eb6699b
comment nits
Jan 1, 2026
5be7ab1
comment nits
Jan 1, 2026
844a65a
remove
Jan 1, 2026
24a0302
rename method
Jan 1, 2026
7edf70f
stash trtllm fix
Jan 1, 2026
9d994a6
stash changes
Jan 1, 2026
6ff4b75
Merge branch 'fix-flashinfer-experts-quant-config-hack' of https://gi…
Jan 1, 2026
c9a7e5b
updated
Jan 1, 2026
2408ad2
make trtllm work
Jan 1, 2026
96ff599
ad back import
Jan 1, 2026
f9a4724
update comments
Jan 1, 2026
e8831f9
apply changes to fp8.py
Jan 1, 2026
f8f9a33
nit
Jan 1, 2026
59f97a6
revert unneeded assets
Jan 1, 2026
113e472
rename
Jan 1, 2026
a98a380
update comment
Jan 1, 2026
df82e9c
Merge branch 'fix-flashinfer-experts-quant-config-hack' of https://gi…
Jan 1, 2026
df5035c
naming
Jan 1, 2026
3678402
add back check to prevent mixtral
Jan 1, 2026
783b64d
Merge branch 'main' into fix-flashinfer-experts-quant-config-hack
robertgshaw2-redhat Jan 1, 2026
c30d404
remove delete me
Jan 2, 2026
a285f5e
update to address pavani's feedback
Jan 2, 2026
2d96161
reduce LOC change
Jan 2, 2026
a910872
fix
Jan 2, 2026
d2decd6
reduce loc nit
Jan 2, 2026
870fc6a
clean up fi checking
Jan 2, 2026
23e79fd
updated
Jan 2, 2026
86a0e5c
fix assign float to parameter
Jan 2, 2026
7eaa18b
updated doc string
Jan 2, 2026
83a7d9b
fix up
Jan 2, 2026
7300bc5
fix up
Jan 2, 2026
344167d
fix up configs
Jan 2, 2026
b887c4f
cleanup
Jan 2, 2026
d4d4231
remove unneeded assert
Jan 2, 2026
218e697
standardize how we add the params
Jan 2, 2026
b2e3a50
updated
Jan 2, 2026
dd30416
updated
Jan 2, 2026
3d22ba3
updated
Jan 2, 2026
56edeca
a few small nits
Jan 2, 2026
e917f5d
fix tests
Jan 2, 2026
39987f6
revert the llama weight loading hack
Jan 2, 2026
140f447
stash
Jan 2, 2026
de6faa1
unstash
Jan 2, 2026
173e67d
Merge remote-tracking branch 'origin/main' into fix-flashinfer-expert…
Jan 3, 2026
fac4014
merge
robertgshaw2-redhat Jan 3, 2026
c1c1195
fix merge from amd guard
robertgshaw2-redhat Jan 3, 2026
79acaac
merge the fi branch
robertgshaw2-redhat Jan 3, 2026
caf46be
rename flashinfer trtllm funciton names
robertgshaw2-redhat Jan 3, 2026
2a5a58b
cleanup!
robertgshaw2-redhat Jan 3, 2026
4d76cb6
tests are passing
robertgshaw2-redhat Jan 3, 2026
d304a73
remove comments
robertgshaw2-redhat Jan 3, 2026
2ee1c44
updated
Jan 3, 2026
a5a1d0b
circular import
Jan 4, 2026
5a627d8
updated the details
Jan 4, 2026
2a5c255
fix typing
Jan 4, 2026
08a1979
fixed cutlass block
Jan 4, 2026
8f2341f
clean up deepgemm a bit
Jan 4, 2026
85d59c8
use proper naming for modelopt
Jan 4, 2026
1a69be2
use proper naming for modelopt
Jan 4, 2026
8c4dddf
use proper naming for modelopt
Jan 4, 2026
16721e5
merged main
Jan 6, 2026
8fec574
update log for unsupported
Jan 6, 2026
95f0b37
Merge remote-tracking branch 'origin/main' into apply-refactor-to-ct
Jan 6, 2026
609b9b9
merge main
Jan 6, 2026
3ceb254
nit
Jan 6, 2026
af2cbd3
update convert_weights_to_kernel_format
Jan 6, 2026
fb6e402
rever use for flashinfer_moe_backend
Jan 6, 2026
da6218a
stash work
Jan 6, 2026
64b7ba5
fix trtllm kernel
Jan 6, 2026
ebd76f2
fix importing issues
Jan 6, 2026
6afe4bb
fix compressed tensors issue
Jan 6, 2026
eecb7dc
fix lint
Jan 6, 2026
804c147
fix error from lack of routing
Jan 6, 2026
b6e5dc5
delayed imports
Jan 6, 2026
617c662
fix cutlass tensor
Jan 6, 2026
6b1d1ad
make marlin pass
Jan 6, 2026
2c2e274
make things easier to follow in the ci logs
Jan 6, 2026
eb83e8d
add dp/ep
Jan 6, 2026
c7424b7
Merge branch 'main' into apply-refactor-to-ct
robertgshaw2-redhat Jan 6, 2026
e41d147
nit
Jan 6, 2026
3469b8d
updated
Jan 7, 2026
af5a4f4
updated
Jan 7, 2026
34175d1
ipdate the test coverage for dp/ep
Jan 7, 2026
2687d2c
fix up again, some of the a2a backends are not working
Jan 7, 2026
1eceb09
Merge branch 'main' into apply-refactor-to-ct
robertgshaw2-redhat Jan 7, 2026
9ce786a
update oracle to not use cutlass for block quant
Jan 7, 2026
fefa376
delete llama 4 load time optim
Jan 7, 2026
5cccc6c
docs fix
Jan 7, 2026
9810137
revert .to(cuda)
Jan 7, 2026
35c3bc3
updated with bills comments
Jan 7, 2026
ce8deb7
revert change for llama experts not being loaded properly
Jan 7, 2026
08742c5
fix marlin comment
Jan 7, 2026
a369a51
Merge branch 'main' into apply-refactor-to-ct
robertgshaw2-redhat Jan 7, 2026
d4486f8
updated the access logs
Jan 7, 2026
be3dc9a
updated
Jan 7, 2026
a927bec
updated
Jan 7, 2026
192339d
delay imports
Jan 7, 2026
5dccfc6
fix missing
Jan 7, 2026
7 changes: 7 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -1422,3 +1422,10 @@ steps:
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt

- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
gpu: b200
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
116 changes: 24 additions & 92 deletions benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -6,13 +6,16 @@
but use different quantization strategies and backends.
"""

import nvtx
import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager
@@ -59,6 +62,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
(m, k, n) = mkn

dtype = torch.half
@@ -121,85 +125,6 @@
# Force per-tensor quantization for all cases
per_act_token = False

# Create stride tensors for CUTLASS
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)

def run_triton_moe(
[Review comment from the author: this function was unused]

a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)

for _ in range(num_repeats):
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)

def run_cutlass_moe_fp8(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)

for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
cutlass_moe_fp8(
a=a,
w1_q=w1,
w2_q=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)

# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
@@ -210,23 +135,30 @@ def run_cutlass_moe_fp8(
per_out_ch_quant=per_out_ch,
)

fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8(
out_dtype=a.dtype,
e=num_experts,
n=n,
k=k,
quant_config=quant_config,
device=w1.device,
),
)

# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
cutlass_moe_fp8(
a=a,
w1_q=w1_fp8q_cutlass,
w2_q=w2_fp8q_cutlass,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
fn(
a,
w1_fp8q_cutlass,
w2_fp8q_cutlass,
topk_weights,
topk_ids,
activation="silu",
global_num_experts=num_experts,
)
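The hunk above replaces direct `cutlass_moe_fp8(...)` calls with a composed `mk.FusedMoEModularKernel(MoEPrepareAndFinalizeNoEP(), CutlassExpertsFp8(...))`. A minimal sketch of that composition pattern is below; the class names mirror the vLLM ones, but every body here is an illustrative stand-in, not the real kernels:

```python
# Hypothetical sketch of the "modular kernel" composition the diff migrates
# to: a prepare/finalize stage wrapped around a swappable experts impl.

class PrepareAndFinalizeNoEP:
    """No expert parallelism: prepare/finalize are identity passes."""

    def prepare(self, hidden_states):
        return hidden_states  # real impl would quantize/scatter activations

    def finalize(self, expert_output):
        return expert_output  # real impl would gather/reduce across ranks


class ToyExperts:
    """Stand-in for an experts kernel (e.g. a CUTLASS FP8 grouped GEMM)."""

    def apply(self, hidden_states, topk_ids):
        # Pretend each "expert" just scales its tokens by (expert_id + 1).
        return [h * (e + 1) for h, e in zip(hidden_states, topk_ids)]


class ModularKernel:
    """Composes prepare/finalize with an experts impl, like FusedMoEModularKernel."""

    def __init__(self, prepare_finalize, experts):
        self.prepare_finalize = prepare_finalize
        self.experts = experts

    def __call__(self, hidden_states, topk_ids):
        x = self.prepare_finalize.prepare(hidden_states)
        y = self.experts.apply(x, topk_ids)
        return self.prepare_finalize.finalize(y)


fn = ModularKernel(PrepareAndFinalizeNoEP(), ToyExperts())
print(fn([1.0, 2.0], [0, 2]))  # → [1.0, 6.0]
```

The benchmark benefits directly: because the experts implementation is a constructor argument, the same call site (`fn(a, w1, w2, topk_weights, topk_ids, ...)`) can be captured in a CUDA graph regardless of which backend is plugged in.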
90 changes: 37 additions & 53 deletions benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -5,14 +5,18 @@
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager

@@ -45,6 +49,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
label = "Quant Matmul"

sub_label = (
@@ -82,11 +87,6 @@
a, score, topk, renormalize=False
)

ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)

def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
@@ -120,10 +120,6 @@ def run_cutlass_moe(
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
per_act_token: bool,
@@ -135,31 +131,29 @@
per_act_token_quant=per_act_token,
)

for _ in range(num_repeats):
cutlass_moe_fp8(
a,
w1,
w2,
topk_weights,
topk_ids,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8(
out_dtype=a.dtype,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e=w2.shape[0],
n=w2.shape[2],
k=w2.shape[1],
quant_config=quant_config,
)
device=w1.device,
),
)

for _ in range(num_repeats):
fn(a, w1, w2, topk_weights, topk_ids)

def run_cutlass_from_graph(
a: torch.Tensor,
a_scale: torch.Tensor,
w1_q: torch.Tensor,
w2_q: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
@@ -169,21 +163,23 @@ def run_cutlass_from_graph(
per_act_token_quant=per_act_token,
)

fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8(
out_dtype=a.dtype,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e=w2.shape[0],
n=w2.shape[2],
k=w2.shape[1],
quant_config=quant_config,
device=w1.device,
),
)

with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp8(
a,
w1_q,
w2_q,
topk_weights,
topk_ids,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
quant_config=quant_config,
)
return fn(a, w1, w2, topk_weights, topk_ids)

def run_triton_from_graph(
a: torch.Tensor,
@@ -227,10 +223,6 @@ def replay_graph(graph, num_repeats):
w2_q,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights,
topk_ids,
)
@@ -268,10 +260,6 @@ def replay_graph(graph, num_repeats):
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"per_act_token": per_act_token,
"ab_strides1": ab_strides1,
"ab_strides2": ab_strides2,
"c_strides1": c_strides1,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@@ -330,10 +318,6 @@ def replay_graph(graph, num_repeats):
w2_q,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights,
topk_ids,
per_act_token,
@@ -342,7 +326,7 @@ def replay_graph(graph, num_repeats):

results.append(
benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
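Both benchmark diffs also delete the hand-built per-expert stride tensors (`ab_strides1 = k`, `ab_strides2 = n`, `c_strides1 = 2 * n`, `c_strides2 = k`); after the refactor, `CutlassExpertsFp8` can derive them from the weight shapes it is constructed with. A sketch of that derivation, assuming the layouts noted in the diff (w13 as `[E, 2n, k]`, w2 as `[E, k, n]`); `derive_strides` is a hypothetical helper, and the real code fills `torch.int64` tensors of length `E` with these values rather than returning scalars:

```python
# Hypothetical sketch: recover the per-expert grouped-GEMM leading dimensions
# from the MoE weight shapes alone, matching the values the old benchmark
# constructed by hand (ab_strides1=k, ab_strides2=n, c_strides1=2n, c_strides2=k).

def derive_strides(w13_shape, w2_shape):
    e, two_n, k = w13_shape   # first GEMM weights: [E, 2n, k]
    e2, k2, n = w2_shape      # second GEMM weights: [E, k, n]
    assert e == e2 and k == k2 and two_n == 2 * n, "inconsistent MoE shapes"
    return {
        "ab_strides1": k,      # input/weight leading dim, first grouped GEMM
        "ab_strides2": n,      # input/weight leading dim, second grouped GEMM
        "c_strides1": 2 * n,   # first GEMM output is [m, 2n] (gate + up proj)
        "c_strides2": k,       # second GEMM output is [m, k] (hidden size)
    }


print(derive_strides((8, 512, 1024), (8, 1024, 256)))
```

Centralizing this inside the experts class is what lets the call sites shrink from ten-plus positional arguments down to `fn(a, w1, w2, topk_weights, topk_ids)`.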
2 changes: 0 additions & 2 deletions benchmarks/kernels/benchmark_moe.py
@@ -48,8 +48,6 @@ def clear_triton_cache():

# Try to clear Triton's runtime cache
try:
import triton

if (
hasattr(triton, "runtime")
and hasattr(triton.runtime, "cache")
2 changes: 1 addition & 1 deletion docs/design/moe_kernel_features.md
@@ -87,7 +87,7 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
@@ -0,0 +1,5 @@
model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
accuracy_threshold: 0.92
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_high_throughput"
env:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_DEEP_GEMM_MOE: "1"
@@ -0,0 +1,9 @@
model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --disable-uvicorn-access-log"
env:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_DEEP_GEMM_MOE: "1"
VLLM_USE_DEEP_GEMM_E8M0: "0"
@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
env:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_DEEP_GEMM_MOE: "1"