vllm-project · mgoin · Sep 17, 2025 · Aug 8, 2025 · Aug 8, 2025 · Aug 8, 2025
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -13,6 +13,10 @@
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@@ -140,17 +144,20 @@ def run_triton_moe(
         a_fp8_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_fp8_scale,
+        )
+
         for _ in range(num_repeats):
             fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def run_cutlass_moe_fp4(
         device: torch.device,
         num_repeats: int,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
         for _ in range(num_repeats):
             with nvtx.annotate("cutlass_moe_fp4", color="green"):
                 cutlass_moe_fp4(
                     a=a,
-                    a1_gscale=a1_gs,
-                    a2_gscale=a2_gs,
                     w1_fp4=w1_fp4,
-                    w1_blockscale=w1_blockscale,
-                    w1_alphas=w1_gs,
                     w2_fp4=w2_fp4,
-                    w2_blockscale=w2_blockscale,
-                    w2_alphas=w2_gs,
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
                     m=m,
                     n=n,
                     k=k,
                     e=num_experts,
-                    device=device,
+                    quant_config=quant_config,
                 )
 
     def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def run_cutlass_from_graph(
         e: int,
         device: torch.device,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
             return cutlass_moe_fp4(
                 a=a,
-                a1_gscale=a1_gs,
                 w1_fp4=w1_fp4,
-                w1_blockscale=w1_blockscale,
-                w1_alphas=w1_alphas,
-                a2_gscale=a2_gs,
                 w2_fp4=w2_fp4,
-                w2_blockscale=w2_blockscale,
-                w2_alphas=w2_alphas,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 m=m,
                 n=n,
                 k=k,
                 e=num_experts,
-                device=device,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -246,16 +258,18 @@ def run_triton_from_graph(
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
+            quant_config = fp8_w8a8_moe_quant_config(
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
+            )
             return fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )
 
     def replay_graph(graph, num_repeats):

diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,6 +7,7 @@
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
@@ -96,17 +97,19 @@ def run_triton_moe(
         a_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         for _ in range(num_repeats):
             fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe(
@@ -125,21 +128,24 @@ def run_cutlass_moe(
         per_act_token: bool,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         for _ in range(num_repeats):
             cutlass_moe_fp8(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def run_cutlass_from_graph(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -165,14 +177,11 @@ def run_cutlass_from_graph(
                 w2_q,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -185,6 +194,11 @@ def run_triton_from_graph(
         w2_scale: torch.Tensor,
         a_scale: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -194,10 +208,7 @@ def run_triton_from_graph(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def replay_graph(graph, num_repeats):

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
@@ -14,6 +14,10 @@
 import torch
 from ray.experimental.tqdm_ray import tqdm
 
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def prepare(i: int):
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
 
+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = FusedMoEQuantConfig.make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
+        )
+
         with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False
-                )
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                x, input_gating, topk, renormalize=not use_deep_gemm
+            )
+            return fused_experts(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
+            )
 
     # JIT compilation & warmup
     run()
@@ -414,7 +411,7 @@ def benchmark(
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
-        dtype_str = get_config_dtype_str(
+        dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -547,7 +544,7 @@ def save_configs(
     block_quant_shape: list[int],
     save_dir: str,
 ) -> None:
-    dtype_str = get_config_dtype_str(
+    dtype_str = _get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
     )