
Commit 8a224da

Make fused version work with cuda graph
Signed-off-by: Shu Wang <[email protected]>
1 parent c063911 commit 8a224da

File tree

vllm/envs.py
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
vllm/utils/flashinfer.py

4 files changed: +39, -102 lines changed

vllm/envs.py

Lines changed: 0 additions & 3 deletions
@@ -1053,9 +1053,6 @@ def get_vllm_port() -> int | None:
         "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
     )
     == "1",
-    "VLLM_DEEPEPLL_BF16_DISPATCH": lambda: bool(
-        int(os.getenv("VLLM_DEEPEPLL_BF16_DISPATCH", "0"))
-    ),
     # Whether to use marlin kernel in mxfp4 quantization method
     "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
         os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
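
For reference, vllm/envs.py registers each variable as a lazy getter in a dict of lambdas, so deleting the entry above removes the flag entirely. A minimal standalone sketch of that pattern (illustrative only, not the full vllm module, which registers many more variables and resolves them through module-level attribute access):

import os

# Sketch of the dict-of-lambdas pattern used by vllm/envs.py; the removed flag
# is reused here purely as an example.
environment_variables = {
    # Parsed as a boolean from "0"/"1", mirroring the entry deleted above.
    "VLLM_DEEPEPLL_BF16_DISPATCH": lambda: bool(
        int(os.getenv("VLLM_DEEPEPLL_BF16_DISPATCH", "0"))
    ),
}

def read_env(name: str):
    # Evaluate the lazy getter only when the value is actually needed.
    return environment_variables[name]()

print(read_env("VLLM_DEEPEPLL_BF16_DISPATCH"))  # False unless the variable is set to "1"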

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 23 additions & 26 deletions
@@ -114,31 +114,30 @@ def _do_quant(
         assert isinstance(x, torch.Tensor)

         num_experts, max_tokens, hidden_dim = x.size()
-        if not envs.VLLM_DEEPEPLL_BF16_DISPATCH:
-            # TODO (varun): Optimization - Use a batched version of quant
-            x = x.view((-1, hidden_dim))
-            x, x_scales = moe_kernel_quantize_input(
-                x,
-                quant_config.a1_scale,
-                quant_config.quant_dtype,
-                quant_config.per_act_token_quant,
-                quant_config.block_shape,
-            )
-            x = x.view((num_experts, -1, hidden_dim))
-
-            if quant_config.quant_dtype is not None:
-                assert x_scales is not None
-                x_scales = normalize_batched_scales_shape(x_scales, num_experts)
-        else:
-            # BF16 dispatch path - no quantization
-            # TODO([email protected]): enable nvfp4 dispatch once DEEPEP is ready.
-            logger.info_once("Using BF16 dispatch path for DeepEPLLPrepareAndFinalize")
-            assert x.dtype == torch.bfloat16, (
-                "BF16 dispatch requires input to be in BF16"
+
+        # TODO (varun): Optimization - Use a batched version of quant
+        x = x.view((-1, hidden_dim))
+        q_dtype = quant_config.quant_dtype
+
+        if envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl":
+            logger.info_once(
+                "Skip quantization when using FlashInfer CUTEDSL for "
+                "ModelOptNvFp4FusedMoE."
             )
-            x_scales = None
-            x = x.view((num_experts, -1, hidden_dim))
-        # print(f"after deepepll: x.shape = {x.shape}")
+            q_dtype = None
+
+        x, x_scales = moe_kernel_quantize_input(
+            x,
+            quant_config.a1_scale,
+            q_dtype,
+            quant_config.per_act_token_quant,
+            quant_config.block_shape,
+        )
+        x = x.view((num_experts, -1, hidden_dim))
+
+        if q_dtype is not None:
+            assert x_scales is not None
+            x_scales = normalize_batched_scales_shape(x_scales, num_experts)

         return x, x_scales

@@ -276,8 +275,6 @@ def _finalize(

         # TODO (varun) : Enable zero copy mode
         dbo_maybe_run_recv_hook()
-        # print("xxx"*100, fused_expert_output.shape)
-        # print("ttt"*100, fused_expert_output.dtype)
         _, _, recv_hook = self.buffer.low_latency_combine(
             fused_expert_output,
             topk_ids,
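
After this hunk, _do_quant has a single code path: when the FlashInfer CUTEDSL backend is selected, quantization is skipped by passing a None quant dtype instead of branching into a separate BF16 dispatch block, which lines up with the commit's stated goal of working with CUDA graphs. A minimal sketch of the resulting control flow, with a placeholder standing in for vllm's moe_kernel_quantize_input and scale normalization:

import torch

# Placeholder standing in for vllm's moe_kernel_quantize_input.
def fake_quantize(x, dtype):
    return x.to(dtype), torch.ones(1)

def do_quant_sketch(x: torch.Tensor, quant_dtype, use_flashinfer_cutedsl: bool):
    num_experts, max_tokens, hidden_dim = x.size()
    x = x.view((-1, hidden_dim))

    # With the CUTEDSL backend, dispatch stays unquantized here; quantization
    # happens later inside the fused FlashInfer kernel.
    q_dtype = None if use_flashinfer_cutedsl else quant_dtype

    x_scales = None
    if q_dtype is not None:
        x, x_scales = fake_quantize(x, q_dtype)
        x_scales = x_scales.expand(num_experts)  # stand-in for scale normalization

    return x.view((num_experts, -1, hidden_dim)), x_scales

out, scales = do_quant_sketch(
    torch.randn(4, 8, 16, dtype=torch.bfloat16), torch.float8_e4m3fn, True
)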

vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py

Lines changed: 7 additions & 65 deletions
@@ -13,8 +13,8 @@
 from vllm.utils.flashinfer import (
     flashinfer_cutedsl_grouped_gemm_nt_masked,
     has_flashinfer_cutedsl_grouped_gemm_nt_masked,
-    nvfp4_batched_quantize,
-    silu_and_mul,
+    scaled_fp4_grouped_quantize,
+    silu_and_mul_scaled_nvfp4_experts_quantize,
 )

 logger = init_logger(__name__)

@@ -110,18 +110,9 @@ def workspace_shapes(
         - Note: in order for activation chunking to work, the first dimension
         of each tuple must be the number of tokens.
         """
-        # assert a.dim() == 2
-        # assert aq.dim() == 3
-        # output_shape = aq.shape
-        # workspace_dtype = a.dtype
-        # E = aq.size(0)
-        # workspace2 = (E, M, N)
-        # workspace1 = output_shape
         output_shape = (local_num_experts, M, K)
         workspace2 = (local_num_experts, M, N)
         workspace1 = output_shape
-        # The workspace is determined by `aq`, since it comes after any
-        # potential communication op and is involved in the expert computation.
         return (workspace1, workspace2, output_shape)

     def apply(

@@ -182,54 +173,6 @@ def get_cute_dtype(input: torch.Tensor) -> str:
     raise ValueError(f"Unsupported cute dtype {input.dtype}")


-def scaled_fp4_grouped_quant(
-    input_tensor: torch.Tensor,
-    input_global_scale: torch.Tensor,
-    mask: torch.Tensor,
-):
-    """
-    Wrapper around nvfp4_batched_quantize
-
-    Args:
-        input_tensor (Tensor):
-        - Shape (l, m, k)
-        input_global_scale (Tensor): Shape (l,)
-        mask (Tensor): Mask tensor, broadcastable
-
-    Returns:
-        output (Tensor): Quantized tensor, logical shape (m, k//2, l)
-        output_scales (Tensor): Blockscale tensor, logical shape
-            (32, 4, rm, 4, rk, l)
-    """
-    num_experts, m, k = input_tensor.shape
-
-    sf_vec_size = 16
-    assert k % sf_vec_size == 0, f"k must be multiple of 16, but got {k}."
-
-    scale_k = k // sf_vec_size
-    padded_k = (scale_k + (4 - 1)) // 4 * 4
-    padded_m = (m + (128 - 1)) // 128 * 128
-
-    aq, aq_sf = nvfp4_batched_quantize(
-        input_tensor,
-        input_global_scale,
-        mask=mask,
-    )
-
-    # --- re-layout quantized tensor ---
-    # physical (l, m, k//2) -> logical (m, k//2, l)
-    output = aq.permute(1, 2, 0)
-
-    # --- re-layout blockscales ---
-    # physical (l, rm, rk, 32, 4, 4) -> logical (32, 4, rm, 4, rk, l)
-    output_scales = aq_sf.view(torch.float8_e4m3fn).view(
-        num_experts, padded_m // 128, padded_k // 4, 32, 4, 4
-    )
-    output_scales = output_scales.permute(3, 4, 1, 5, 2, 0)
-
-    return output, output_scales
-
-
 def flashinfer_cutedsl_moe_masked(
     hidden_states: torch.Tensor,
     input_global_scale: torch.Tensor,

@@ -313,10 +256,10 @@ def flashinfer_cutedsl_moe_masked(
         f"w2_alpha must be (l,), got {w2_alpha.shape}"
     )

-    aq, aq_sf = scaled_fp4_grouped_quant(
+    aq, aq_sf = scaled_fp4_grouped_quantize(
         hidden_states,
-        input_global_scale,
         masked_m,
+        input_global_scale,
     )

     workspace = workspace.permute(1, 2, 0)  # requirement of kernel

@@ -343,11 +286,10 @@ def flashinfer_cutedsl_moe_masked(
     )  # in logical [m, n, l]

     # SILU and quantization
-
-    diq, diq_sf = scaled_fp4_grouped_quant(
-        silu_and_mul(workspace.permute(2, 0, 1)),
-        a2_global_scale,
+    diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize(
+        workspace.permute(2, 0, 1),
         masked_m,
+        a2_global_scale,
     )

     # Gemm2
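
The deleted scaled_fp4_grouped_quant helper appears to have existed mainly to re-lay-out nvfp4_batched_quantize outputs into the logical shapes the CuTe DSL grouped GEMM expects; the commit replaces it with FlashInfer's scaled_fp4_grouped_quantize and silu_and_mul_scaled_nvfp4_experts_quantize, with the call sites now passing masked_m before the global scale. A pure-torch illustration of the re-layout the old wrapper performed, using dummy tensors in place of real quantizer outputs:

import torch

# Shapes follow the deleted docstring: l experts, m tokens per expert, k hidden size.
l, m, k = 4, 128, 256
sf_vec_size = 16
scale_k = k // sf_vec_size
padded_k = (scale_k + 3) // 4 * 4       # round rk up to a multiple of 4
padded_m = (m + 127) // 128 * 128       # round rm up to a multiple of 128

# Dummy stand-ins for the quantizer outputs (packed fp4 payload + blockscales).
aq = torch.zeros(l, m, k // 2, dtype=torch.uint8)
aq_sf = torch.zeros(l, padded_m // 128, padded_k // 4, 32, 4, 4, dtype=torch.uint8)

# physical (l, m, k//2)          -> logical (m, k//2, l)
output = aq.permute(1, 2, 0)
# physical (l, rm, rk, 32, 4, 4) -> logical (32, 4, rm, 4, rk, l)
output_scales = aq_sf.permute(3, 4, 1, 5, 2, 0)

print(output.shape, output_scales.shape)  # (128, 128, 4) and (32, 4, 1, 4, 4, 4)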

vllm/utils/flashinfer.py

Lines changed: 9 additions & 8 deletions
@@ -101,10 +101,12 @@ def wrapper(*args, **kwargs):
 )
 flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
 nvfp4_batched_quantize = _lazy_import_wrapper("flashinfer", "nvfp4_batched_quantize")
-silu_and_mul_nvfp4_batched_quantize = _lazy_import_wrapper(
-    "flashinfer", "silu_and_mul_nvfp4_batched_quantize"
+silu_and_mul_scaled_nvfp4_experts_quantize = _lazy_import_wrapper(
+    "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"
+)
+scaled_fp4_grouped_quantize = _lazy_import_wrapper(
+    "flashinfer", "scaled_fp4_grouped_quantize"
 )
-silu_and_mul = _lazy_import_wrapper("flashinfer", "silu_and_mul")
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
     "flashinfer", "nvfp4_block_scale_interleave"
 )

@@ -194,8 +196,8 @@ def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool:
     # Check if all required functions are available
     required_functions = [
         ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"),
-        ("flashinfer", "silu_and_mul"),
-        ("flashinfer", "nvfp4_batched_quantize"),
+        ("flashinfer", "scaled_fp4_grouped_quantize"),
+        ("flashinfer", "silu_and_scaled_nvfp4_experts_quantize"),
     ]

     for module_name, attr_name in required_functions:

@@ -482,9 +484,8 @@ def flashinfer_disable_q_quantization() -> bool:
     "flashinfer_cutlass_fused_moe",
     "flashinfer_cutedsl_grouped_gemm_nt_masked",
     "flashinfer_fp4_quantize",
-    "silu_and_mul_nvfp4_batched_quantize",
-    "silu_and_mul",
-    "nvfp4_batched_quantize",
+    "silu_and_mul_scaled_nvfp4_experts_quantize",
+    "scaled_fp4_grouped_quantize",
     "nvfp4_block_scale_interleave",
     "trtllm_fp4_block_scale_moe",
     "autotune",

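All of the names touched here are bound through vllm's _lazy_import_wrapper, so a rename only surfaces when the wrapped function is first called or when has_flashinfer_cutedsl_grouped_gemm_nt_masked probes for it. A minimal sketch of the lazy-binding idea (illustrative only; vllm's real helper differs, for example in how it handles a missing flashinfer install):

import importlib
from typing import Any, Callable

def lazy_import_wrapper(module_name: str, attr_name: str) -> Callable[..., Any]:
    # Defer the import until the first call so merely importing the utility
    # module does not require flashinfer to be installed.
    def wrapper(*args, **kwargs):
        module = importlib.import_module(module_name)
        fn = getattr(module, attr_name)  # attribute name must match flashinfer's API
        return fn(*args, **kwargs)
    return wrapper

# Bindings mirroring the ones added in this commit.
scaled_fp4_grouped_quantize = lazy_import_wrapper("flashinfer", "scaled_fp4_grouped_quantize")
silu_and_mul_scaled_nvfp4_experts_quantize = lazy_import_wrapper(
    "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"
)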