
Commit c063911

Add flashinfer_cutedsl grouped gemm

Signed-off-by: Shu Wang <[email protected]>

1 parent: 2935092

File tree: 9 files changed, +1031 −38 lines changed


tests/kernels/moe/test_cutedsl_moe.py

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.
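The test file is collapsed in this view, so only its size is visible. For orientation, the grouped GEMM being exercised multiplies each token's activations by the weight matrix of the expert it was routed to; the following is a hedged, pure-PyTorch sketch of that contract with illustrative names, not code taken from test_cutedsl_moe.py:

import torch


def reference_grouped_gemm(
    hidden: torch.Tensor,      # [num_tokens, hidden_dim] activations
    weights: torch.Tensor,     # [num_experts, hidden_dim, out_dim] expert weights
    expert_ids: torch.Tensor,  # [num_tokens] expert index assigned to each token
) -> torch.Tensor:
    # Loop-based reference: one GEMM per expert over the tokens routed to it.
    out = hidden.new_zeros(hidden.shape[0], weights.shape[-1])
    for e in range(weights.shape[0]):
        mask = expert_ids == e
        if mask.any():
            out[mask] = hidden[mask] @ weights[e]
    return out

A kernel-level test would typically compare the fused grouped-GEMM output against a reference like this with a dtype-appropriate tolerance.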

vllm/envs.py

Lines changed: 9 additions & 2 deletions
@@ -156,7 +156,9 @@
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "cutedsl"] = (
+        "throughput"
+    )
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False

@@ -1051,6 +1053,9 @@ def get_vllm_port() -> int | None:
         "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
     )
     == "1",
+    "VLLM_DEEPEPLL_BF16_DISPATCH": lambda: bool(
+        int(os.getenv("VLLM_DEEPEPLL_BF16_DISPATCH", "0"))
+    ),
     # Whether to use marlin kernel in mxfp4 quantization method
     "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
         os.environ.get("VLLM_MXFP4_USE_MARLIN", None)

@@ -1199,7 +1204,9 @@ def get_vllm_port() -> int | None:
     # - "latency":
     #   Uses TensorRT-LLM kernels optimized for low-latency inference.
     "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput", ["throughput", "latency"]
+        "VLLM_FLASHINFER_MOE_BACKEND",
+        "throughput",
+        ["throughput", "latency", "cutedsl"],
     ),
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
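Taken together, the envs.py changes register one new backend choice ("cutedsl") and one new boolean flag (VLLM_DEEPEPLL_BF16_DISPATCH). A minimal usage sketch, assuming vLLM's envs module resolves these lazily on attribute access as it does for the existing flags; the branch bodies below are hypothetical placeholders, not code from this commit:

import os

# Opt into the new FlashInfer CuteDSL MoE backend and BF16 DeepEP-LL dispatch.
os.environ["VLLM_FLASHINFER_MOE_BACKEND"] = "cutedsl"
os.environ["VLLM_DEEPEPLL_BF16_DISPATCH"] = "1"

from vllm import envs

if envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl":
    pass  # MoE layers would route grouped GEMMs through the CuteDSL kernels here.
if envs.VLLM_DEEPEPLL_BF16_DISPATCH:
    pass  # DeepEP low-latency prepare/finalize skips activation quantization.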

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 31 additions & 15 deletions
@@ -6,6 +6,8 @@
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import envs
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,

@@ -24,6 +26,8 @@
 DEEPEP_QUANT_BLOCK_SIZE = 128
 DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]

+logger = init_logger(__name__)
+

 def dequant_fp8(
     expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor

@@ -110,21 +114,31 @@ def _do_quant(
         assert isinstance(x, torch.Tensor)

         num_experts, max_tokens, hidden_dim = x.size()
-
-        # TODO (varun): Optimization - Use a batched version of quant
-        x = x.view((-1, hidden_dim))
-        x, x_scales = moe_kernel_quantize_input(
-            x,
-            quant_config.a1_scale,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-        )
-        x = x.view((num_experts, -1, hidden_dim))
-
-        if quant_config.quant_dtype is not None:
-            assert x_scales is not None
-            x_scales = normalize_batched_scales_shape(x_scales, num_experts)
+        if not envs.VLLM_DEEPEPLL_BF16_DISPATCH:
+            # TODO (varun): Optimization - Use a batched version of quant
+            x = x.view((-1, hidden_dim))
+            x, x_scales = moe_kernel_quantize_input(
+                x,
+                quant_config.a1_scale,
+                quant_config.quant_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
+            )
+            x = x.view((num_experts, -1, hidden_dim))
+
+            if quant_config.quant_dtype is not None:
+                assert x_scales is not None
+                x_scales = normalize_batched_scales_shape(x_scales, num_experts)
+        else:
+            # BF16 dispatch path - no quantization
+            # TODO([email protected]): enable nvfp4 dispatch once DEEPEP is ready.
+            logger.info_once("Using BF16 dispatch path for DeepEPLLPrepareAndFinalize")
+            assert x.dtype == torch.bfloat16, (
+                "BF16 dispatch requires input to be in BF16"
+            )
+            x_scales = None
+            x = x.view((num_experts, -1, hidden_dim))
+            # print(f"after deepepll: x.shape = {x.shape}")

         return x, x_scales

@@ -262,6 +276,8 @@ def _finalize(

         # TODO (varun) : Enable zero copy mode
         dbo_maybe_run_recv_hook()
+        # print("xxx"*100, fused_expert_output.shape)
+        # print("ttt"*100, fused_expert_output.dtype)
         _, _, recv_hook = self.buffer.low_latency_combine(
             fused_expert_output,
             topk_ids,
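The net effect of the _do_quant change is a contract on the dispatched tensors: with VLLM_DEEPEPLL_BF16_DISPATCH enabled, activations stay in BF16 and no scales are produced; otherwise the existing quantization path runs unchanged. A standalone sketch of the BF16 branch's shape and dtype invariants, using dummy tensors rather than the actual vLLM call path:

import torch

num_experts, max_tokens, hidden_dim = 4, 8, 256
x = torch.randn(num_experts, max_tokens, hidden_dim, dtype=torch.bfloat16)

# BF16 dispatch path: no quantization, no scales, only a reshape round-trip.
assert x.dtype == torch.bfloat16, "BF16 dispatch requires input to be in BF16"
x_scales = None
x = x.view((-1, hidden_dim)).view((num_experts, -1, hidden_dim))

assert x.shape == (num_experts, max_tokens, hidden_dim)
assert x_scales is None  # downstream experts receive unquantized BF16 activations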
