18 changes: 2 additions & 16 deletions .buildkite/test_areas/kernels.yaml
@@ -200,14 +200,7 @@ steps:
timeout_in_minutes: 90
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/quantization/
- vllm/distributed/device_communicators/
- vllm/config
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py

@@ -216,13 +209,6 @@
timeout_in_minutes: 90
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/quantization/
- vllm/distributed/device_communicators/
- vllm/config
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py
8 changes: 7 additions & 1 deletion tests/kernels/moe/modular_kernel_tools/common.py
@@ -46,7 +46,6 @@
has_deep_gemm,
has_mori,
)
from vllm.utils.math_utils import next_power_of_2

from .mk_objects import (
TestMoEQuantConfig,
@@ -605,6 +604,13 @@ def make_modular_kernel(
vllm_config: VllmConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEKernel:
def next_power_of_2(x):
import math

if x == 0:
return 1
return 2 ** math.ceil(math.log2(x))

# make moe config
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=get_tensor_model_parallel_world_size(),
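Both this hunk and the one in tests/kernels/moe/test_deepep_deepgemm_moe.py below inline the same helper in place of the deleted vllm.utils.math_utils.next_power_of_2 import. A self-contained sketch of what it computes, with illustrative checks that are not part of the diff:

```python
import math

def next_power_of_2(x):
    # Smallest power of two >= x; 0 maps to 1 by convention.
    if x == 0:
        return 1
    return 2 ** math.ceil(math.log2(x))

assert next_power_of_2(0) == 1
assert next_power_of_2(1) == 1
assert next_power_of_2(222) == 256  # the M=222 shape used in test_moe_layer.py
assert next_power_of_2(512) == 512
```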
2 changes: 1 addition & 1 deletion tests/kernels/moe/modular_kernel_tools/parallel_utils.py
@@ -126,7 +126,7 @@ def parallel_launch_with_config(
world_size: int,
worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig, Any, P], None],
vllm_config: VllmConfig,
env_dict: dict[Any, Any] | None,
env_dict: dict[Any, Any],
*args: P.args,
**kwargs: P.kwargs,
) -> None:
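With env_dict no longer Optional, every call site supplies an explicit mapping. A minimal usage sketch under that assumption (import paths and the trivial worker are illustrative; the real suite passes _parallel_worker and richer arguments):

```python
# Assumed import paths, based on the file locations in this PR:
from modular_kernel_tools.parallel_utils import (
    ProcessGroupInfo,
    parallel_launch_with_config,
)
from vllm.config import VllmConfig

def _worker(pg_info: ProcessGroupInfo, vllm_config: VllmConfig, *args) -> None:
    ...  # hypothetical worker body; the real tests pass _parallel_worker

vllm_config = VllmConfig()  # default-constructed config for illustration

# env_dict is now a required positional argument; an explicit mapping is
# always supplied (test_moe_layer.py passes {"VLLM_MOE_DP_CHUNK_SIZE": "128"}).
parallel_launch_with_config(
    2,                                  # world_size
    _worker,
    vllm_config,
    {"VLLM_MOE_DP_CHUNK_SIZE": "128"},  # env_dict
)
```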
9 changes: 8 additions & 1 deletion tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -29,7 +29,6 @@
is_deep_gemm_supported,
)
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager

@@ -85,6 +84,14 @@ def with_dp_metadata(M: int, world_size: int):
yield


def next_power_of_2(x):
import math

if x == 0:
return 1
return 2 ** math.ceil(math.log2(x))


def make_block_quant_fp8_weights(
e: int,
n: int,
3 changes: 0 additions & 3 deletions tests/kernels/moe/test_flashinfer.py
@@ -32,7 +32,6 @@
from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
from vllm.model_executor.models.llama4 import Llama4MoE
from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed

try:
@@ -175,7 +174,6 @@ def make_moe_tensors_8bit(
routing_method=layer.routing_method_type,
activation=activation,
device=w13_quantized.device,
max_num_tokens=next_power_of_2(m),
)

return TestData(
@@ -350,7 +348,6 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
in_dtype=torch.bfloat16,
is_act_and_mul=activation.is_gated,
routing_method=RoutingMethodType.TopK,
max_num_tokens=next_power_of_2(m),
)

kernel = mk.FusedMoEKernel(
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_flashinfer_moe.py
@@ -29,7 +29,6 @@
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed

if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
@@ -106,7 +105,6 @@ def test_flashinfer_fp4_moe_no_graph(
in_dtype=dtype,
is_act_and_mul=is_gated_act,
routing_method=RoutingMethodType.TopK,
max_num_tokens=next_power_of_2(m),
)

flashinfer_experts = FusedMoEKernel(
3 changes: 1 addition & 2 deletions tests/kernels/moe/test_moe.py
@@ -59,7 +59,6 @@
from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager

@@ -1677,7 +1676,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
in_dtype=dtype,
is_act_and_mul=True,
routing_method=RoutingMethodType.Renormalize,
max_num_tokens=next_power_of_2(m),
max_num_tokens=m,
)

with set_current_vllm_config(vllm_config):
23 changes: 10 additions & 13 deletions tests/kernels/moe/test_moe_layer.py
@@ -26,7 +26,6 @@
from vllm.config import (
CompilationConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
@@ -54,7 +53,7 @@
has_flashinfer_nvlink_two_sided,
)
from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep
from vllm.utils.math_utils import cdiv, next_power_of_2
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import (
init_workspace_manager,
@@ -66,9 +65,8 @@
SHAPE_COMBOS = [
(1, 128, 256),
(32, 1024, 512),
(222, 2048, 2048),
(222, 2048, 2048), # should be big enough to exercise DP chunking
]
MAX_M = max([x[0] for x in SHAPE_COMBOS])

NUM_EXPERTS = [8, 64]
TOP_KS = [2, 6]
@@ -114,7 +112,7 @@
"mori": {None, "fp8", "modelopt_fp8"},
"flashinfer_nvlink_two_sided": {None, "modelopt_fp8", "modelopt_fp4"},
"flashinfer_nvlink_one_sided": {None, "modelopt_fp8", "modelopt_fp4"},
"deepep_low_latency": {None, "modelopt_fp8", "modelopt_fp4"},
"deepep_low_latency": {None, "fp8", "modelopt_fp8", "modelopt_fp4"},
"deepep_high_throughput": {None, "fp8", "modelopt_fp8", "modelopt_fp4"},
"nixl_ep": {None, "fp8", "modelopt_fp8"},
}
@@ -365,9 +363,9 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]:
)

# routed_input_transform + quantization + high hidden dimensions
# TODO: Disable >= 2048 for now due to insane errors.
# TODO: Disable k >= 2048 with fp8 + DeepEP low-latency for now due to large numerical errors.
if (
config.use_routed_input_transform
(config.use_routed_input_transform or config.backend == "deepep_low_latency")
and config.quantization is not None
and config.k >= 2048
):
@@ -1665,6 +1663,9 @@ def test_moe_layer(

verbosity = pytestconfig.getoption("verbose")

test_env = dict()
test_env["VLLM_MOE_DP_CHUNK_SIZE"] = "128"
monkeypatch.setenv("VLLM_MOE_DP_CHUNK_SIZE", "128")
if os.environ.get("VLLM_LOGGING_LEVEL") is None:
monkeypatch.setenv("VLLM_LOGGING_LEVEL", "ERROR")

@@ -1689,11 +1690,7 @@
compilation_config.pass_config.fuse_allreduce_rms = False # for now

vllm_config = VllmConfig(
parallel_config=parallel_config,
compilation_config=compilation_config,
scheduler_config=SchedulerConfig.default_factory(
max_num_batched_tokens=next_power_of_2(MAX_M)
),
parallel_config=parallel_config, compilation_config=compilation_config
)

test_configs = generate_valid_test_configs(
Expand Down Expand Up @@ -1721,7 +1718,7 @@ def test_moe_layer(
world_size,
_parallel_worker,
vllm_config,
None,
test_env,
test_configs,
verbosity,
)
1 change: 0 additions & 1 deletion tests/kernels/moe/utils.py
@@ -69,7 +69,6 @@ def make_dummy_moe_config(
in_dtype=in_dtype,
device="cuda",
routing_method=RoutingMethodType.TopK,
max_num_tokens=512,
)


12 changes: 0 additions & 12 deletions vllm/config/parallel.py
@@ -622,18 +622,6 @@ def use_sequence_parallel_moe(self) -> bool:
and self.data_parallel_size > 1
)

@property
def use_batched_dp_moe(self) -> bool:
return (
self.all2all_backend
in (
"deepep_low_latency",
"nixl_ep",
)
and self.enable_expert_parallel
and self.data_parallel_size > 1
)

@property
def node_rank_within_dp(self) -> int:
return self.node_rank % self.nnodes_within_dp
1 change: 0 additions & 1 deletion vllm/config/scheduler.py
@@ -40,7 +40,6 @@ class SchedulerConfig:
"""

DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_BATCHED_TOKENS_FOR_BATCHED_DP: ClassVar[int] = 256
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

runner_type: RunnerType = "generate"
23 changes: 7 additions & 16 deletions vllm/engine/arg_utils.py
@@ -1604,6 +1604,9 @@ def create_engine_config(

self._check_feature_supported()
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
self._set_default_max_num_seqs_and_batched_tokens_args(
usage_context, model_config
)
self._set_default_reasoning_config_args()
sliding_window: int | None = None
if not is_interleaved(model_config.hf_text_config):
@@ -1857,12 +1860,6 @@
target_parallel_config=parallel_config,
)

self._set_default_max_num_seqs_and_batched_tokens_args(
usage_context,
model_config,
parallel_config,
)

assert self.max_num_batched_tokens is not None, (
"max_num_batched_tokens must be set by this point"
)
@@ -2278,7 +2275,6 @@ def _set_default_max_num_seqs_and_batched_tokens_args(
self,
usage_context: UsageContext | None,
model_config: ModelConfig,
parallel_config: ParallelConfig,
):
world_size = self.pipeline_parallel_size * self.tensor_parallel_size
(
@@ -2290,15 +2286,10 @@
orig_max_num_seqs = self.max_num_seqs

if self.max_num_batched_tokens is None:
if parallel_config.use_batched_dp_moe:
self.max_num_batched_tokens = (
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS_FOR_BATCHED_DP
)
else:
self.max_num_batched_tokens = default_max_num_batched_tokens.get(
usage_context,
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)
self.max_num_batched_tokens = default_max_num_batched_tokens.get(
usage_context,
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)

if self.max_num_seqs is None:
self.max_num_seqs = default_max_num_seqs.get(
11 changes: 11 additions & 0 deletions vllm/envs.py
@@ -146,6 +146,8 @@
VLLM_ENABLE_PREGRAD_PASSES: bool = False
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_MOE_DP_CHUNK_SIZE: int = 256
VLLM_ENABLE_MOE_DP_CHUNK: bool = True
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY: str = ""
@@ -1138,6 +1140,15 @@ def _get_or_set_default() -> str:
"VLLM_DP_MASTER_IP": lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
# Port of the master node in the data parallel setting
"VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
# When executing MoE models with data parallelism, expert parallelism, and
# batched all-to-all dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE caps
# the number of tokens that can be dispatched from a DP rank at a time.
# All DP ranks process their activations in chunks of VLLM_MOE_DP_CHUNK_SIZE
# tokens.
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
"VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool(
int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1"))
),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: (
os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1"
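For intuition, a minimal sketch of the lock-step chunking these variables control, under the assumption that every DP rank iterates over the largest rank's token count (moe_forward_chunked and the arithmetic stand-in for the experts are illustrative, not vLLM internals):

```python
import math
import torch

def moe_forward_chunked(
    hidden_states: torch.Tensor,
    max_tokens_across_dp: int,
    chunk_size: int = 256,  # VLLM_MOE_DP_CHUNK_SIZE
) -> torch.Tensor:
    # Every DP rank runs the same number of iterations, so batched
    # all-to-all dispatch/combine calls stay in lock-step even when this
    # rank holds fewer tokens than its peers.
    out = torch.empty_like(hidden_states)
    num_chunks = math.ceil(max_tokens_across_dp / chunk_size)
    for i in range(num_chunks):
        start = min(i * chunk_size, hidden_states.shape[0])
        end = min(start + chunk_size, hidden_states.shape[0])
        chunk = hidden_states[start:end]  # may be empty on this rank
        out[start:end] = chunk * 2.0      # stand-in for dispatch -> experts -> combine
    return out

# A rank with 222 tokens while the widest DP rank has 500 runs
# ceil(500 / 128) = 4 iterations, the last two with empty chunks.
x = torch.randn(222, 64)
y = moe_forward_chunked(x, max_tokens_across_dp=500, chunk_size=128)
assert torch.allclose(y, x * 2.0)
```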