18 changes: 2 additions & 16 deletions .buildkite/test_areas/kernels.yaml
@@ -200,14 +200,7 @@ steps:
timeout_in_minutes: 90
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/quantization/
- vllm/distributed/device_communicators/
- vllm/config
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py

@@ -216,13 +209,6 @@
timeout_in_minutes: 90
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/quantization/
- vllm/distributed/device_communicators/
- vllm/config
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py
8 changes: 7 additions & 1 deletion tests/kernels/moe/modular_kernel_tools/common.py
@@ -46,7 +46,6 @@
has_deep_gemm,
has_mori,
)
from vllm.utils.math_utils import next_power_of_2

from .mk_objects import (
TestMoEQuantConfig,
@@ -605,6 +604,13 @@ def make_modular_kernel(
vllm_config: VllmConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEKernel:
def next_power_of_2(x):
import math

if x == 0:
return 1
return 2 ** math.ceil(math.log2(x))

# make moe config
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=get_tensor_model_parallel_world_size(),
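Both this hunk and the one in tests/kernels/moe/test_deepep_deepgemm_moe.py below inline the same helper in place of the deleted vllm.utils.math_utils.next_power_of_2 import. A self-contained sketch of what it computes, with illustrative checks that are not part of the diff:

```python
import math

def next_power_of_2(x):
    # Smallest power of two >= x; 0 maps to 1 by convention.
    if x == 0:
        return 1
    return 2 ** math.ceil(math.log2(x))

assert next_power_of_2(0) == 1
assert next_power_of_2(1) == 1
assert next_power_of_2(222) == 256  # the M=222 shape used in test_moe_layer.py
assert next_power_of_2(512) == 512
```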
2 changes: 1 addition & 1 deletion tests/kernels/moe/modular_kernel_tools/parallel_utils.py
@@ -126,7 +126,7 @@ def parallel_launch_with_config(
world_size: int,
worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig, Any, P], None],
vllm_config: VllmConfig,
env_dict: dict[Any, Any] | None,
env_dict: dict[Any, Any],
*args: P.args,
**kwargs: P.kwargs,
) -> None:
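With env_dict no longer Optional, every call site supplies an explicit mapping. A minimal usage sketch under that assumption (import paths and the trivial worker are illustrative; the real suite passes _parallel_worker and richer arguments):

```python
# Assumed import paths, based on the file locations in this PR:
from modular_kernel_tools.parallel_utils import (
    ProcessGroupInfo,
    parallel_launch_with_config,
)
from vllm.config import VllmConfig

def _worker(pg_info: ProcessGroupInfo, vllm_config: VllmConfig, *args) -> None:
    ...  # hypothetical worker body; the real tests pass _parallel_worker

vllm_config = VllmConfig()  # default-constructed config for illustration

# env_dict is now a required positional argument; an explicit mapping is
# always supplied (test_moe_layer.py passes {"VLLM_MOE_DP_CHUNK_SIZE": "128"}).
parallel_launch_with_config(
    2,                                  # world_size
    _worker,
    vllm_config,
    {"VLLM_MOE_DP_CHUNK_SIZE": "128"},  # env_dict
)
```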
9 changes: 8 additions & 1 deletion tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -29,7 +29,6 @@
is_deep_gemm_supported,
)
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager

@@ -85,6 +84,14 @@ def with_dp_metadata(M: int, world_size: int):
yield


def next_power_of_2(x):
import math

if x == 0:
return 1
return 2 ** math.ceil(math.log2(x))


def make_block_quant_fp8_weights(
e: int,
n: int,
3 changes: 0 additions & 3 deletions tests/kernels/moe/test_flashinfer.py
@@ -32,7 +32,6 @@
from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
from vllm.model_executor.models.llama4 import Llama4MoE
from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed

try:
@@ -175,7 +174,6 @@ def make_moe_tensors_8bit(
routing_method=layer.routing_method_type,
activation=activation,
device=w13_quantized.device,
max_num_tokens=next_power_of_2(m),
)

return TestData(
@@ -350,7 +348,6 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
in_dtype=torch.bfloat16,
is_act_and_mul=activation.is_gated,
routing_method=RoutingMethodType.TopK,
max_num_tokens=next_power_of_2(m),
)

kernel = mk.FusedMoEKernel(
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_flashinfer_moe.py
@@ -29,7 +29,6 @@
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed

if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
@@ -106,7 +105,6 @@ def test_flashinfer_fp4_moe_no_graph(
in_dtype=dtype,
is_act_and_mul=is_gated_act,
routing_method=RoutingMethodType.TopK,
max_num_tokens=next_power_of_2(m),
)

flashinfer_experts = FusedMoEKernel(
3 changes: 1 addition & 2 deletions tests/kernels/moe/test_moe.py
@@ -59,7 +59,6 @@
from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager

@@ -1677,7 +1676,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
in_dtype=dtype,
is_act_and_mul=True,
routing_method=RoutingMethodType.Renormalize,
max_num_tokens=next_power_of_2(m),
max_num_tokens=m,
)

with set_current_vllm_config(vllm_config):
23 changes: 10 additions & 13 deletions tests/kernels/moe/test_moe_layer.py
@@ -26,7 +26,6 @@
from vllm.config import (
CompilationConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
@@ -54,7 +53,7 @@
has_flashinfer_nvlink_two_sided,
)
from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep
from vllm.utils.math_utils import cdiv, next_power_of_2
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import (
init_workspace_manager,
@@ -66,9 +65,8 @@
SHAPE_COMBOS = [
(1, 128, 256),
(32, 1024, 512),
(222, 2048, 2048),
(222, 2048, 2048), # should be big enough to exercise DP chunking
]
MAX_M = max([x[0] for x in SHAPE_COMBOS])

NUM_EXPERTS = [8, 64]
TOP_KS = [2, 6]
@@ -114,7 +112,7 @@
"mori": {None, "fp8", "modelopt_fp8"},
"flashinfer_nvlink_two_sided": {None, "modelopt_fp8", "modelopt_fp4"},
"flashinfer_nvlink_one_sided": {None, "modelopt_fp8", "modelopt_fp4"},
"deepep_low_latency": {None, "modelopt_fp8", "modelopt_fp4"},
"deepep_low_latency": {None, "fp8", "modelopt_fp8", "modelopt_fp4"},
"deepep_high_throughput": {None, "fp8", "modelopt_fp8", "modelopt_fp4"},
"nixl_ep": {None, "fp8", "modelopt_fp8"},
}
@@ -365,9 +363,9 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]:
)

# routed_input_transform + quantization + high hidden dimensions
# TODO: Disable >= 2048 for now due to insane errors.
# TODO: Disable k >= 2048 with fp8 + DeepEP low-latency for now due to large numerical errors.
if (
config.use_routed_input_transform
(config.use_routed_input_transform or config.backend == "deepep_low_latency")
and config.quantization is not None
and config.k >= 2048
):
@@ -1665,6 +1663,9 @@ def test_moe_layer(

verbosity = pytestconfig.getoption("verbose")

test_env = dict()
test_env["VLLM_MOE_DP_CHUNK_SIZE"] = "128"
monkeypatch.setenv("VLLM_MOE_DP_CHUNK_SIZE", "128")
if os.environ.get("VLLM_LOGGING_LEVEL") is None:
monkeypatch.setenv("VLLM_LOGGING_LEVEL", "ERROR")

@@ -1689,11 +1690,7 @@
compilation_config.pass_config.fuse_allreduce_rms = False # for now

vllm_config = VllmConfig(
parallel_config=parallel_config,
compilation_config=compilation_config,
scheduler_config=SchedulerConfig.default_factory(
max_num_batched_tokens=next_power_of_2(MAX_M)
),
parallel_config=parallel_config, compilation_config=compilation_config
)

test_configs = generate_valid_test_configs(
Expand Down Expand Up @@ -1721,7 +1718,7 @@ def test_moe_layer(
world_size,
_parallel_worker,
vllm_config,
None,
test_env,
test_configs,
verbosity,
)
1 change: 0 additions & 1 deletion tests/kernels/moe/utils.py
@@ -69,7 +69,6 @@ def make_dummy_moe_config(
in_dtype=in_dtype,
device="cuda",
routing_method=RoutingMethodType.TopK,
max_num_tokens=512,
)


12 changes: 0 additions & 12 deletions vllm/config/parallel.py
@@ -622,18 +622,6 @@ def use_sequence_parallel_moe(self) -> bool:
and self.data_parallel_size > 1
)

@property
def use_batched_dp_moe(self) -> bool:
return (
self.all2all_backend
in (
"deepep_low_latency",
"nixl_ep",
)
and self.enable_expert_parallel
and self.data_parallel_size > 1
)

@property
def node_rank_within_dp(self) -> int:
return self.node_rank % self.nnodes_within_dp
1 change: 0 additions & 1 deletion vllm/config/scheduler.py
@@ -40,7 +40,6 @@ class SchedulerConfig:
"""

DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_BATCHED_TOKENS_FOR_BATCHED_DP: ClassVar[int] = 256
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

runner_type: RunnerType = "generate"
23 changes: 7 additions & 16 deletions vllm/engine/arg_utils.py
@@ -1604,6 +1604,9 @@ def create_engine_config(

self._check_feature_supported()
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
self._set_default_max_num_seqs_and_batched_tokens_args(
usage_context, model_config
)
self._set_default_reasoning_config_args()
sliding_window: int | None = None
if not is_interleaved(model_config.hf_text_config):
@@ -1857,12 +1860,6 @@
target_parallel_config=parallel_config,
)

self._set_default_max_num_seqs_and_batched_tokens_args(
usage_context,
model_config,
parallel_config,
)

assert self.max_num_batched_tokens is not None, (
"max_num_batched_tokens must be set by this point"
)
@@ -2278,7 +2275,6 @@ def _set_default_max_num_seqs_and_batched_tokens_args(
self,
usage_context: UsageContext | None,
model_config: ModelConfig,
parallel_config: ParallelConfig,
):
world_size = self.pipeline_parallel_size * self.tensor_parallel_size
(
@@ -2290,15 +2286,10 @@
orig_max_num_seqs = self.max_num_seqs

if self.max_num_batched_tokens is None:
if parallel_config.use_batched_dp_moe:
self.max_num_batched_tokens = (
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS_FOR_BATCHED_DP
)
else:
self.max_num_batched_tokens = default_max_num_batched_tokens.get(
usage_context,
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)
self.max_num_batched_tokens = default_max_num_batched_tokens.get(
usage_context,
SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)

if self.max_num_seqs is None:
self.max_num_seqs = default_max_num_seqs.get(
11 changes: 11 additions & 0 deletions vllm/envs.py
@@ -146,6 +146,8 @@
VLLM_ENABLE_PREGRAD_PASSES: bool = False
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_MOE_DP_CHUNK_SIZE: int = 256
VLLM_ENABLE_MOE_DP_CHUNK: bool = True
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY: str = ""
@@ -1138,6 +1140,15 @@ def _get_or_set_default() -> str:
"VLLM_DP_MASTER_IP": lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
# Port of the master node in the data parallel setting
"VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
# When executing MoE models with data parallelism, expert parallelism, and
# batched all-to-all dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE caps
# the number of tokens that can be dispatched from a DP rank at a time.
# All DP ranks process their activations in chunks of VLLM_MOE_DP_CHUNK_SIZE
# tokens.
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
"VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool(
int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1"))
),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: (
os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1"
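For intuition, a minimal sketch of the lock-step chunking these variables control, under the assumption that every DP rank iterates over the largest rank's token count (moe_forward_chunked and the arithmetic stand-in for the experts are illustrative, not vLLM internals):

```python
import math
import torch

def moe_forward_chunked(
    hidden_states: torch.Tensor,
    max_tokens_across_dp: int,
    chunk_size: int = 256,  # VLLM_MOE_DP_CHUNK_SIZE
) -> torch.Tensor:
    # Every DP rank runs the same number of iterations, so batched
    # all-to-all dispatch/combine calls stay in lock-step even when this
    # rank holds fewer tokens than its peers.
    out = torch.empty_like(hidden_states)
    num_chunks = math.ceil(max_tokens_across_dp / chunk_size)
    for i in range(num_chunks):
        start = min(i * chunk_size, hidden_states.shape[0])
        end = min(start + chunk_size, hidden_states.shape[0])
        chunk = hidden_states[start:end]  # may be empty on this rank
        out[start:end] = chunk * 2.0      # stand-in for dispatch -> experts -> combine
    return out

# A rank with 222 tokens while the widest DP rank has 500 runs
# ceil(500 / 128) = 4 iterations, the last two with empty chunks.
x = torch.randn(222, 64)
y = moe_forward_chunked(x, max_tokens_across_dp=500, chunk_size=128)
assert torch.allclose(y, x * 2.0)
```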