Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions vllm/config/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,22 +109,24 @@ def _validate_eplb_config(self) -> Self:
class ParallelConfig:
"""Configuration for the distributed execution."""

pipeline_parallel_size: int = 1
pipeline_parallel_size: int = Field(default=1, ge=1)
"""Number of pipeline parallel groups."""
tensor_parallel_size: int = 1
tensor_parallel_size: int = Field(default=1, ge=1)
"""Number of tensor parallel groups."""
prefill_context_parallel_size: int = 1
prefill_context_parallel_size: int = Field(default=1, ge=1)
"""Number of prefill context parallel groups."""
data_parallel_size: int = 1
data_parallel_size: int = Field(default=1, ge=1)
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
data_parallel_size_local: int = 1
"""Number of local data parallel groups."""
data_parallel_rank: int = 0
"""Rank of the data parallel group."""
data_parallel_size_local: int = Field(default=1, ge=0)
"""Number of local data parallel groups. A value of 0 is a sentinel used by
the engine-args layer to signal that data parallelism was specified
externally (see `ParallelConfig.__post_init__`)."""
data_parallel_rank: int = Field(default=0, ge=0)
"""Rank of the data parallel group. The runtime check at
``__post_init__`` further bounds this by ``data_parallel_size``."""
data_parallel_rank_local: int | None = None
"""Local rank of the data parallel group,
set only in SPMD mode."""
"""Local rank of the data parallel group, set only in SPMD mode."""
data_parallel_master_ip: str = "127.0.0.1"
"""IP of the data parallel master."""
data_parallel_rpc_port: int = 29550
Expand Down Expand Up @@ -184,7 +186,7 @@ class ParallelConfig:
- "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
- "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""

max_parallel_loading_workers: int | None = None
max_parallel_loading_workers: int | None = Field(default=None, ge=1)
"""Maximum number of parallel loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models."""
Expand All @@ -197,15 +199,15 @@ class ParallelConfig:

enable_dbo: bool = False
"""Enable dual batch overlap for the model executor."""
ubatch_size: int = 0
ubatch_size: int = Field(default=0, ge=0)
"""Number of ubatch size."""

dbo_decode_token_threshold: int = 32
dbo_decode_token_threshold: int = Field(default=32, ge=0)
"""The threshold for dual batch overlap for batches only containing decodes.
If the number of tokens in the request is greater than this threshold,
microbatching will be used. Otherwise, the request will be processed in a
single batch."""
dbo_prefill_token_threshold: int = 512 # TODO(lucas): tune
dbo_prefill_token_threshold: int = Field(default=512, ge=0) # TODO(lucas): tune
"""The threshold for dual batch overlap for batches that contain one or more
prefills. If the number of tokens in the request is greater than this
threshold, microbatching will be used. Otherwise, the request will be
Expand Down Expand Up @@ -260,10 +262,10 @@ class is dynamically inherited by the worker class. This is used to inject
master_port: int = 29501
"""distributed master port for multi-node distributed
inference when distributed_executor_backend is mp."""
node_rank: int = 0
"""distributed node rank for multi-node distributed
node_rank: int = Field(default=0, ge=0)
"""distributed node rank for multi-node distributed
inference when distributed_executor_backend is mp."""
nnodes: int = 1
nnodes: int = Field(default=1, ge=1)
"""num of nodes for multi-node distributed
inference when distributed_executor_backend is mp."""
numa_bind: bool = False
Expand Down Expand Up @@ -318,7 +320,7 @@ class is dynamically inherited by the worker class. This is used to inject
"""Port of the coordination TCPStore. Can be set by the API server; workers
connect as clients to exchange self-picked group ports at runtime."""

decode_context_parallel_size: int = 1
decode_context_parallel_size: int = Field(default=1, ge=1)
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
needs to be divisible by dcp_size."""
Expand Down
Loading