Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,7 +1021,7 @@ def test_kv_connector_unable_to_allocate():
"""

# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 4
BLOCK_SIZE = 8
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these changes to block size intentional?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so. To satisfy the type validation, the block size has to be one of [1, 8, 16, 32, 64, 128]. cc: @hmellor

Copy link
Copy Markdown
Member

@DarkLight1337 DarkLight1337 Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @heheda12345 @tlrmchlsmth is it ok to change these tests to use the correct block sizes?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, we did not validate the block size, even though we did specify a list of valid block sizes. We should either update the tests or update the list of valid block sizes.

NUM_BLOCKS = 10
scheduler = create_scheduler(
enable_prefix_caching=True,
Expand Down Expand Up @@ -1103,7 +1103,7 @@ def test_kv_connector_handles_preemption():
"""

# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 2
BLOCK_SIZE = 8
# NOTE: there is 1 null block, so this is 6 blocks.
NUM_BLOCKS = 7
scheduler = create_scheduler(
Expand All @@ -1124,8 +1124,8 @@ def test_kv_connector_handles_preemption():
# Both can be scheduled at first, but the second request
# will be preempted and re-scheduled.
NUM_REQUESTS = 2
NUM_TOKENS = BLOCK_SIZE * 2 + 1
MAX_TOKENS = BLOCK_SIZE * 2
NUM_TOKENS = 3 * BLOCK_SIZE - 1
MAX_TOKENS = 4
requests = create_requests(
num_requests=NUM_REQUESTS,
num_tokens=NUM_TOKENS,
Expand Down
3 changes: 2 additions & 1 deletion tools/pre_commit/mypy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
FILES = [
"vllm/*.py",
"vllm/assets",
"vllm/engine",
"vllm/distributed",
"vllm/entrypoints",
"vllm/executor",
Expand All @@ -36,6 +37,7 @@
"vllm/transformers_utils",
"vllm/triton_utils",
"vllm/usage",
"vllm/utils",
]

# After fixing errors resulting from changing follow_imports
Expand All @@ -44,7 +46,6 @@
"tests",
"vllm/attention",
"vllm/compilation",
"vllm/engine",
"vllm/inputs",
"vllm/lora",
"vllm/model_executor",
Expand Down
12 changes: 10 additions & 2 deletions vllm/config/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import hashlib
from collections.abc import Callable
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal

from pydantic import Field, SkipValidation, field_validator
from pydantic import Field, field_validator
from pydantic.dataclasses import dataclass

from vllm.config.utils import config
Expand All @@ -30,7 +31,7 @@
class CacheConfig:
"""Configuration for the KV cache."""

block_size: SkipValidation[BlockSize] = None # type: ignore
block_size: BlockSize = None
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.

Expand Down Expand Up @@ -150,6 +151,13 @@ def metrics_info(self):
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}

@field_validator("block_size", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let a `None` value for `block_size` bypass validation.

    Args:
        value: The raw value supplied for the field.
        handler: The wrapped validator; it is only invoked when `value`
            is not `None`.

    Returns:
        `None` unchanged, otherwise whatever `handler(value)` returns.
    """
    return value if value is None else handler(value)

@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
Expand Down
15 changes: 11 additions & 4 deletions vllm/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import TYPE_CHECKING, Any, Literal, cast, get_args

import torch
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
from pydantic import ConfigDict, field_validator, model_validator
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE

Expand Down Expand Up @@ -120,7 +120,7 @@ class ModelConfig:

Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
tokenizer: str = None
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode = "auto"
Expand Down Expand Up @@ -171,7 +171,7 @@ class ModelConfig:
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore
max_model_len: int = None
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.

Expand All @@ -182,7 +182,7 @@ class ModelConfig:
- 25.6k -> 25,600"""
spec_target_max_model_len: int | None = None
"""Specify the maximum length for spec decoding draft models."""
quantization: SkipValidation[QuantizationMethods | None] = None
quantization: str | QuantizationMethods | None = None
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
Expand Down Expand Up @@ -302,6 +302,13 @@ class ModelConfig:
skip_mm_profiling: InitVar[bool | None] = None
video_pruning_rate: InitVar[float | None] = None

@field_validator("tokenizer", "max_model_len", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let a `None` value for the listed fields bypass validation.

    Args:
        value: The raw value supplied for the field.
        handler: The wrapped validator; it is only invoked when `value`
            is not `None`.

    Returns:
        `None` unchanged, otherwise whatever `handler(value)` returns.
    """
    if value is not None:
        return handler(value)
    return value

def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
Expand Down
20 changes: 9 additions & 11 deletions vllm/config/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@
ExpertPlacementStrategy = Literal["linear", "round_robin"]
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
DataParallelBackend = Literal["ray", "mp"]
All2allBackendType = Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
"flashinfer_all2allv",
]


@config
Expand Down Expand Up @@ -113,17 +121,7 @@ class ParallelConfig:
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
will have experts [1, 3]. This strategy can help improve load balancing
for grouped expert models with no redundant experts."""
all2all_backend: (
Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
"flashinfer_all2allv",
]
| None
) = None
all2all_backend: All2allBackendType | None = None
"""All2All backend for MoE expert parallel communication. If not set, uses
the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
- "naive": Naive all2all implementation using broadcasts
Expand Down
24 changes: 19 additions & 5 deletions vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import hashlib
from collections.abc import Callable
from dataclasses import InitVar, field
from typing import Any, Literal

from pydantic import SkipValidation, model_validator
from pydantic import field_validator, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self

Expand All @@ -31,19 +32,19 @@ class SchedulerConfig:
runner_type: RunnerType = "generate"
"""The runner type to launch for the model."""

max_num_batched_tokens: SkipValidation[int] = None # type: ignore
max_num_batched_tokens: int = None
"""Maximum number of tokens to be processed in a single iteration.

This config has no static default. If left unspecified by the user, it will
be set in `EngineArgs.create_engine_config` based on the usage context."""

max_num_seqs: SkipValidation[int] = None # type: ignore
max_num_seqs: int = None
"""Maximum number of sequences to be processed in a single iteration.

This config has no static default. If left unspecified by the user, it will
be set in `EngineArgs.create_engine_config` based on the usage context."""

max_model_len: SkipValidation[int] = None # type: ignore
max_model_len: int = None
"""Maximum length of a sequence (including prompt and generated text). This
is primarily set in `ModelConfig` and that value should be manually
duplicated here."""
Expand Down Expand Up @@ -79,7 +80,7 @@ class SchedulerConfig:
3. more than one value (e.g. 1 2 128) is provided, then the capture list
will follow the provided list."""

enable_chunked_prefill: SkipValidation[bool] = None # type: ignore
enable_chunked_prefill: bool = None
"""If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens."""

Expand Down Expand Up @@ -169,6 +170,19 @@ def compute_hash(self) -> str:
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str

@field_validator(
    "max_num_batched_tokens",
    "max_num_seqs",
    "max_model_len",
    "enable_chunked_prefill",
    mode="wrap",
)
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let a `None` value for the listed fields bypass validation.

    Args:
        value: The raw value supplied for the field.
        handler: The wrapped validator; it is only invoked when `value`
            is not `None`.

    Returns:
        `None` unchanged, otherwise whatever `handler(value)` returns.
    """
    # An unset field stays `None`; everything else is validated as usual.
    return value if value is None else handler(value)

def __post_init__(self, is_encoder_decoder: bool) -> None:
if self.max_model_len is None:
self.max_model_len = 8192
Expand Down
2 changes: 1 addition & 1 deletion vllm/config/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT:
return cls


def get_field(cls: ConfigType, name: str) -> Field:
def get_field(cls: ConfigType, name: str) -> Any:
"""Get the default factory field of a dataclass by name. Used for getting
default factory fields in `EngineArgs`."""
if not is_dataclass(cls):
Expand Down
6 changes: 4 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class VllmConfig:

# TODO: use default_factory once default constructing ModelConfig doesn't
# try to download a model
model_config: ModelConfig = Field(default=None)
model_config: ModelConfig = None
"""Model configuration."""
cache_config: CacheConfig = Field(default_factory=CacheConfig)
"""Cache configuration."""
Expand All @@ -77,7 +77,9 @@ class VllmConfig:
default_factory=StructuredOutputsConfig
)
"""Structured outputs configuration."""
observability_config: ObservabilityConfig | None = None
observability_config: ObservabilityConfig = Field(
default_factory=ObservabilityConfig
)
"""Observability configuration."""
quant_config: QuantizationConfig | None = None
"""Quantization configuration."""
Expand Down
Loading