From 1b80912528d2f26dba6ec173fb0d8cbd93745676 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 9 Oct 2025 22:17:54 +0000 Subject: [PATCH 01/21] fix mypy Signed-off-by: wwl2755 --- tools/pre_commit/mypy.py | 1 + vllm/engine/arg_utils.py | 4 ++-- vllm/multimodal/parse.py | 8 +++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 22ee08535bdd..527279860e1e 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -27,6 +27,7 @@ FILES = [ "vllm/*.py", "vllm/assets", + "vllm/engine", "vllm/entrypoints", "vllm/inputs", "vllm/logging_utils", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7e66d8dba8ac..906418aea25e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1243,7 +1243,7 @@ def create_engine_config( (self.model, self.tokenizer, self.speculative_config) = ( maybe_override_with_speculators( model=self.model, - tokenizer=self.tokenizer, + tokenizer=self.tokenizer if self.tokenizer is not None else self.model, revision=self.revision, trust_remote_code=self.trust_remote_code, vllm_speculative_config=self.speculative_config, @@ -1685,7 +1685,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: return True def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig + self, usage_context: Optional[UsageContext], model_config: ModelConfig ) -> None: """Set Default Arguments for V1 Engine.""" diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 8fdc5cf721d0..c53420789b5c 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -13,6 +13,7 @@ Optional, TypeVar, Union, + cast, ) import numpy as np @@ -366,7 +367,8 @@ def _is_embeddings( if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): - return data[0].ndim == 2 + tensors = cast(list[torch.Tensor], data) + return tensors[0].ndim == 2 return False @@ -434,7 +436,7 @@ def _parse_audio_data( elif isinstance(data, (np.ndarray, torch.Tensor)): data_items = [elem for elem in data] else: - data_items = data + data_items = data # type: ignore[assignment] new_audios = list[np.ndarray]() for data_item in data_items: @@ -498,7 +500,7 @@ def _parse_video_data( elif isinstance(data, tuple) and len(data) == 2: data_items = [data] else: - data_items = data + data_items = data # type: ignore[assignment] new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]() metadata_lst: list[Optional[dict[str, Any]]] = [] From 5799b37113aaf68c51fc83d33fc79108814cb2e9 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 9 Oct 2025 22:23:11 +0000 Subject: [PATCH 02/21] fix Signed-off-by: wwl2755 --- tools/pre_commit/mypy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 527279860e1e..2147111d9443 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -45,7 +45,6 @@ "vllm/attention", "vllm/compilation", "vllm/distributed", - "vllm/engine", "vllm/executor", "vllm/inputs", "vllm/lora", From 738f668d704448b1f87e1e576ea24425317ad105 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 00:58:57 +0000 Subject: [PATCH 03/21] mypy for vllm/utils Signed-off-by: wwl2755 --- tools/pre_commit/mypy.py | 1 + vllm/engine/arg_utils.py | 4 +++- vllm/multimodal/parse.py | 8 ++++---- vllm/utils/__init__.py | 21 +++++++++++++-------- vllm/utils/jsontree.py | 12 ++++++------ 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 2147111d9443..ddf7b652e04e 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -36,6 +36,7 @@ "vllm/transformers_utils", "vllm/triton_utils", "vllm/usage", + "vllm/utils", ] # After fixing errors resulting from changing follow_imports diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 906418aea25e..d50e42084e8f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1240,10 +1240,12 @@ def create_engine_config( self.model = model_config.model self.tokenizer = model_config.tokenizer + # After ModelConfig init, tokenizer must be resolved (never None). + assert self.tokenizer is not None (self.model, self.tokenizer, self.speculative_config) = ( maybe_override_with_speculators( model=self.model, - tokenizer=self.tokenizer if self.tokenizer is not None else self.model, + tokenizer=self.tokenizer, revision=self.revision, trust_remote_code=self.trust_remote_code, vllm_speculative_config=self.speculative_config, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index c53420789b5c..f8b67cdc9546 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -13,7 +13,6 @@ Optional, TypeVar, Union, - cast, ) import numpy as np @@ -367,8 +366,7 @@ def _is_embeddings( if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): - tensors = cast(list[torch.Tensor], data) - return tensors[0].ndim == 2 + return data[0].ndim == 2 return False @@ -426,6 +424,8 @@ def _parse_audio_data( if self._is_embeddings(data): return AudioEmbeddingItems(data) + # Normalize into a list of audio items + data_items: list[AudioItem] if ( is_list_of(data, float) or isinstance(data, (np.ndarray, torch.Tensor)) @@ -436,7 +436,7 @@ def _parse_audio_data( elif isinstance(data, (np.ndarray, torch.Tensor)): data_items = [elem for elem in data] else: - data_items = data # type: ignore[assignment] + data_items = data new_audios = list[np.ndarray]() for data_item in data_items: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 22c2a4b5362c..314f9ee6c08f 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -81,7 +81,7 @@ import setproctitle import torch import torch.types -import yaml +import yaml # type: ignore[import-untyped] import zmq import zmq.asyncio from packaging import version @@ -486,7 +486,10 @@ async def merge_async_iterators( loop = asyncio.get_running_loop() - awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)} + awaits: dict[asyncio.Task[T], tuple[int, AsyncGenerator[T, None]]] = { + loop.create_task(anext(it)): (i, it) # type: ignore[arg-type] + for i, it in enumerate(iterators) + } try: while awaits: done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED) @@ -495,7 +498,7 @@ async def merge_async_iterators( try: item = await d i, it = pair - awaits[loop.create_task(anext(it))] = pair + awaits[loop.create_task(anext(it))] = pair # type: ignore[arg-type] yield i, item except StopAsyncIteration: pass @@ -1163,11 +1166,13 @@ def find_nccl_include_paths() -> list[str] | None: import importlib.util spec = importlib.util.find_spec("nvidia.nccl") - if spec and getattr(spec, "submodule_search_locations", None): - for loc in spec.submodule_search_locations: - inc_dir = os.path.join(loc, "include") - if os.path.exists(os.path.join(inc_dir, "nccl.h")): - paths.append(inc_dir) + if spec: + locations = getattr(spec, "submodule_search_locations", None) + if locations: + for loc in locations: + inc_dir = os.path.join(loc, "include") + if os.path.exists(os.path.join(inc_dir, "nccl.h")): + paths.append(inc_dir) except Exception: pass diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py index dcdc6ccb4c63..045c547dd7be 100644 --- a/vllm/utils/jsontree.py +++ b/vllm/utils/jsontree.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from functools import reduce -from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload +from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union, overload if TYPE_CHECKING: import torch @@ -94,7 +94,7 @@ def json_map_leaves( for k, v in value.items() } elif isinstance(value, list): - return [json_map_leaves(func, v) for v in value] + return [json_map_leaves(func, v) for v in value] # type: ignore[return-value] elif isinstance(value, tuple): return tuple(json_map_leaves(func, v) for v in value) else: @@ -143,11 +143,11 @@ def json_reduce_leaves( def json_reduce_leaves( - func: Callable[..., Union[_T, _U]], - value: _JSONTree[_T], - initial: _U = cast(_U, ...), # noqa: B008 + func: Callable[..., Any], + value: _JSONTree[Any], + initial: Any = ..., # noqa: B008 /, -) -> Union[_T, _U]: +) -> Any: """ Apply a function of two arguments cumulatively to each leaf in a nested JSON structure, from left to right, so as to reduce the From ee3fc70baef2cbeed74deac1247af5e747ef2925 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 01:01:31 +0000 Subject: [PATCH 04/21] minor Signed-off-by: wwl2755 --- vllm/multimodal/parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index f8b67cdc9546..5316727ac8e2 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -424,7 +424,6 @@ def _parse_audio_data( if self._is_embeddings(data): return AudioEmbeddingItems(data) - # Normalize into a list of audio items data_items: list[AudioItem] if ( is_list_of(data, float) @@ -489,6 +488,7 @@ def _parse_video_data( if self._is_embeddings(data): return VideoEmbeddingItems(data) + data_items: list[VideoItem] if ( is_list_of(data, PILImage.Image) or isinstance(data, (np.ndarray, torch.Tensor)) @@ -500,7 +500,7 @@ def _parse_video_data( elif isinstance(data, tuple) and len(data) == 2: data_items = [data] else: - data_items = data # type: ignore[assignment] + data_items = data new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]() metadata_lst: list[Optional[dict[str, Any]]] = [] From ceb1f743ffb2bd3e4d3f04862f91361ec6b69304 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 02:35:38 +0000 Subject: [PATCH 05/21] fix Signed-off-by: wwl2755 --- vllm/engine/arg_utils.py | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d50e42084e8f..eafaf9bd61eb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -54,6 +54,7 @@ ) from vllm.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo from vllm.config.device import Device +from vllm.config.lora import LoRAExtraVocabSize, MaxLoRARanks from vllm.config.model import ( ConvertOption, HfOverrides, @@ -65,7 +66,11 @@ ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules -from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy +from vllm.config.parallel import ( + DataParallelBackend, + DistributedExecutorBackend, + ExpertPlacementStrategy, +) from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field from vllm.logger import init_logger @@ -366,7 +371,7 @@ class EngineArgs: data_parallel_address: Optional[str] = None data_parallel_rpc_port: Optional[int] = None data_parallel_hybrid_lb: bool = False - data_parallel_backend: str = ParallelConfig.data_parallel_backend + data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel enable_dbo: bool = ParallelConfig.enable_dbo dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold @@ -436,17 +441,17 @@ class EngineArgs: mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode io_processor_plugin: Optional[str] = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling - video_pruning_rate: float = MultiModalConfig.video_pruning_rate + video_pruning_rate: Optional[float] = MultiModalConfig.video_pruning_rate # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled max_loras: int = LoRAConfig.max_loras - max_lora_rank: int = LoRAConfig.max_lora_rank + max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank default_mm_loras: Optional[dict[str, str]] = LoRAConfig.default_mm_loras fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype - lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size + lora_extra_vocab_size: LoRAExtraVocabSize = LoRAConfig.lora_extra_vocab_size ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: Optional[int] = CacheConfig.num_gpu_blocks_override @@ -502,7 +507,7 @@ class EngineArgs: ModelConfig, "override_generation_config" ) model_impl: str = ModelConfig.model_impl - override_attention_dtype: str = ModelConfig.override_attention_dtype + override_attention_dtype: Optional[str] = ModelConfig.override_attention_dtype calculate_kv_scales: bool = CacheConfig.calculate_kv_scales mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype @@ -511,7 +516,7 @@ class EngineArgs: additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config") use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load - pt_load_map_location: str = LoadConfig.pt_load_map_location + pt_load_map_location: Union[str, dict[str, str]] = LoadConfig.pt_load_map_location # DEPRECATED enable_multimodal_encoder_data_parallel: bool = False @@ -1095,13 +1100,12 @@ def create_model_config(self) -> ModelConfig: self.mm_encoder_tp_mode = "data" - return ModelConfig( + model_config_kwargs: dict[str, Any] = dict( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, task=self.task, - tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, @@ -1115,7 +1119,6 @@ def create_model_config(self) -> ModelConfig: hf_token=self.hf_token, hf_overrides=self.hf_overrides, tokenizer_revision=self.tokenizer_revision, - max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, max_logprobs=self.max_logprobs, @@ -1147,6 +1150,11 @@ def create_model_config(self) -> ModelConfig: video_pruning_rate=self.video_pruning_rate, io_processor_plugin=self.io_processor_plugin, ) + if self.tokenizer is not None: + model_config_kwargs["tokenizer"] = self.tokenizer + if self.max_model_len is not None: + model_config_kwargs["max_model_len"] = self.max_model_len + return ModelConfig(**model_config_kwargs) def validate_tensorizer_args(self): from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -1488,10 +1496,8 @@ def create_engine_config( if speculative_config is not None: num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_config = SchedulerConfig( + scheduler_kwargs: dict[str, Any] = dict( runner_type=model_config.runner_type, - max_num_batched_tokens=self.max_num_batched_tokens, - max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, cuda_graph_sizes=self.cuda_graph_sizes, num_lookahead_slots=num_lookahead_slots, @@ -1508,6 +1514,11 @@ def create_engine_config( disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager, async_scheduling=self.async_scheduling, ) + if self.max_num_batched_tokens is not None: + scheduler_kwargs["max_num_batched_tokens"] = self.max_num_batched_tokens + if self.max_num_seqs is not None: + scheduler_kwargs["max_num_seqs"] = self.max_num_seqs + scheduler_config = SchedulerConfig(**scheduler_kwargs) if not model_config.is_multimodal_model and self.default_mm_loras: raise ValueError( @@ -1545,17 +1556,15 @@ def create_engine_config( # Forward the deprecated CLI args to the StructuredOutputsConfig so_config = self.structured_outputs_config if self.guided_decoding_backend is not None: - so_config.guided_decoding_backend = self.guided_decoding_backend + so_config.backend = self.guided_decoding_backend if self.guided_decoding_disable_fallback is not None: - so_config.guided_decoding_disable_fallback = ( - self.guided_decoding_disable_fallback - ) + so_config.disable_fallback = self.guided_decoding_disable_fallback if self.guided_decoding_disable_any_whitespace is not None: - so_config.guided_decoding_disable_any_whitespace = ( + so_config.disable_any_whitespace = ( self.guided_decoding_disable_any_whitespace ) if self.guided_decoding_disable_additional_properties is not None: - so_config.guided_decoding_disable_additional_properties = ( + so_config.disable_additional_properties = ( self.guided_decoding_disable_additional_properties ) @@ -1599,7 +1608,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback( - feature_name=model_config.architectures, recommend_to_remove=False + feature_name=str(model_config.architectures), recommend_to_remove=False ) return False @@ -1715,6 +1724,7 @@ def _set_default_args( else: self.enable_prefix_caching = True else: + assert model_config.pooler_config is not None pooling_type = model_config.pooler_config.pooling_type is_causal = getattr(model_config.hf_config, "is_causal", True) incremental_prefill_supported = ( From 6d8e0ce2f483bc55bf11cb4c95e0c4120edf0c69 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 03:24:08 +0000 Subject: [PATCH 06/21] fix Signed-off-by: wwl2755 --- vllm/config/utils.py | 2 +- vllm/engine/arg_utils.py | 22 ++++++++++++++-------- vllm/engine/metrics.py | 16 +++++++++++++++- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 5e7e7580c5a9..3452315b3e2e 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT: return cls -def get_field(cls: ConfigType, name: str) -> Field: +def get_field(cls: ConfigType, name: str) -> Any: """Get the default factory field of a dataclass by name. Used for getting default factory fields in `EngineArgs`.""" if not is_dataclass(cls): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index eafaf9bd61eb..f8d0d1255bf8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -72,6 +72,7 @@ ExpertPlacementStrategy, ) from vllm.config.scheduler import SchedulerPolicy +from vllm.config.structured_outputs import StructuredOutputsBackend from vllm.config.utils import get_field from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform @@ -217,11 +218,12 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: default = field.default # Handle pydantic.Field defaults if isinstance(default, FieldInfo): - default = ( - default.default - if default.default_factory is None - else default.default_factory() - ) + if default.default_factory is not None and callable( + default.default_factory + ): + default = cast(Callable[[], Any], default.default_factory)() + else: + default = default.default elif field.default_factory is not MISSING: default = field.default_factory() @@ -1311,8 +1313,7 @@ def create_engine_config( f"dcp_size={self.decode_context_parallel_size}." ) - cache_config = CacheConfig( - block_size=self.block_size, + cache_kwargs: dict[str, Any] = dict( gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, @@ -1328,6 +1329,9 @@ def create_engine_config( mamba_cache_dtype=self.mamba_cache_dtype, mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype, ) + if self.block_size is not None: + cache_kwargs["block_size"] = self.block_size + cache_config = CacheConfig(**cache_kwargs) ray_runtime_env = None if is_ray_initialized(): @@ -1556,7 +1560,9 @@ def create_engine_config( # Forward the deprecated CLI args to the StructuredOutputsConfig so_config = self.structured_outputs_config if self.guided_decoding_backend is not None: - so_config.backend = self.guided_decoding_backend + so_config.backend = cast( + StructuredOutputsBackend, self.guided_decoding_backend + ) if self.guided_decoding_disable_fallback is not None: so_config.disable_fallback = self.guided_decoding_disable_fallback if self.guided_decoding_disable_any_whitespace is not None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 45b798ed96cb..bc4e1f5176f9 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -51,7 +51,11 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig): # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future - self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics + self.show_hidden_metrics = ( + vllm_config.observability_config.show_hidden_metrics + if vllm_config.observability_config is not None + else False + ) # System stats # Scheduler State @@ -451,6 +455,11 @@ class LoggingStatLogger(StatLoggerBase): def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: super().__init__(local_interval, vllm_config) + # Explicitly annotate attributes for mypy when follow-imports=skip + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] + self.last_local_log: float = time.time() + self.local_interval: float = local_interval self.last_prompt_throughput: Optional[float] = None self.last_generation_throughput: Optional[float] = None @@ -533,6 +542,11 @@ def __init__( self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig ) -> None: super().__init__(local_interval, vllm_config) + # Explicitly annotate attributes for mypy when follow-imports=skip + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] + self.last_local_log: float = time.time() + self.local_interval: float = local_interval # Prometheus metrics self.labels = labels self.metrics = self._metrics_cls( From a21884d112b399ac82ac8f9dad29590f8896186d Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 03:42:25 +0000 Subject: [PATCH 07/21] fix Signed-off-by: wwl2755 --- vllm/engine/metrics.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index bc4e1f5176f9..d1e11755d749 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -51,11 +51,8 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig): # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future - self.show_hidden_metrics = ( - vllm_config.observability_config.show_hidden_metrics - if vllm_config.observability_config is not None - else False - ) + assert vllm_config.observability_config is not None + self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics # System stats # Scheduler State @@ -455,11 +452,11 @@ class LoggingStatLogger(StatLoggerBase): def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: super().__init__(local_interval, vllm_config) - # Explicitly annotate attributes for mypy when follow-imports=skip - self.num_prompt_tokens: list[int] = [] - self.num_generation_tokens: list[int] = [] - self.last_local_log: float = time.time() - self.local_interval: float = local_interval + + self.num_prompt_tokens: list[int] + self.num_generation_tokens: list[int] + self.last_local_log: float + self.local_interval: float self.last_prompt_throughput: Optional[float] = None self.last_generation_throughput: Optional[float] = None @@ -542,11 +539,11 @@ def __init__( self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig ) -> None: super().__init__(local_interval, vllm_config) - # Explicitly annotate attributes for mypy when follow-imports=skip - self.num_prompt_tokens: list[int] = [] - self.num_generation_tokens: list[int] = [] - self.last_local_log: float = time.time() - self.local_interval: float = local_interval + + self.num_prompt_tokens: list[int] + self.num_generation_tokens: list[int] + self.last_local_log: float + self.local_interval: float # Prometheus metrics self.labels = labels self.metrics = self._metrics_cls( From 335d034c08d9b1eab31187a2196add9f5931c7a3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Oct 2025 11:46:18 +0100 Subject: [PATCH 08/21] Update `get_field` to use `pydantic.Field` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 3452315b3e2e..6fcc29d3d253 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -6,12 +6,12 @@ import inspect import textwrap from collections.abc import Iterable -from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from dataclasses import MISSING, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re -from pydantic.fields import FieldInfo +from pydantic.fields import Field, FieldInfo from typing_extensions import runtime_checkable if TYPE_CHECKING: @@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT: return cls -def get_field(cls: ConfigType, name: str) -> Any: +def get_field(cls: ConfigType, name: str) -> Field: """Get the default factory field of a dataclass by name. Used for getting default factory fields in `EngineArgs`.""" if not is_dataclass(cls): @@ -47,17 +47,17 @@ def get_field(cls: ConfigType, name: str) -> Any: cls_fields = {f.name: f for f in fields(cls)} if name not in cls_fields: raise ValueError(f"Field '{name}' not found in {cls.__name__}.") - named_field: Field = cls_fields[name] + named_field = cls_fields[name] if (default_factory := named_field.default_factory) is not MISSING: - return field(default_factory=default_factory) + return Field(default_factory=default_factory) if (default := named_field.default) is not MISSING: if isinstance(default, FieldInfo): # Handle pydantic.Field defaults if default.default_factory is not None: - return field(default_factory=default.default_factory) + return Field(default_factory=default.default_factory) else: default = default.default - return field(default=default) + return Field(default=default) raise ValueError( f"{cls.__name__}.{name} must have a default value or default factory." From 4c3a72ba3a68208338f6870b16a7eaf747d4b319 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Oct 2025 13:54:30 +0100 Subject: [PATCH 09/21] The rest Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/cache.py | 4 +-- vllm/config/model.py | 6 ++--- vllm/config/scheduler.py | 8 +++--- vllm/config/vllm.py | 4 ++- vllm/engine/arg_utils.py | 55 +++++++++++++++++----------------------- vllm/engine/metrics.py | 21 ++++----------- vllm/utils/__init__.py | 21 ++++++--------- vllm/utils/jsontree.py | 14 +++++----- 8 files changed, 55 insertions(+), 78 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index fd47d5c8f976..a158ffa06126 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -5,7 +5,7 @@ from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional -from pydantic import Field, SkipValidation, field_validator +from pydantic import Field, field_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -30,7 +30,7 @@ class CacheConfig: """Configuration for the KV cache.""" - block_size: SkipValidation[BlockSize] = None # type: ignore + block_size: BlockSize = Field(default=None) """Size of a contiguous cache block in number of tokens. On CUDA devices, only block sizes up to 32 are supported. diff --git a/vllm/config/model.py b/vllm/config/model.py index d0c027e47675..0436eab8dd36 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -18,7 +18,7 @@ ) import torch -from pydantic import ConfigDict, SkipValidation, field_validator, model_validator +from pydantic import ConfigDict, Field, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE @@ -127,7 +127,7 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ - tokenizer: SkipValidation[str] = None # type: ignore + tokenizer: str = Field(default=None) """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" @@ -178,7 +178,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: SkipValidation[int] = None # type: ignore + max_model_len: int = Field(default=None) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 396258aac287..dd0b966caa77 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -5,7 +5,7 @@ from dataclasses import InitVar, field from typing import Any, Literal, Union -from pydantic import SkipValidation, model_validator +from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self @@ -31,19 +31,19 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: SkipValidation[int] = None # type: ignore + max_num_batched_tokens: int = Field(default=None) """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: SkipValidation[int] = None # type: ignore + max_num_seqs: int = Field(default=None) """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: SkipValidation[int] = None # type: ignore + max_model_len: int = Field(default=None) """Maximum length of a sequence (including prompt and generated text). This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 833581035a31..791f44a4f268 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -76,7 +76,9 @@ class VllmConfig: default_factory=StructuredOutputsConfig ) """Structured outputs configuration.""" - observability_config: Optional[ObservabilityConfig] = None + observability_config: ObservabilityConfig = field( + default_factory=ObservabilityConfig + ) """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None """Quantization configuration.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8d0d1255bf8..3c8be6dcf84f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -218,12 +218,11 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: default = field.default # Handle pydantic.Field defaults if isinstance(default, FieldInfo): - if default.default_factory is not None and callable( - default.default_factory - ): - default = cast(Callable[[], Any], default.default_factory)() - else: + if default.default_factory is None: default = default.default + else: + default_factory = cast(Callable[[], Any], default.default_factory) + default = default_factory() elif field.default_factory is not MISSING: default = field.default_factory() @@ -354,7 +353,7 @@ class EngineArgs: dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: Optional[int] = ModelConfig.seed - max_model_len: Optional[int] = ModelConfig.max_model_len + max_model_len: int = ModelConfig.max_model_len cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes") # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without @@ -395,7 +394,7 @@ class EngineArgs: max_parallel_loading_workers: Optional[int] = ( ParallelConfig.max_parallel_loading_workers ) - block_size: Optional[BlockSize] = CacheConfig.block_size + block_size: BlockSize = CacheConfig.block_size enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching prefix_caching_hash_algo: PrefixCachingHashAlgo = ( CacheConfig.prefix_caching_hash_algo @@ -406,11 +405,11 @@ class EngineArgs: cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes - max_num_batched_tokens: Optional[int] = SchedulerConfig.max_num_batched_tokens + max_num_batched_tokens: int = SchedulerConfig.max_num_batched_tokens max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold - max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs + max_num_seqs: int = SchedulerConfig.max_num_seqs max_logprobs: int = ModelConfig.max_logprobs logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False @@ -473,7 +472,7 @@ class EngineArgs: ) reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields - guided_decoding_backend: Optional[str] = None + guided_decoding_backend: Optional[StructuredOutputsBackend] = None guided_decoding_disable_fallback: Optional[bool] = None guided_decoding_disable_any_whitespace: Optional[bool] = None guided_decoding_disable_additional_properties: Optional[bool] = None @@ -518,7 +517,7 @@ class EngineArgs: additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config") use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load - pt_load_map_location: Union[str, dict[str, str]] = LoadConfig.pt_load_map_location + pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location # DEPRECATED enable_multimodal_encoder_data_parallel: bool = False @@ -1102,7 +1101,11 @@ def create_model_config(self) -> ModelConfig: self.mm_encoder_tp_mode = "data" - model_config_kwargs: dict[str, Any] = dict( + kwargs = dict[str, Any]() + if self.tokenizer is not None: + kwargs["tokenizer"] = self.tokenizer + + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, @@ -1121,6 +1124,7 @@ def create_model_config(self) -> ModelConfig: hf_token=self.hf_token, hf_overrides=self.hf_overrides, tokenizer_revision=self.tokenizer_revision, + max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, max_logprobs=self.max_logprobs, @@ -1151,12 +1155,8 @@ def create_model_config(self) -> ModelConfig: logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, io_processor_plugin=self.io_processor_plugin, + **kwargs, ) - if self.tokenizer is not None: - model_config_kwargs["tokenizer"] = self.tokenizer - if self.max_model_len is not None: - model_config_kwargs["max_model_len"] = self.max_model_len - return ModelConfig(**model_config_kwargs) def validate_tensorizer_args(self): from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -1250,8 +1250,6 @@ def create_engine_config( self.model = model_config.model self.tokenizer = model_config.tokenizer - # After ModelConfig init, tokenizer must be resolved (never None). - assert self.tokenizer is not None (self.model, self.tokenizer, self.speculative_config) = ( maybe_override_with_speculators( model=self.model, @@ -1313,7 +1311,8 @@ def create_engine_config( f"dcp_size={self.decode_context_parallel_size}." ) - cache_kwargs: dict[str, Any] = dict( + cache_config = CacheConfig( + block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, @@ -1329,9 +1328,6 @@ def create_engine_config( mamba_cache_dtype=self.mamba_cache_dtype, mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype, ) - if self.block_size is not None: - cache_kwargs["block_size"] = self.block_size - cache_config = CacheConfig(**cache_kwargs) ray_runtime_env = None if is_ray_initialized(): @@ -1500,8 +1496,10 @@ def create_engine_config( if speculative_config is not None: num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_kwargs: dict[str, Any] = dict( + scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, + max_num_batched_tokens=self.max_num_batched_tokens, + max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, cuda_graph_sizes=self.cuda_graph_sizes, num_lookahead_slots=num_lookahead_slots, @@ -1518,11 +1516,6 @@ def create_engine_config( disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager, async_scheduling=self.async_scheduling, ) - if self.max_num_batched_tokens is not None: - scheduler_kwargs["max_num_batched_tokens"] = self.max_num_batched_tokens - if self.max_num_seqs is not None: - scheduler_kwargs["max_num_seqs"] = self.max_num_seqs - scheduler_config = SchedulerConfig(**scheduler_kwargs) if not model_config.is_multimodal_model and self.default_mm_loras: raise ValueError( @@ -1560,9 +1553,7 @@ def create_engine_config( # Forward the deprecated CLI args to the StructuredOutputsConfig so_config = self.structured_outputs_config if self.guided_decoding_backend is not None: - so_config.backend = cast( - StructuredOutputsBackend, self.guided_decoding_backend - ) + so_config.backend = self.guided_decoding_backend if self.guided_decoding_disable_fallback is not None: so_config.disable_fallback = self.guided_decoding_disable_fallback if self.guided_decoding_disable_any_whitespace is not None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d1e11755d749..05fdde3d9542 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -51,7 +51,6 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig): # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future - assert vllm_config.observability_config is not None self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics # System stats @@ -452,11 +451,6 @@ class LoggingStatLogger(StatLoggerBase): def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: super().__init__(local_interval, vllm_config) - - self.num_prompt_tokens: list[int] - self.num_generation_tokens: list[int] - self.last_local_log: float - self.local_interval: float self.last_prompt_throughput: Optional[float] = None self.last_generation_throughput: Optional[float] = None @@ -519,8 +513,8 @@ def log(self, stats: Stats) -> None: def _reset(self, stats, prompt_throughput, generation_throughput) -> None: # Reset tracked stats for next interval. - self.num_prompt_tokens = [] - self.num_generation_tokens = [] + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] self.last_local_log = stats.now self.last_prompt_throughput = prompt_throughput self.last_generation_throughput = generation_throughput @@ -539,11 +533,6 @@ def __init__( self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig ) -> None: super().__init__(local_interval, vllm_config) - - self.num_prompt_tokens: list[int] - self.num_generation_tokens: list[int] - self.last_local_log: float - self.local_interval: float # Prometheus metrics self.labels = labels self.metrics = self._metrics_cls( @@ -671,9 +660,9 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): # Reset tracked stats for next interval. - self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] + self.last_local_log: float = stats.now def info(self, type: str, obj: SupportsMetricsInfo) -> None: # Info type metrics are syntactic sugar for a gauge permanently set to 1 diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 314f9ee6c08f..1691572de0ac 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -81,7 +81,7 @@ import setproctitle import torch import torch.types -import yaml # type: ignore[import-untyped] +import yaml import zmq import zmq.asyncio from packaging import version @@ -484,11 +484,8 @@ async def merge_async_iterators( yield 0, item return - loop = asyncio.get_running_loop() - awaits: dict[asyncio.Task[T], tuple[int, AsyncGenerator[T, None]]] = { - loop.create_task(anext(it)): (i, it) # type: ignore[arg-type] - for i, it in enumerate(iterators) + asyncio.ensure_future(anext(it)): (i, it) for i, it in enumerate(iterators) } try: while awaits: @@ -498,7 +495,7 @@ async def merge_async_iterators( try: item = await d i, it = pair - awaits[loop.create_task(anext(it))] = pair # type: ignore[arg-type] + awaits[asyncio.ensure_future(anext(it))] = pair yield i, item except StopAsyncIteration: pass @@ -1166,13 +1163,11 @@ def find_nccl_include_paths() -> list[str] | None: import importlib.util spec = importlib.util.find_spec("nvidia.nccl") - if spec: - locations = getattr(spec, "submodule_search_locations", None) - if locations: - for loc in locations: - inc_dir = os.path.join(loc, "include") - if os.path.exists(os.path.join(inc_dir, "nccl.h")): - paths.append(inc_dir) + if spec is not None and spec.submodule_search_locations is not None: + for loc in spec.submodule_search_locations: + inc_dir = os.path.join(loc, "include") + if os.path.exists(os.path.join(inc_dir, "nccl.h")): + paths.append(inc_dir) except Exception: pass diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py index 045c547dd7be..4c833d9a0125 100644 --- a/vllm/utils/jsontree.py +++ b/vllm/utils/jsontree.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from functools import reduce -from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union, overload +from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload if TYPE_CHECKING: import torch @@ -83,7 +83,7 @@ def json_map_leaves( ) -> JSONTree[_U]: ... -def json_map_leaves( +def json_map_leaves( # type: ignore[misc] func: Callable[[_T], _U], value: Union["BatchedTensorInputs", _JSONTree[_T]], ) -> Union["BatchedTensorInputs", _JSONTree[_U]]: @@ -143,18 +143,18 @@ def json_reduce_leaves( def json_reduce_leaves( - func: Callable[..., Any], - value: _JSONTree[Any], - initial: Any = ..., # noqa: B008 + func: Callable[..., Union[_T, _U]], + value: _JSONTree[_T], + initial: _U = cast(_U, ...), # type: ignore # noqa /, -) -> Any: +) -> Union[_T, _U]: """ Apply a function of two arguments cumulatively to each leaf in a nested JSON structure, from left to right, so as to reduce the sequence to a single value. """ if initial is ...: - return reduce(func, json_iter_leaves(value)) # type: ignore[arg-type] + return reduce(func, json_iter_leaves(value)) # type: ignore return reduce( func, # type: ignore[arg-type] From fe9e0f58e05c2c351d35456dc3460539671aad11 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Oct 2025 14:28:42 +0100 Subject: [PATCH 10/21] Don't use unsupported block size in test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/core/test_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index dfa965c56766..a220f852d76d 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1023,7 +1023,7 @@ def test_kv_connector_unable_to_allocate(): """ # Setup Scheduler With Mock External Cache Hit. - BLOCK_SIZE = 4 + BLOCK_SIZE = 8 NUM_BLOCKS = 10 scheduler = create_scheduler( enable_prefix_caching=True, From bcab4c3ace9e5dc0579d860c26532e286cf7e504 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Oct 2025 15:31:32 +0100 Subject: [PATCH 11/21] Use `get_field` for fields which now use `Field` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3c8be6dcf84f..f42ef30249aa 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -335,7 +335,7 @@ class EngineArgs: model: str = ModelConfig.model served_model_name: Optional[Union[str, list[str]]] = ModelConfig.served_model_name - tokenizer: Optional[str] = ModelConfig.tokenizer + tokenizer: Optional[str] = get_field(ModelConfig, "tokenizer") hf_config_path: Optional[str] = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert @@ -353,7 +353,7 @@ class EngineArgs: dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: Optional[int] = ModelConfig.seed - max_model_len: int = ModelConfig.max_model_len + max_model_len: int = get_field(ModelConfig, "max_model_len") cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes") # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without @@ -394,7 +394,7 @@ class EngineArgs: max_parallel_loading_workers: Optional[int] = ( ParallelConfig.max_parallel_loading_workers ) - block_size: BlockSize = CacheConfig.block_size + block_size: BlockSize = get_field(CacheConfig, "block_size") enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching prefix_caching_hash_algo: PrefixCachingHashAlgo = ( CacheConfig.prefix_caching_hash_algo @@ -405,11 +405,11 @@ class EngineArgs: cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes - max_num_batched_tokens: int = SchedulerConfig.max_num_batched_tokens + max_num_batched_tokens: int = get_field(SchedulerConfig, "max_num_batched_tokens") max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold - max_num_seqs: int = SchedulerConfig.max_num_seqs + max_num_seqs: int = get_field(SchedulerConfig, "max_num_seqs") max_logprobs: int = ModelConfig.max_logprobs logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False From 6fb333d9f5e33f29d91b7cbb8eef945c990a815e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 10 Oct 2025 16:01:12 +0100 Subject: [PATCH 12/21] Update other invalid block choice Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/core/test_scheduler.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index a220f852d76d..744acc3beb94 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1022,9 +1022,9 @@ def test_kv_connector_unable_to_allocate(): unable to allocate (run out of blocks in allocate_slots(). """ - # Setup Scheduler With Mock External Cache Hit. + # Setup Scheduler With Mock External Cache Hit. (2 blocks, 2 null) BLOCK_SIZE = 8 - NUM_BLOCKS = 10 + NUM_BLOCKS = 18 scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, @@ -1104,10 +1104,9 @@ def test_kv_connector_handles_preemption(): unable to allocate (run out of blocks in allocate_slots(). """ - # Setup Scheduler With Mock External Cache Hit. - BLOCK_SIZE = 2 - # NOTE: there is 1 null block, so this is 6 blocks. - NUM_BLOCKS = 7 + # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null) + BLOCK_SIZE = 8 + NUM_BLOCKS = 49 scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, From 2a9189515fc2eb1f42988a6f9037ba2153bba74f Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 18:44:01 +0000 Subject: [PATCH 13/21] fix test_scheduler Signed-off-by: wwl2755 --- tests/v1/core/test_scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 744acc3beb94..1f874f4eb206 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1106,7 +1106,7 @@ def test_kv_connector_handles_preemption(): # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null) BLOCK_SIZE = 8 - NUM_BLOCKS = 49 + NUM_BLOCKS = 7 scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, @@ -1125,8 +1125,8 @@ def test_kv_connector_handles_preemption(): # Both can be scheduled at first, but the second request # will be preempted and re-scheduled. NUM_REQUESTS = 2 - NUM_TOKENS = BLOCK_SIZE * 2 + 1 - MAX_TOKENS = BLOCK_SIZE * 2 + NUM_TOKENS = 3 * BLOCK_SIZE - 1 + MAX_TOKENS = 4 requests = create_requests( num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, From 044d190a6ba166dba49df5237d2a4b317e0ca270 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 10 Oct 2025 19:34:06 +0000 Subject: [PATCH 14/21] fix Signed-off-by: wwl2755 --- tests/v1/core/test_scheduler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 1f874f4eb206..1bf56ac3803e 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1022,9 +1022,9 @@ def test_kv_connector_unable_to_allocate(): unable to allocate (run out of blocks in allocate_slots(). """ - # Setup Scheduler With Mock External Cache Hit. (2 blocks, 2 null) + # Setup Scheduler With Mock External Cache Hit. BLOCK_SIZE = 8 - NUM_BLOCKS = 18 + NUM_BLOCKS = 10 scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, @@ -1104,8 +1104,9 @@ def test_kv_connector_handles_preemption(): unable to allocate (run out of blocks in allocate_slots(). """ - # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null) + # Setup Scheduler With Mock External Cache Hit. BLOCK_SIZE = 8 + # NOTE: there is 1 null block, so this is 6 blocks. NUM_BLOCKS = 7 scheduler = create_scheduler( enable_prefix_caching=True, From 51d898018f033ccbc0da919e5c4b825b73b1d94d Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 16 Oct 2025 03:07:11 +0000 Subject: [PATCH 15/21] fix pre-commit Signed-off-by: wwl2755 --- vllm/config/cache.py | 2 +- vllm/config/model.py | 2 +- vllm/config/parallel.py | 20 +++++++++----------- vllm/config/scheduler.py | 4 ++-- vllm/engine/arg_utils.py | 11 +++++++---- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index f35117a747ef..6d15e79534f8 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -30,7 +30,7 @@ class CacheConfig: """Configuration for the KV cache.""" - block_size: BlockSize = Field(default=None) + block_size: BlockSize | None = Field(default=None) """Size of a contiguous cache block in number of tokens. On CUDA devices, only block sizes up to 32 are supported. diff --git a/vllm/config/model.py b/vllm/config/model.py index a9f538e1627c..8c5eb7168b5f 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -171,7 +171,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = Field(default=None) + max_model_len: int | None = Field(default=None) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 944a1e8666f4..155b4c528c8c 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -32,6 +32,14 @@ ExpertPlacementStrategy = Literal["linear", "round_robin"] DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] DataParallelBackend = Literal["ray", "mp"] +All2allBackendType = Literal[ + "naive", + "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter", + "flashinfer_all2allv", +] @config @@ -113,17 +121,7 @@ class ParallelConfig: with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 will have experts [1, 3]. This strategy can help improve load balancing for grouped expert models with no redundant experts.""" - all2all_backend: ( - Literal[ - "naive", - "pplx", - "deepep_high_throughput", - "deepep_low_latency", - "allgather_reducescatter", - "flashinfer_all2allv", - ] - | None - ) = None + all2all_backend: All2allBackendType | None = None """All2All backend for MoE expert parallel communication. If not set, uses the value from VLLM_ALL2ALL_BACKEND environment variable. Available options: - "naive": Naive all2all implementation using broadcasts diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 7aa0af67c928..ba309b267596 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -31,13 +31,13 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = Field(default=None) + max_num_batched_tokens: int | None = Field(default=None) """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: int = Field(default=None) + max_num_seqs: int | None = Field(default=None) """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 84823146d348..c88e4f2cfd32 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -55,6 +55,7 @@ ) from vllm.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo from vllm.config.device import Device +from vllm.config.lora import LoRAExtraVocabSize, MaxLoRARanks from vllm.config.model import ( ConvertOption, HfOverrides, @@ -67,11 +68,13 @@ from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import ( + All2allBackendType, DataParallelBackend, DistributedExecutorBackend, ExpertPlacementStrategy, ) from vllm.config.scheduler import SchedulerPolicy +from vllm.config.structured_outputs import StructuredOutputsBackend from vllm.config.utils import get_field from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform @@ -375,7 +378,7 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel - all2all_backend: str | None = ParallelConfig.all2all_backend + all2all_backend: All2allBackendType | None = ParallelConfig.all2all_backend enable_dbo: bool = ParallelConfig.enable_dbo dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold @@ -449,12 +452,12 @@ class EngineArgs: # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras - max_lora_rank: int = LoRAConfig.max_lora_rank + max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype - lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size + lora_extra_vocab_size: LoRAExtraVocabSize = LoRAConfig.lora_extra_vocab_size ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -474,7 +477,7 @@ class EngineArgs: ) reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields - guided_decoding_backend: str | None = None + guided_decoding_backend: StructuredOutputsBackend | None = None guided_decoding_disable_fallback: bool | None = None guided_decoding_disable_any_whitespace: bool | None = None guided_decoding_disable_additional_properties: bool | None = None From ec7c358af57e5c2680808c80174d1357b306659d Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 16 Oct 2025 03:42:03 +0000 Subject: [PATCH 16/21] fix Signed-off-by: wwl2755 --- vllm/config/cache.py | 2 +- vllm/config/model.py | 2 +- vllm/config/scheduler.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 6d15e79534f8..f35117a747ef 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -30,7 +30,7 @@ class CacheConfig: """Configuration for the KV cache.""" - block_size: BlockSize | None = Field(default=None) + block_size: BlockSize = Field(default=None) """Size of a contiguous cache block in number of tokens. On CUDA devices, only block sizes up to 32 are supported. diff --git a/vllm/config/model.py b/vllm/config/model.py index 8c5eb7168b5f..a9f538e1627c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -171,7 +171,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int | None = Field(default=None) + max_model_len: int = Field(default=None) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index ba309b267596..7aa0af67c928 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -31,13 +31,13 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int | None = Field(default=None) + max_num_batched_tokens: int = Field(default=None) """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: int | None = Field(default=None) + max_num_seqs: int = Field(default=None) """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will From 5f7cef9e35ed90e959bcbb96cc537c6e42999d2b Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 16 Oct 2025 04:36:55 +0000 Subject: [PATCH 17/21] fix Signed-off-by: wwl2755 --- vllm/engine/arg_utils.py | 12 ++++++------ vllm/utils/async_utils.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c88e4f2cfd32..ee183772a3ec 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1137,7 +1137,7 @@ def create_model_config(self) -> ModelConfig: hf_token=self.hf_token, hf_overrides=self.hf_overrides, tokenizer_revision=self.tokenizer_revision, - max_model_len=self.max_model_len, + max_model_len=self.max_model_len, # type: ignore[arg-type] quantization=self.quantization, enforce_eager=self.enforce_eager, max_logprobs=self.max_logprobs, @@ -1325,7 +1325,7 @@ def create_engine_config( ) cache_config = CacheConfig( - block_size=self.block_size, + block_size=self.block_size, # type: ignore[arg-type] gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, @@ -1512,9 +1512,9 @@ def create_engine_config( scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, - max_num_batched_tokens=self.max_num_batched_tokens, - max_num_seqs=self.max_num_seqs, - max_model_len=model_config.max_model_len, + max_num_batched_tokens=self.max_num_batched_tokens, # type: ignore[arg-type] + max_num_seqs=self.max_num_seqs, # type: ignore[arg-type] + max_model_len=model_config.max_model_len, # type: ignore[arg-type] cuda_graph_sizes=self.cuda_graph_sizes, num_lookahead_slots=num_lookahead_slots, enable_chunked_prefill=self.enable_chunked_prefill, @@ -1616,7 +1616,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: return False # No Mamba or Encoder-Decoder so far. - if not model_config.is_v1_compatible: + if not getattr(model_config, "is_v1_compatible", True): _raise_or_fallback( feature_name=str(model_config.architectures), recommend_to_remove=False ) diff --git a/vllm/utils/async_utils.py b/vllm/utils/async_utils.py index aeabd808add5..4f0f36859de9 100644 --- a/vllm/utils/async_utils.py +++ b/vllm/utils/async_utils.py @@ -270,7 +270,7 @@ async def merge_async_iterators( loop = asyncio.get_running_loop() - awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)} + awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)} # type: ignore[var-annotated, arg-type] try: while awaits: done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED) @@ -279,7 +279,7 @@ async def merge_async_iterators( try: item = await d i, it = pair - awaits[loop.create_task(anext(it))] = pair + awaits[loop.create_task(anext(it))] = pair # type: ignore[arg-type] yield i, item except StopAsyncIteration: pass From 41eaf97d7944d96e2a061212f532242d005890a3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:48:36 +0200 Subject: [PATCH 18/21] Don't use `pydantic.Field` for `get_field` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 6fcc29d3d253..5e7e7580c5a9 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -6,12 +6,12 @@ import inspect import textwrap from collections.abc import Iterable -from dataclasses import MISSING, fields, is_dataclass, replace +from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re -from pydantic.fields import Field, FieldInfo +from pydantic.fields import FieldInfo from typing_extensions import runtime_checkable if TYPE_CHECKING: @@ -47,17 +47,17 @@ def get_field(cls: ConfigType, name: str) -> Field: cls_fields = {f.name: f for f in fields(cls)} if name not in cls_fields: raise ValueError(f"Field '{name}' not found in {cls.__name__}.") - named_field = cls_fields[name] + named_field: Field = cls_fields[name] if (default_factory := named_field.default_factory) is not MISSING: - return Field(default_factory=default_factory) + return field(default_factory=default_factory) if (default := named_field.default) is not MISSING: if isinstance(default, FieldInfo): # Handle pydantic.Field defaults if default.default_factory is not None: - return Field(default_factory=default.default_factory) + return field(default_factory=default.default_factory) else: default = default.default - return Field(default=default) + return field(default=default) raise ValueError( f"{cls.__name__}.{name} must have a default value or default factory." From 3a4a61624882150e3b3126c07d57f218933552fa Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:55:23 +0200 Subject: [PATCH 19/21] Use wrap validator to only skip validation when `None` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/cache.py | 8 ++++++++ vllm/config/model.py | 15 +++++++++++---- vllm/engine/arg_utils.py | 6 +----- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index f35117a747ef..6a28bdb753ab 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +from collections.abc import Callable from dataclasses import field from typing import TYPE_CHECKING, Any, Literal @@ -150,6 +151,13 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} + @field_validator("block_size", mode="wrap") + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + if value is None: + return value + return handler(value) + @field_validator("cache_dtype", mode="after") @classmethod def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType: diff --git a/vllm/config/model.py b/vllm/config/model.py index a9f538e1627c..4c7bf99c26b4 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -10,7 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal, cast, get_args import torch -from pydantic import ConfigDict, Field, SkipValidation, field_validator, model_validator +from pydantic import ConfigDict, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE @@ -120,7 +120,7 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ - tokenizer: str = Field(default=None) + tokenizer: str = None """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" @@ -171,7 +171,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = Field(default=None) + max_model_len: int = None """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -182,7 +182,7 @@ class ModelConfig: - 25.6k -> 25,600""" spec_target_max_model_len: int | None = None """Specify the maximum length for spec decoding draft models.""" - quantization: SkipValidation[QuantizationMethods | None] = None + quantization: str | QuantizationMethods | None = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is `None`, we assume the model weights are not quantized and use `dtype` to @@ -302,6 +302,13 @@ class ModelConfig: skip_mm_profiling: InitVar[bool | None] = None video_pruning_rate: InitVar[float | None] = None + @field_validator("tokenizer", "max_model_len", mode="wrap") + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + if value is None: + return value + return handler(value) + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ee183772a3ec..7ce47c581a50 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1114,16 +1114,13 @@ def create_model_config(self) -> ModelConfig: self.mm_encoder_tp_mode = "data" - kwargs = dict[str, Any]() - if self.tokenizer is not None: - kwargs["tokenizer"] = self.tokenizer - return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, task=self.task, + tokenizer=self.tokenizer, # type: ignore[arg-type] tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, @@ -1168,7 +1165,6 @@ def create_model_config(self) -> ModelConfig: logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, io_processor_plugin=self.io_processor_plugin, - **kwargs, ) def validate_tensorizer_args(self): From 18d254b76e982c1d70b5b4a67e59639704802d55 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:04:27 +0200 Subject: [PATCH 20/21] pre-commit Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/utils.py | 2 +- vllm/engine/arg_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 5e7e7580c5a9..3452315b3e2e 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT: return cls -def get_field(cls: ConfigType, name: str) -> Field: +def get_field(cls: ConfigType, name: str) -> Any: """Get the default factory field of a dataclass by name. Used for getting default factory fields in `EngineArgs`.""" if not is_dataclass(cls): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7ce47c581a50..2575665e1ad2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -426,7 +426,7 @@ class EngineArgs: hf_token: bool | str | None = ModelConfig.hf_token hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") tokenizer_revision: str | None = ModelConfig.tokenizer_revision - quantization: QuantizationMethods | None = ModelConfig.quantization + quantization: str | QuantizationMethods | None = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( From f23315d92bd0654a7a3a4ab259ce903ae1830516 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 21 Oct 2025 11:41:34 +0200 Subject: [PATCH 21/21] Skip some more `None` validation Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/cache.py | 2 +- vllm/config/scheduler.py | 24 +++++++++++++++++++----- vllm/config/vllm.py | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 6a28bdb753ab..41537b56707e 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -31,7 +31,7 @@ class CacheConfig: """Configuration for the KV cache.""" - block_size: BlockSize = Field(default=None) + block_size: BlockSize = None """Size of a contiguous cache block in number of tokens. On CUDA devices, only block sizes up to 32 are supported. diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 7aa0af67c928..f4015f43de8b 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +from collections.abc import Callable from dataclasses import InitVar, field from typing import Any, Literal -from pydantic import Field, SkipValidation, model_validator +from pydantic import field_validator, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self @@ -31,19 +32,19 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = Field(default=None) + max_num_batched_tokens: int = None """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: int = Field(default=None) + max_num_seqs: int = None """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: int = Field(default=None) + max_model_len: int = None """Maximum length of a sequence (including prompt and generated text). This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" @@ -79,7 +80,7 @@ class SchedulerConfig: 3. more than one value (e.g. 1 2 128) is provided, then the capture list will follow the provided list.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + enable_chunked_prefill: bool = None """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -169,6 +170,19 @@ def compute_hash(self) -> str: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str + @field_validator( + "max_num_batched_tokens", + "max_num_seqs", + "max_model_len", + "enable_chunked_prefill", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + if value is None: + return value + return handler(value) + def __post_init__(self, is_encoder_decoder: bool) -> None: if self.max_model_len is None: self.max_model_len = 8192 diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index da7ec7032969..cd31fade025b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -57,7 +57,7 @@ class VllmConfig: # TODO: use default_factory once default constructing ModelConfig doesn't # try to download a model - model_config: ModelConfig = Field(default=None) + model_config: ModelConfig = None """Model configuration.""" cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration."""