From 1b80912528d2f26dba6ec173fb0d8cbd93745676 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Thu, 9 Oct 2025 22:17:54 +0000
Subject: [PATCH 01/21] Enable mypy for vllm/engine and fix resulting type errors

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 tools/pre_commit/mypy.py | 1 +
 vllm/engine/arg_utils.py | 4 ++--
 vllm/multimodal/parse.py | 8 +++++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 22ee08535bdd..527279860e1e 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -27,6 +27,7 @@
 FILES = [
     "vllm/*.py",
     "vllm/assets",
+    "vllm/engine",
     "vllm/entrypoints",
     "vllm/inputs",
     "vllm/logging_utils",
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7e66d8dba8ac..906418aea25e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1243,7 +1243,7 @@ def create_engine_config(
         (self.model, self.tokenizer, self.speculative_config) = (
             maybe_override_with_speculators(
                 model=self.model,
-                tokenizer=self.tokenizer,
+                tokenizer=self.tokenizer if self.tokenizer is not None else self.model,
                 revision=self.revision,
                 trust_remote_code=self.trust_remote_code,
                 vllm_speculative_config=self.speculative_config,
@@ -1685,7 +1685,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
         return True
 
     def _set_default_args(
-        self, usage_context: UsageContext, model_config: ModelConfig
+        self, usage_context: Optional[UsageContext], model_config: ModelConfig
     ) -> None:
         """Set Default Arguments for V1 Engine."""
 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 8fdc5cf721d0..c53420789b5c 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -13,6 +13,7 @@
     Optional,
     TypeVar,
     Union,
+    cast,
 )
 
 import numpy as np
@@ -366,7 +367,8 @@ def _is_embeddings(
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
-            return data[0].ndim == 2
+            tensors = cast(list[torch.Tensor], data)
+            return tensors[0].ndim == 2
 
         return False
 
@@ -434,7 +436,7 @@ def _parse_audio_data(
         elif isinstance(data, (np.ndarray, torch.Tensor)):
             data_items = [elem for elem in data]
         else:
-            data_items = data
+            data_items = data  # type: ignore[assignment]
 
         new_audios = list[np.ndarray]()
         for data_item in data_items:
@@ -498,7 +500,7 @@ def _parse_video_data(
         elif isinstance(data, tuple) and len(data) == 2:
             data_items = [data]
         else:
-            data_items = data
+            data_items = data  # type: ignore[assignment]
 
         new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]()
         metadata_lst: list[Optional[dict[str, Any]]] = []

From 5799b37113aaf68c51fc83d33fc79108814cb2e9 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Thu, 9 Oct 2025 22:23:11 +0000
Subject: [PATCH 02/21] Remove vllm/engine from the second directory list in mypy.py

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 tools/pre_commit/mypy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 527279860e1e..2147111d9443 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -45,7 +45,6 @@
     "vllm/attention",
     "vllm/compilation",
     "vllm/distributed",
-    "vllm/engine",
     "vllm/executor",
     "vllm/inputs",
     "vllm/lora",

From 738f668d704448b1f87e1e576ea24425317ad105 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 00:58:57 +0000
Subject: [PATCH 03/21] mypy for vllm/utils

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 tools/pre_commit/mypy.py |  1 +
 vllm/engine/arg_utils.py |  4 +++-
 vllm/multimodal/parse.py |  8 ++++----
 vllm/utils/__init__.py   | 21 +++++++++++++--------
 vllm/utils/jsontree.py   | 12 ++++++------
 5 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 2147111d9443..ddf7b652e04e 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -36,6 +36,7 @@
     "vllm/transformers_utils",
     "vllm/triton_utils",
     "vllm/usage",
+    "vllm/utils",
 ]
 
 # After fixing errors resulting from changing follow_imports
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 906418aea25e..d50e42084e8f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1240,10 +1240,12 @@ def create_engine_config(
         self.model = model_config.model
         self.tokenizer = model_config.tokenizer
 
+        # After ModelConfig init, tokenizer must be resolved (never None).
+        assert self.tokenizer is not None
         (self.model, self.tokenizer, self.speculative_config) = (
             maybe_override_with_speculators(
                 model=self.model,
-                tokenizer=self.tokenizer if self.tokenizer is not None else self.model,
+                tokenizer=self.tokenizer,
                 revision=self.revision,
                 trust_remote_code=self.trust_remote_code,
                 vllm_speculative_config=self.speculative_config,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index c53420789b5c..f8b67cdc9546 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -13,7 +13,6 @@
     Optional,
     TypeVar,
     Union,
-    cast,
 )
 
 import numpy as np
@@ -367,8 +366,7 @@ def _is_embeddings(
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
-            tensors = cast(list[torch.Tensor], data)
-            return tensors[0].ndim == 2
+            return data[0].ndim == 2
 
         return False
 
@@ -426,6 +424,8 @@ def _parse_audio_data(
         if self._is_embeddings(data):
             return AudioEmbeddingItems(data)
 
+        # Normalize into a list of audio items
+        data_items: list[AudioItem]
         if (
             is_list_of(data, float)
             or isinstance(data, (np.ndarray, torch.Tensor))
@@ -436,7 +436,7 @@ def _parse_audio_data(
         elif isinstance(data, (np.ndarray, torch.Tensor)):
             data_items = [elem for elem in data]
         else:
-            data_items = data  # type: ignore[assignment]
+            data_items = data
 
         new_audios = list[np.ndarray]()
         for data_item in data_items:
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 22c2a4b5362c..314f9ee6c08f 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -81,7 +81,7 @@
 import setproctitle
 import torch
 import torch.types
-import yaml
+import yaml  # type: ignore[import-untyped]
 import zmq
 import zmq.asyncio
 from packaging import version
@@ -486,7 +486,10 @@ async def merge_async_iterators(
 
     loop = asyncio.get_running_loop()
 
-    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}
+    awaits: dict[asyncio.Task[T], tuple[int, AsyncGenerator[T, None]]] = {
+        loop.create_task(anext(it)): (i, it)  # type: ignore[arg-type]
+        for i, it in enumerate(iterators)
+    }
     try:
         while awaits:
             done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED)
@@ -495,7 +498,7 @@ async def merge_async_iterators(
                 try:
                     item = await d
                     i, it = pair
-                    awaits[loop.create_task(anext(it))] = pair
+                    awaits[loop.create_task(anext(it))] = pair  # type: ignore[arg-type]
                     yield i, item
                 except StopAsyncIteration:
                     pass
@@ -1163,11 +1166,13 @@ def find_nccl_include_paths() -> list[str] | None:
         import importlib.util
 
         spec = importlib.util.find_spec("nvidia.nccl")
-        if spec and getattr(spec, "submodule_search_locations", None):
-            for loc in spec.submodule_search_locations:
-                inc_dir = os.path.join(loc, "include")
-                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
-                    paths.append(inc_dir)
+        if spec:
+            locations = getattr(spec, "submodule_search_locations", None)
+            if locations:
+                for loc in locations:
+                    inc_dir = os.path.join(loc, "include")
+                    if os.path.exists(os.path.join(inc_dir, "nccl.h")):
+                        paths.append(inc_dir)
     except Exception:
         pass
 
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py
index dcdc6ccb4c63..045c547dd7be 100644
--- a/vllm/utils/jsontree.py
+++ b/vllm/utils/jsontree.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable
 from functools import reduce
-from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload
+from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union, overload
 
 if TYPE_CHECKING:
     import torch
@@ -94,7 +94,7 @@ def json_map_leaves(
             for k, v in value.items()
         }
     elif isinstance(value, list):
-        return [json_map_leaves(func, v) for v in value]
+        return [json_map_leaves(func, v) for v in value]  # type: ignore[return-value]
     elif isinstance(value, tuple):
         return tuple(json_map_leaves(func, v) for v in value)
     else:
@@ -143,11 +143,11 @@ def json_reduce_leaves(
 
 
 def json_reduce_leaves(
-    func: Callable[..., Union[_T, _U]],
-    value: _JSONTree[_T],
-    initial: _U = cast(_U, ...),  # noqa: B008
+    func: Callable[..., Any],
+    value: _JSONTree[Any],
+    initial: Any = ...,  # noqa: B008
     /,
-) -> Union[_T, _U]:
+) -> Any:
     """
     Apply a function of two arguments cumulatively to each leaf in a
     nested JSON structure, from left to right, so as to reduce the

From ee3fc70baef2cbeed74deac1247af5e747ef2925 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 01:01:31 +0000
Subject: [PATCH 04/21] Annotate data_items in _parse_video_data and drop type: ignore

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/multimodal/parse.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index f8b67cdc9546..5316727ac8e2 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -424,7 +424,6 @@ def _parse_audio_data(
         if self._is_embeddings(data):
             return AudioEmbeddingItems(data)
 
-        # Normalize into a list of audio items
         data_items: list[AudioItem]
         if (
             is_list_of(data, float)
@@ -489,6 +488,7 @@ def _parse_video_data(
         if self._is_embeddings(data):
             return VideoEmbeddingItems(data)
 
+        data_items: list[VideoItem]
         if (
             is_list_of(data, PILImage.Image)
             or isinstance(data, (np.ndarray, torch.Tensor))
@@ -500,7 +500,7 @@ def _parse_video_data(
         elif isinstance(data, tuple) and len(data) == 2:
             data_items = [data]
         else:
-            data_items = data  # type: ignore[assignment]
+            data_items = data
 
         new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]()
         metadata_lst: list[Optional[dict[str, Any]]] = []

From ceb1f743ffb2bd3e4d3f04862f91361ec6b69304 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 02:35:38 +0000
Subject: [PATCH 05/21] Fix mypy errors in vllm/engine/arg_utils.py

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/engine/arg_utils.py | 50 ++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d50e42084e8f..eafaf9bd61eb 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -54,6 +54,7 @@
 )
 from vllm.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo
 from vllm.config.device import Device
+from vllm.config.lora import LoRAExtraVocabSize, MaxLoRARanks
 from vllm.config.model import (
     ConvertOption,
     HfOverrides,
@@ -65,7 +66,11 @@
 )
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
-from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
+from vllm.config.parallel import (
+    DataParallelBackend,
+    DistributedExecutorBackend,
+    ExpertPlacementStrategy,
+)
 from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
@@ -366,7 +371,7 @@ class EngineArgs:
     data_parallel_address: Optional[str] = None
     data_parallel_rpc_port: Optional[int] = None
     data_parallel_hybrid_lb: bool = False
-    data_parallel_backend: str = ParallelConfig.data_parallel_backend
+    data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     enable_dbo: bool = ParallelConfig.enable_dbo
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
@@ -436,17 +441,17 @@ class EngineArgs:
     mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
     io_processor_plugin: Optional[str] = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
-    video_pruning_rate: float = MultiModalConfig.video_pruning_rate
+    video_pruning_rate: Optional[float] = MultiModalConfig.video_pruning_rate
     # LoRA fields
     enable_lora: bool = False
     enable_lora_bias: bool = LoRAConfig.bias_enabled
     max_loras: int = LoRAConfig.max_loras
-    max_lora_rank: int = LoRAConfig.max_lora_rank
+    max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
     default_mm_loras: Optional[dict[str, str]] = LoRAConfig.default_mm_loras
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
     max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
     lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
-    lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
+    lora_extra_vocab_size: LoRAExtraVocabSize = LoRAConfig.lora_extra_vocab_size
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
     num_gpu_blocks_override: Optional[int] = CacheConfig.num_gpu_blocks_override
@@ -502,7 +507,7 @@ class EngineArgs:
         ModelConfig, "override_generation_config"
     )
     model_impl: str = ModelConfig.model_impl
-    override_attention_dtype: str = ModelConfig.override_attention_dtype
+    override_attention_dtype: Optional[str] = ModelConfig.override_attention_dtype
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
     mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
@@ -511,7 +516,7 @@ class EngineArgs:
     additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
-    pt_load_map_location: str = LoadConfig.pt_load_map_location
+    pt_load_map_location: Union[str, dict[str, str]] = LoadConfig.pt_load_map_location
 
     # DEPRECATED
     enable_multimodal_encoder_data_parallel: bool = False
@@ -1095,13 +1100,12 @@ def create_model_config(self) -> ModelConfig:
 
             self.mm_encoder_tp_mode = "data"
 
-        return ModelConfig(
+        model_config_kwargs: dict[str, Any] = dict(
             model=self.model,
             hf_config_path=self.hf_config_path,
             runner=self.runner,
             convert=self.convert,
             task=self.task,
-            tokenizer=self.tokenizer,
             tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
             allowed_local_media_path=self.allowed_local_media_path,
@@ -1115,7 +1119,6 @@ def create_model_config(self) -> ModelConfig:
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
-            max_model_len=self.max_model_len,
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
             max_logprobs=self.max_logprobs,
@@ -1147,6 +1150,11 @@ def create_model_config(self) -> ModelConfig:
             video_pruning_rate=self.video_pruning_rate,
             io_processor_plugin=self.io_processor_plugin,
         )
+        if self.tokenizer is not None:
+            model_config_kwargs["tokenizer"] = self.tokenizer
+        if self.max_model_len is not None:
+            model_config_kwargs["max_model_len"] = self.max_model_len
+        return ModelConfig(**model_config_kwargs)
 
     def validate_tensorizer_args(self):
         from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -1488,10 +1496,8 @@ def create_engine_config(
         if speculative_config is not None:
             num_lookahead_slots = speculative_config.num_lookahead_slots
 
-        scheduler_config = SchedulerConfig(
+        scheduler_kwargs: dict[str, Any] = dict(
             runner_type=model_config.runner_type,
-            max_num_batched_tokens=self.max_num_batched_tokens,
-            max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
             cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
@@ -1508,6 +1514,11 @@ def create_engine_config(
             disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
             async_scheduling=self.async_scheduling,
         )
+        if self.max_num_batched_tokens is not None:
+            scheduler_kwargs["max_num_batched_tokens"] = self.max_num_batched_tokens
+        if self.max_num_seqs is not None:
+            scheduler_kwargs["max_num_seqs"] = self.max_num_seqs
+        scheduler_config = SchedulerConfig(**scheduler_kwargs)
 
         if not model_config.is_multimodal_model and self.default_mm_loras:
             raise ValueError(
@@ -1545,17 +1556,15 @@ def create_engine_config(
         # Forward the deprecated CLI args to the StructuredOutputsConfig
         so_config = self.structured_outputs_config
         if self.guided_decoding_backend is not None:
-            so_config.guided_decoding_backend = self.guided_decoding_backend
+            so_config.backend = self.guided_decoding_backend
         if self.guided_decoding_disable_fallback is not None:
-            so_config.guided_decoding_disable_fallback = (
-                self.guided_decoding_disable_fallback
-            )
+            so_config.disable_fallback = self.guided_decoding_disable_fallback
         if self.guided_decoding_disable_any_whitespace is not None:
-            so_config.guided_decoding_disable_any_whitespace = (
+            so_config.disable_any_whitespace = (
                 self.guided_decoding_disable_any_whitespace
             )
         if self.guided_decoding_disable_additional_properties is not None:
-            so_config.guided_decoding_disable_additional_properties = (
+            so_config.disable_additional_properties = (
                 self.guided_decoding_disable_additional_properties
             )
 
@@ -1599,7 +1608,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(
-                feature_name=model_config.architectures, recommend_to_remove=False
+                feature_name=str(model_config.architectures), recommend_to_remove=False
             )
             return False
 
@@ -1715,6 +1724,7 @@ def _set_default_args(
                 else:
                     self.enable_prefix_caching = True
         else:
+            assert model_config.pooler_config is not None
             pooling_type = model_config.pooler_config.pooling_type
             is_causal = getattr(model_config.hf_config, "is_causal", True)
             incremental_prefill_supported = (

From 6d8e0ce2f483bc55bf11cb4c95e0c4120edf0c69 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 03:24:08 +0000
Subject: [PATCH 06/21] Fix mypy errors in arg_utils, metrics, and get_field

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/config/utils.py     |  2 +-
 vllm/engine/arg_utils.py | 22 ++++++++++++++--------
 vllm/engine/metrics.py   | 16 +++++++++++++++-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 5e7e7580c5a9..3452315b3e2e 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT:
     return cls
 
 
-def get_field(cls: ConfigType, name: str) -> Field:
+def get_field(cls: ConfigType, name: str) -> Any:
     """Get the default factory field of a dataclass by name. Used for getting
     default factory fields in `EngineArgs`."""
     if not is_dataclass(cls):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index eafaf9bd61eb..f8d0d1255bf8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -72,6 +72,7 @@
     ExpertPlacementStrategy,
 )
 from vllm.config.scheduler import SchedulerPolicy
+from vllm.config.structured_outputs import StructuredOutputsBackend
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
@@ -217,11 +218,12 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
             default = field.default
             # Handle pydantic.Field defaults
             if isinstance(default, FieldInfo):
-                default = (
-                    default.default
-                    if default.default_factory is None
-                    else default.default_factory()
-                )
+                if default.default_factory is not None and callable(
+                    default.default_factory
+                ):
+                    default = cast(Callable[[], Any], default.default_factory)()
+                else:
+                    default = default.default
         elif field.default_factory is not MISSING:
             default = field.default_factory()
 
@@ -1311,8 +1313,7 @@ def create_engine_config(
             f"dcp_size={self.decode_context_parallel_size}."
         )
 
-        cache_config = CacheConfig(
-            block_size=self.block_size,
+        cache_kwargs: dict[str, Any] = dict(
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
@@ -1328,6 +1329,9 @@ def create_engine_config(
             mamba_cache_dtype=self.mamba_cache_dtype,
             mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
         )
+        if self.block_size is not None:
+            cache_kwargs["block_size"] = self.block_size
+        cache_config = CacheConfig(**cache_kwargs)
 
         ray_runtime_env = None
         if is_ray_initialized():
@@ -1556,7 +1560,9 @@ def create_engine_config(
         # Forward the deprecated CLI args to the StructuredOutputsConfig
         so_config = self.structured_outputs_config
         if self.guided_decoding_backend is not None:
-            so_config.backend = self.guided_decoding_backend
+            so_config.backend = cast(
+                StructuredOutputsBackend, self.guided_decoding_backend
+            )
         if self.guided_decoding_disable_fallback is not None:
             so_config.disable_fallback = self.guided_decoding_disable_fallback
         if self.guided_decoding_disable_any_whitespace is not None:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 45b798ed96cb..bc4e1f5176f9 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -51,7 +51,11 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig):
 
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
-        self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
+        self.show_hidden_metrics = (
+            vllm_config.observability_config.show_hidden_metrics
+            if vllm_config.observability_config is not None
+            else False
+        )
 
         # System stats
         #   Scheduler State
@@ -451,6 +455,11 @@ class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
         super().__init__(local_interval, vllm_config)
+        # Explicitly annotate attributes for mypy when follow-imports=skip
+        self.num_prompt_tokens: list[int] = []
+        self.num_generation_tokens: list[int] = []
+        self.last_local_log: float = time.time()
+        self.local_interval: float = local_interval
         self.last_prompt_throughput: Optional[float] = None
         self.last_generation_throughput: Optional[float] = None
 
@@ -533,6 +542,11 @@ def __init__(
         self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig
     ) -> None:
         super().__init__(local_interval, vllm_config)
+        # Explicitly annotate attributes for mypy when follow-imports=skip
+        self.num_prompt_tokens: list[int] = []
+        self.num_generation_tokens: list[int] = []
+        self.last_local_log: float = time.time()
+        self.local_interval: float = local_interval
         # Prometheus metrics
         self.labels = labels
         self.metrics = self._metrics_cls(

From a21884d112b399ac82ac8f9dad29590f8896186d Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 03:42:25 +0000
Subject: [PATCH 07/21] Replace metrics.py mypy workarounds with assert and bare annotations

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/engine/metrics.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index bc4e1f5176f9..d1e11755d749 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -51,11 +51,8 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig):
 
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
-        self.show_hidden_metrics = (
-            vllm_config.observability_config.show_hidden_metrics
-            if vllm_config.observability_config is not None
-            else False
-        )
+        assert vllm_config.observability_config is not None
+        self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
 
         # System stats
         #   Scheduler State
@@ -455,11 +452,11 @@ class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
         super().__init__(local_interval, vllm_config)
-        # Explicitly annotate attributes for mypy when follow-imports=skip
-        self.num_prompt_tokens: list[int] = []
-        self.num_generation_tokens: list[int] = []
-        self.last_local_log: float = time.time()
-        self.local_interval: float = local_interval
+
+        self.num_prompt_tokens: list[int]
+        self.num_generation_tokens: list[int]
+        self.last_local_log: float
+        self.local_interval: float
         self.last_prompt_throughput: Optional[float] = None
         self.last_generation_throughput: Optional[float] = None
 
@@ -542,11 +539,11 @@ def __init__(
         self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig
     ) -> None:
         super().__init__(local_interval, vllm_config)
-        # Explicitly annotate attributes for mypy when follow-imports=skip
-        self.num_prompt_tokens: list[int] = []
-        self.num_generation_tokens: list[int] = []
-        self.last_local_log: float = time.time()
-        self.local_interval: float = local_interval
+
+        self.num_prompt_tokens: list[int]
+        self.num_generation_tokens: list[int]
+        self.last_local_log: float
+        self.local_interval: float
         # Prometheus metrics
         self.labels = labels
         self.metrics = self._metrics_cls(

From 335d034c08d9b1eab31187a2196add9f5931c7a3 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Oct 2025 11:46:18 +0100
Subject: [PATCH 08/21] Update `get_field` to use `pydantic.Field`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/utils.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 3452315b3e2e..6fcc29d3d253 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -6,12 +6,12 @@
 import inspect
 import textwrap
 from collections.abc import Iterable
-from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from dataclasses import MISSING, fields, is_dataclass, replace
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar
 
 import regex as re
-from pydantic.fields import FieldInfo
+from pydantic.fields import Field, FieldInfo
 from typing_extensions import runtime_checkable
 
 if TYPE_CHECKING:
@@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT:
     return cls
 
 
-def get_field(cls: ConfigType, name: str) -> Any:
+def get_field(cls: ConfigType, name: str) -> Field:
     """Get the default factory field of a dataclass by name. Used for getting
     default factory fields in `EngineArgs`."""
     if not is_dataclass(cls):
@@ -47,17 +47,17 @@ def get_field(cls: ConfigType, name: str) -> Any:
     cls_fields = {f.name: f for f in fields(cls)}
     if name not in cls_fields:
         raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
-    named_field: Field = cls_fields[name]
+    named_field = cls_fields[name]
     if (default_factory := named_field.default_factory) is not MISSING:
-        return field(default_factory=default_factory)
+        return Field(default_factory=default_factory)
     if (default := named_field.default) is not MISSING:
         if isinstance(default, FieldInfo):
             # Handle pydantic.Field defaults
             if default.default_factory is not None:
-                return field(default_factory=default.default_factory)
+                return Field(default_factory=default.default_factory)
             else:
                 default = default.default
-        return field(default=default)
+        return Field(default=default)
 
     raise ValueError(
         f"{cls.__name__}.{name} must have a default value or default factory."

From 4c3a72ba3a68208338f6870b16a7eaf747d4b319 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Oct 2025 13:54:30 +0100
Subject: [PATCH 09/21] The rest

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/cache.py     |  4 +--
 vllm/config/model.py     |  6 ++---
 vllm/config/scheduler.py |  8 +++---
 vllm/config/vllm.py      |  4 ++-
 vllm/engine/arg_utils.py | 55 +++++++++++++++++-----------------------
 vllm/engine/metrics.py   | 21 ++++-----------
 vllm/utils/__init__.py   | 21 ++++++---------
 vllm/utils/jsontree.py   | 14 +++++-----
 8 files changed, 55 insertions(+), 78 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index fd47d5c8f976..a158ffa06126 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -5,7 +5,7 @@
 from dataclasses import field
 from typing import TYPE_CHECKING, Any, Literal, Optional
 
-from pydantic import Field, SkipValidation, field_validator
+from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
 
 from vllm.config.utils import config
@@ -30,7 +30,7 @@
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: SkipValidation[BlockSize] = None  # type: ignore
+    block_size: BlockSize = Field(default=None)
     """Size of a contiguous cache block in number of tokens. On CUDA devices,
     only block sizes up to 32 are supported.
 
diff --git a/vllm/config/model.py b/vllm/config/model.py
index d0c027e47675..0436eab8dd36 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -18,7 +18,7 @@
 )
 
 import torch
-from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
+from pydantic import ConfigDict, Field, SkipValidation, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 
@@ -127,7 +127,7 @@ class ModelConfig:
 
     Note that the model may support other tasks using the same model runner.
     """
-    tokenizer: SkipValidation[str] = None  # type: ignore
+    tokenizer: str = Field(default=None)
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode = "auto"
@@ -178,7 +178,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: SkipValidation[int] = None  # type: ignore
+    max_model_len: int = Field(default=None)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 396258aac287..dd0b966caa77 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -5,7 +5,7 @@
 from dataclasses import InitVar, field
 from typing import Any, Literal, Union
 
-from pydantic import SkipValidation, model_validator
+from pydantic import Field, SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
@@ -31,19 +31,19 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
+    max_num_batched_tokens: int = Field(default=None)
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: SkipValidation[int] = None  # type: ignore
+    max_num_seqs: int = Field(default=None)
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_model_len: SkipValidation[int] = None  # type: ignore
+    max_model_len: int = Field(default=None)
     """Maximum length of a sequence (including prompt and generated text). This
     is primarily set in `ModelConfig` and that value should be manually
     duplicated here."""
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 833581035a31..791f44a4f268 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -76,7 +76,9 @@ class VllmConfig:
         default_factory=StructuredOutputsConfig
     )
     """Structured outputs configuration."""
-    observability_config: Optional[ObservabilityConfig] = None
+    observability_config: ObservabilityConfig = field(
+        default_factory=ObservabilityConfig
+    )
     """Observability configuration."""
     quant_config: Optional[QuantizationConfig] = None
     """Quantization configuration."""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f8d0d1255bf8..3c8be6dcf84f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -218,12 +218,11 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
             default = field.default
             # Handle pydantic.Field defaults
             if isinstance(default, FieldInfo):
-                if default.default_factory is not None and callable(
-                    default.default_factory
-                ):
-                    default = cast(Callable[[], Any], default.default_factory)()
-                else:
+                if default.default_factory is None:
                     default = default.default
+                else:
+                    default_factory = cast(Callable[[], Any], default.default_factory)
+                    default = default_factory()
         elif field.default_factory is not MISSING:
             default = field.default_factory()
 
@@ -354,7 +353,7 @@ class EngineArgs:
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
     seed: Optional[int] = ModelConfig.seed
-    max_model_len: Optional[int] = ModelConfig.max_model_len
+    max_model_len: int = ModelConfig.max_model_len
     cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes")
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
@@ -395,7 +394,7 @@ class EngineArgs:
     max_parallel_loading_workers: Optional[int] = (
         ParallelConfig.max_parallel_loading_workers
     )
-    block_size: Optional[BlockSize] = CacheConfig.block_size
+    block_size: BlockSize = CacheConfig.block_size
     enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
@@ -406,11 +405,11 @@ class EngineArgs:
     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
     kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
-    max_num_batched_tokens: Optional[int] = SchedulerConfig.max_num_batched_tokens
+    max_num_batched_tokens: int = SchedulerConfig.max_num_batched_tokens
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
     long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
-    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
+    max_num_seqs: int = SchedulerConfig.max_num_seqs
     max_logprobs: int = ModelConfig.max_logprobs
     logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
     disable_log_stats: bool = False
@@ -473,7 +472,7 @@ class EngineArgs:
     )
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
     # Deprecated guided decoding fields
-    guided_decoding_backend: Optional[str] = None
+    guided_decoding_backend: Optional[StructuredOutputsBackend] = None
     guided_decoding_disable_fallback: Optional[bool] = None
     guided_decoding_disable_any_whitespace: Optional[bool] = None
     guided_decoding_disable_additional_properties: Optional[bool] = None
@@ -518,7 +517,7 @@ class EngineArgs:
     additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
-    pt_load_map_location: Union[str, dict[str, str]] = LoadConfig.pt_load_map_location
+    pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location
 
     # DEPRECATED
     enable_multimodal_encoder_data_parallel: bool = False
@@ -1102,7 +1101,11 @@ def create_model_config(self) -> ModelConfig:
 
             self.mm_encoder_tp_mode = "data"
 
-        model_config_kwargs: dict[str, Any] = dict(
+        kwargs = dict[str, Any]()
+        if self.tokenizer is not None:
+            kwargs["tokenizer"] = self.tokenizer
+
+        return ModelConfig(
             model=self.model,
             hf_config_path=self.hf_config_path,
             runner=self.runner,
@@ -1121,6 +1124,7 @@ def create_model_config(self) -> ModelConfig:
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
+            max_model_len=self.max_model_len,
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
             max_logprobs=self.max_logprobs,
@@ -1151,12 +1155,8 @@ def create_model_config(self) -> ModelConfig:
             logits_processors=self.logits_processors,
             video_pruning_rate=self.video_pruning_rate,
             io_processor_plugin=self.io_processor_plugin,
+            **kwargs,
         )
-        if self.tokenizer is not None:
-            model_config_kwargs["tokenizer"] = self.tokenizer
-        if self.max_model_len is not None:
-            model_config_kwargs["max_model_len"] = self.max_model_len
-        return ModelConfig(**model_config_kwargs)
 
     def validate_tensorizer_args(self):
         from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -1250,8 +1250,6 @@ def create_engine_config(
         self.model = model_config.model
         self.tokenizer = model_config.tokenizer
 
-        # After ModelConfig init, tokenizer must be resolved (never None).
-        assert self.tokenizer is not None
         (self.model, self.tokenizer, self.speculative_config) = (
             maybe_override_with_speculators(
                 model=self.model,
@@ -1313,7 +1311,8 @@ def create_engine_config(
             f"dcp_size={self.decode_context_parallel_size}."
         )
 
-        cache_kwargs: dict[str, Any] = dict(
+        cache_config = CacheConfig(
+            block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
@@ -1329,9 +1328,6 @@ def create_engine_config(
             mamba_cache_dtype=self.mamba_cache_dtype,
             mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
         )
-        if self.block_size is not None:
-            cache_kwargs["block_size"] = self.block_size
-        cache_config = CacheConfig(**cache_kwargs)
 
         ray_runtime_env = None
         if is_ray_initialized():
@@ -1500,8 +1496,10 @@ def create_engine_config(
         if speculative_config is not None:
             num_lookahead_slots = speculative_config.num_lookahead_slots
 
-        scheduler_kwargs: dict[str, Any] = dict(
+        scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
             cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
@@ -1518,11 +1516,6 @@ def create_engine_config(
             disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
             async_scheduling=self.async_scheduling,
         )
-        if self.max_num_batched_tokens is not None:
-            scheduler_kwargs["max_num_batched_tokens"] = self.max_num_batched_tokens
-        if self.max_num_seqs is not None:
-            scheduler_kwargs["max_num_seqs"] = self.max_num_seqs
-        scheduler_config = SchedulerConfig(**scheduler_kwargs)
 
         if not model_config.is_multimodal_model and self.default_mm_loras:
             raise ValueError(
@@ -1560,9 +1553,7 @@ def create_engine_config(
         # Forward the deprecated CLI args to the StructuredOutputsConfig
         so_config = self.structured_outputs_config
         if self.guided_decoding_backend is not None:
-            so_config.backend = cast(
-                StructuredOutputsBackend, self.guided_decoding_backend
-            )
+            so_config.backend = self.guided_decoding_backend
         if self.guided_decoding_disable_fallback is not None:
             so_config.disable_fallback = self.guided_decoding_disable_fallback
         if self.guided_decoding_disable_any_whitespace is not None:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index d1e11755d749..05fdde3d9542 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -51,7 +51,6 @@ def __init__(self, labelnames: list[str], vllm_config: VllmConfig):
 
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
-        assert vllm_config.observability_config is not None
         self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
 
         # System stats
@@ -452,11 +451,6 @@ class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
         super().__init__(local_interval, vllm_config)
-
-        self.num_prompt_tokens: list[int]
-        self.num_generation_tokens: list[int]
-        self.last_local_log: float
-        self.local_interval: float
         self.last_prompt_throughput: Optional[float] = None
         self.last_generation_throughput: Optional[float] = None
 
@@ -519,8 +513,8 @@ def log(self, stats: Stats) -> None:
 
     def _reset(self, stats, prompt_throughput, generation_throughput) -> None:
         # Reset tracked stats for next interval.
-        self.num_prompt_tokens = []
-        self.num_generation_tokens = []
+        self.num_prompt_tokens: list[int] = []
+        self.num_generation_tokens: list[int] = []
         self.last_local_log = stats.now
         self.last_prompt_throughput = prompt_throughput
         self.last_generation_throughput = generation_throughput
@@ -539,11 +533,6 @@ def __init__(
         self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig
     ) -> None:
         super().__init__(local_interval, vllm_config)
-
-        self.num_prompt_tokens: list[int]
-        self.num_generation_tokens: list[int]
-        self.last_local_log: float
-        self.local_interval: float
         # Prometheus metrics
         self.labels = labels
         self.metrics = self._metrics_cls(
@@ -671,9 +660,9 @@ def log(self, stats: Stats):
         # Log locally every local_interval seconds.
         if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval):
             # Reset tracked stats for next interval.
-            self.num_prompt_tokens = []
-            self.num_generation_tokens = []
-            self.last_local_log = stats.now
+            self.num_prompt_tokens: list[int] = []
+            self.num_generation_tokens: list[int] = []
+            self.last_local_log: float = stats.now
 
     def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         # Info type metrics are syntactic sugar for a gauge permanently set to 1
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 314f9ee6c08f..1691572de0ac 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -81,7 +81,7 @@
 import setproctitle
 import torch
 import torch.types
-import yaml  # type: ignore[import-untyped]
+import yaml
 import zmq
 import zmq.asyncio
 from packaging import version
@@ -484,11 +484,8 @@ async def merge_async_iterators(
             yield 0, item
         return
 
-    loop = asyncio.get_running_loop()
-
     awaits: dict[asyncio.Task[T], tuple[int, AsyncGenerator[T, None]]] = {
-        loop.create_task(anext(it)): (i, it)  # type: ignore[arg-type]
-        for i, it in enumerate(iterators)
+        asyncio.ensure_future(anext(it)): (i, it) for i, it in enumerate(iterators)
     }
     try:
         while awaits:
@@ -498,7 +495,7 @@ async def merge_async_iterators(
                 try:
                     item = await d
                     i, it = pair
-                    awaits[loop.create_task(anext(it))] = pair  # type: ignore[arg-type]
+                    awaits[asyncio.ensure_future(anext(it))] = pair
                     yield i, item
                 except StopAsyncIteration:
                     pass
@@ -1166,13 +1163,11 @@ def find_nccl_include_paths() -> list[str] | None:
         import importlib.util
 
         spec = importlib.util.find_spec("nvidia.nccl")
-        if spec:
-            locations = getattr(spec, "submodule_search_locations", None)
-            if locations:
-                for loc in locations:
-                    inc_dir = os.path.join(loc, "include")
-                    if os.path.exists(os.path.join(inc_dir, "nccl.h")):
-                        paths.append(inc_dir)
+        if spec is not None and spec.submodule_search_locations is not None:
+            for loc in spec.submodule_search_locations:
+                inc_dir = os.path.join(loc, "include")
+                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
+                    paths.append(inc_dir)
     except Exception:
         pass
 
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py
index 045c547dd7be..4c833d9a0125 100644
--- a/vllm/utils/jsontree.py
+++ b/vllm/utils/jsontree.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable
 from functools import reduce
-from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union, overload
+from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload
 
 if TYPE_CHECKING:
     import torch
@@ -83,7 +83,7 @@ def json_map_leaves(
 ) -> JSONTree[_U]: ...
 
 
-def json_map_leaves(
+def json_map_leaves(  # type: ignore[misc]
     func: Callable[[_T], _U],
     value: Union["BatchedTensorInputs", _JSONTree[_T]],
 ) -> Union["BatchedTensorInputs", _JSONTree[_U]]:
@@ -143,18 +143,18 @@ def json_reduce_leaves(
 
 
 def json_reduce_leaves(
-    func: Callable[..., Any],
-    value: _JSONTree[Any],
-    initial: Any = ...,  # noqa: B008
+    func: Callable[..., Union[_T, _U]],
+    value: _JSONTree[_T],
+    initial: _U = cast(_U, ...),  # type: ignore  # noqa
     /,
-) -> Any:
+) -> Union[_T, _U]:
     """
     Apply a function of two arguments cumulatively to each leaf in a
     nested JSON structure, from left to right, so as to reduce the
     sequence to a single value.
     """
     if initial is ...:
-        return reduce(func, json_iter_leaves(value))  # type: ignore[arg-type]
+        return reduce(func, json_iter_leaves(value))  # type: ignore
 
     return reduce(
         func,  # type: ignore[arg-type]

From fe9e0f58e05c2c351d35456dc3460539671aad11 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:28:42 +0100
Subject: [PATCH 10/21] Don't use unsupported block size in test

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/v1/core/test_scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index dfa965c56766..a220f852d76d 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1023,7 +1023,7 @@ def test_kv_connector_unable_to_allocate():
     """
 
     # Setup Scheduler With Mock External Cache Hit.
-    BLOCK_SIZE = 4
+    BLOCK_SIZE = 8
     NUM_BLOCKS = 10
     scheduler = create_scheduler(
         enable_prefix_caching=True,

From bcab4c3ace9e5dc0579d860c26532e286cf7e504 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Oct 2025 15:31:32 +0100
Subject: [PATCH 11/21] Use `get_field` for fields which now use `Field`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/engine/arg_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3c8be6dcf84f..f42ef30249aa 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -335,7 +335,7 @@ class EngineArgs:
 
     model: str = ModelConfig.model
     served_model_name: Optional[Union[str, list[str]]] = ModelConfig.served_model_name
-    tokenizer: Optional[str] = ModelConfig.tokenizer
+    tokenizer: Optional[str] = get_field(ModelConfig, "tokenizer")
     hf_config_path: Optional[str] = ModelConfig.hf_config_path
     runner: RunnerOption = ModelConfig.runner
     convert: ConvertOption = ModelConfig.convert
@@ -353,7 +353,7 @@ class EngineArgs:
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
     seed: Optional[int] = ModelConfig.seed
-    max_model_len: int = ModelConfig.max_model_len
+    max_model_len: int = get_field(ModelConfig, "max_model_len")
     cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes")
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
@@ -394,7 +394,7 @@ class EngineArgs:
     max_parallel_loading_workers: Optional[int] = (
         ParallelConfig.max_parallel_loading_workers
     )
-    block_size: BlockSize = CacheConfig.block_size
+    block_size: BlockSize = get_field(CacheConfig, "block_size")
     enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
@@ -405,11 +405,11 @@ class EngineArgs:
     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
     kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
-    max_num_batched_tokens: int = SchedulerConfig.max_num_batched_tokens
+    max_num_batched_tokens: int = get_field(SchedulerConfig, "max_num_batched_tokens")
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
     long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
-    max_num_seqs: int = SchedulerConfig.max_num_seqs
+    max_num_seqs: int = get_field(SchedulerConfig, "max_num_seqs")
     max_logprobs: int = ModelConfig.max_logprobs
     logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
     disable_log_stats: bool = False

From 6fb333d9f5e33f29d91b7cbb8eef945c990a815e Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Oct 2025 16:01:12 +0100
Subject: [PATCH 12/21] Update other invalid block choice

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/v1/core/test_scheduler.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index a220f852d76d..744acc3beb94 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1022,9 +1022,9 @@ def test_kv_connector_unable_to_allocate():
     unable to allocate (run out of blocks in allocate_slots().
     """
 
-    # Setup Scheduler With Mock External Cache Hit.
+    # Setup Scheduler With Mock External Cache Hit. (2 blocks, 2 null)
     BLOCK_SIZE = 8
-    NUM_BLOCKS = 10
+    NUM_BLOCKS = 18
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
@@ -1104,10 +1104,9 @@ def test_kv_connector_handles_preemption():
     unable to allocate (run out of blocks in allocate_slots().
     """
 
-    # Setup Scheduler With Mock External Cache Hit.
-    BLOCK_SIZE = 2
-    # NOTE: there is 1 null block, so this is 6 blocks.
-    NUM_BLOCKS = 7
+    # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null)
+    BLOCK_SIZE = 8
+    NUM_BLOCKS = 49
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,

From 2a9189515fc2eb1f42988a6f9037ba2153bba74f Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 18:44:01 +0000
Subject: [PATCH 13/21] fix test_scheduler

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 tests/v1/core/test_scheduler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 744acc3beb94..1f874f4eb206 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1106,7 +1106,7 @@ def test_kv_connector_handles_preemption():
 
     # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null)
     BLOCK_SIZE = 8
-    NUM_BLOCKS = 49
+    NUM_BLOCKS = 7
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
@@ -1125,8 +1125,8 @@ def test_kv_connector_handles_preemption():
     # Both can be scheduled at first, but the second request
     # will be preempted and re-scheduled.
     NUM_REQUESTS = 2
-    NUM_TOKENS = BLOCK_SIZE * 2 + 1
-    MAX_TOKENS = BLOCK_SIZE * 2
+    NUM_TOKENS = 3 * BLOCK_SIZE - 1
+    MAX_TOKENS = 4
     requests = create_requests(
         num_requests=NUM_REQUESTS,
         num_tokens=NUM_TOKENS,

From 044d190a6ba166dba49df5237d2a4b317e0ca270 Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Fri, 10 Oct 2025 19:34:06 +0000
Subject: [PATCH 14/21] fix test_scheduler: restore original NUM_BLOCKS and setup comments

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 tests/v1/core/test_scheduler.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 1f874f4eb206..1bf56ac3803e 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1022,9 +1022,9 @@ def test_kv_connector_unable_to_allocate():
     unable to allocate (run out of blocks in allocate_slots().
     """
 
-    # Setup Scheduler With Mock External Cache Hit. (2 blocks, 2 null)
+    # Setup Scheduler With Mock External Cache Hit.
     BLOCK_SIZE = 8
-    NUM_BLOCKS = 18
+    NUM_BLOCKS = 10
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
@@ -1104,8 +1104,9 @@ def test_kv_connector_handles_preemption():
     unable to allocate (run out of blocks in allocate_slots().
     """
 
-    # Setup Scheduler With Mock External Cache Hit. (6 blocks, 1 null)
+    # Setup Scheduler With Mock External Cache Hit.
     BLOCK_SIZE = 8
+    # NOTE: there is 1 null block, so this is 6 blocks.
     NUM_BLOCKS = 7
     scheduler = create_scheduler(
         enable_prefix_caching=True,

From 51d898018f033ccbc0da919e5c4b825b73b1d94d Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Thu, 16 Oct 2025 03:07:11 +0000
Subject: [PATCH 15/21] fix pre-commit

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/config/cache.py     |  2 +-
 vllm/config/model.py     |  2 +-
 vllm/config/parallel.py  | 20 +++++++++-----------
 vllm/config/scheduler.py |  4 ++--
 vllm/engine/arg_utils.py | 11 +++++++----
 5 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index f35117a747ef..6d15e79534f8 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -30,7 +30,7 @@
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize = Field(default=None)
+    block_size: BlockSize | None = Field(default=None)
     """Size of a contiguous cache block in number of tokens. On CUDA devices,
     only block sizes up to 32 are supported.
 
diff --git a/vllm/config/model.py b/vllm/config/model.py
index a9f538e1627c..8c5eb7168b5f 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -171,7 +171,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = Field(default=None)
+    max_model_len: int | None = Field(default=None)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 944a1e8666f4..155b4c528c8c 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -32,6 +32,14 @@
 ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
 DataParallelBackend = Literal["ray", "mp"]
+All2allBackendType = Literal[
+    "naive",
+    "pplx",
+    "deepep_high_throughput",
+    "deepep_low_latency",
+    "allgather_reducescatter",
+    "flashinfer_all2allv",
+]
 
 
 @config
@@ -113,17 +121,7 @@ class ParallelConfig:
       with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
       will have experts [1, 3]. This strategy can help improve load balancing
       for grouped expert models with no redundant experts."""
-    all2all_backend: (
-        Literal[
-            "naive",
-            "pplx",
-            "deepep_high_throughput",
-            "deepep_low_latency",
-            "allgather_reducescatter",
-            "flashinfer_all2allv",
-        ]
-        | None
-    ) = None
+    all2all_backend: All2allBackendType | None = None
     """All2All backend for MoE expert parallel communication. If not set, uses
     the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
     - "naive": Naive all2all implementation using broadcasts
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 7aa0af67c928..ba309b267596 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -31,13 +31,13 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: int = Field(default=None)
+    max_num_batched_tokens: int | None = Field(default=None)
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: int = Field(default=None)
+    max_num_seqs: int | None = Field(default=None)
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 84823146d348..c88e4f2cfd32 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -55,6 +55,7 @@
 )
 from vllm.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo
 from vllm.config.device import Device
+from vllm.config.lora import LoRAExtraVocabSize, MaxLoRARanks
 from vllm.config.model import (
     ConvertOption,
     HfOverrides,
@@ -67,11 +68,13 @@
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
 from vllm.config.parallel import (
+    All2allBackendType,
     DataParallelBackend,
     DistributedExecutorBackend,
     ExpertPlacementStrategy,
 )
 from vllm.config.scheduler import SchedulerPolicy
+from vllm.config.structured_outputs import StructuredOutputsBackend
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
@@ -375,7 +378,7 @@ class EngineArgs:
     data_parallel_hybrid_lb: bool = False
     data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
-    all2all_backend: str | None = ParallelConfig.all2all_backend
+    all2all_backend: All2allBackendType | None = ParallelConfig.all2all_backend
     enable_dbo: bool = ParallelConfig.enable_dbo
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
     dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
@@ -449,12 +452,12 @@ class EngineArgs:
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
-    max_lora_rank: int = LoRAConfig.max_lora_rank
+    max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
     default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
     max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
     lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
-    lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
+    lora_extra_vocab_size: LoRAExtraVocabSize = LoRAConfig.lora_extra_vocab_size
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
     num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
@@ -474,7 +477,7 @@ class EngineArgs:
     )
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
     # Deprecated guided decoding fields
-    guided_decoding_backend: str | None = None
+    guided_decoding_backend: StructuredOutputsBackend | None = None
     guided_decoding_disable_fallback: bool | None = None
     guided_decoding_disable_any_whitespace: bool | None = None
     guided_decoding_disable_additional_properties: bool | None = None

From ec7c358af57e5c2680808c80174d1357b306659d Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Thu, 16 Oct 2025 03:42:03 +0000
Subject: [PATCH 16/21] Drop `| None` from config fields that default to `None`

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/config/cache.py     | 2 +-
 vllm/config/model.py     | 2 +-
 vllm/config/scheduler.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 6d15e79534f8..f35117a747ef 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -30,7 +30,7 @@
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize | None = Field(default=None)
+    block_size: BlockSize = Field(default=None)
     """Size of a contiguous cache block in number of tokens. On CUDA devices,
     only block sizes up to 32 are supported.
 
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 8c5eb7168b5f..a9f538e1627c 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -171,7 +171,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int | None = Field(default=None)
+    max_model_len: int = Field(default=None)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index ba309b267596..7aa0af67c928 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -31,13 +31,13 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: int | None = Field(default=None)
+    max_num_batched_tokens: int = Field(default=None)
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: int | None = Field(default=None)
+    max_num_seqs: int = Field(default=None)
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will

From 5f7cef9e35ed90e959bcbb96cc537c6e42999d2b Mon Sep 17 00:00:00 2001
From: wwl2755 <wangwenlong2755@gmail.com>
Date: Thu, 16 Oct 2025 04:36:55 +0000
Subject: [PATCH 17/21] Silence mypy errors in `arg_utils` and `async_utils`

Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
---
 vllm/engine/arg_utils.py  | 12 ++++++------
 vllm/utils/async_utils.py |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c88e4f2cfd32..ee183772a3ec 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1137,7 +1137,7 @@ def create_model_config(self) -> ModelConfig:
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
-            max_model_len=self.max_model_len,
+            max_model_len=self.max_model_len,  # type: ignore[arg-type]
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
             max_logprobs=self.max_logprobs,
@@ -1325,7 +1325,7 @@ def create_engine_config(
         )
 
         cache_config = CacheConfig(
-            block_size=self.block_size,
+            block_size=self.block_size,  # type: ignore[arg-type]
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
@@ -1512,9 +1512,9 @@ def create_engine_config(
 
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
-            max_num_batched_tokens=self.max_num_batched_tokens,
-            max_num_seqs=self.max_num_seqs,
-            max_model_len=model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_batched_tokens,  # type: ignore[arg-type]
+            max_num_seqs=self.max_num_seqs,  # type: ignore[arg-type]
+            max_model_len=model_config.max_model_len,  # type: ignore[arg-type]
             cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
             enable_chunked_prefill=self.enable_chunked_prefill,
@@ -1616,7 +1616,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
             return False
 
         # No Mamba or Encoder-Decoder so far.
-        if not model_config.is_v1_compatible:
+        if not getattr(model_config, "is_v1_compatible", True):
             _raise_or_fallback(
                 feature_name=str(model_config.architectures), recommend_to_remove=False
             )
diff --git a/vllm/utils/async_utils.py b/vllm/utils/async_utils.py
index aeabd808add5..4f0f36859de9 100644
--- a/vllm/utils/async_utils.py
+++ b/vllm/utils/async_utils.py
@@ -270,7 +270,7 @@ async def merge_async_iterators(
 
     loop = asyncio.get_running_loop()
 
-    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}
+    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}  # type: ignore[var-annotated, arg-type]
     try:
         while awaits:
             done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED)
@@ -279,7 +279,7 @@ async def merge_async_iterators(
                 try:
                     item = await d
                     i, it = pair
-                    awaits[loop.create_task(anext(it))] = pair
+                    awaits[loop.create_task(anext(it))] = pair  # type: ignore[arg-type]
                     yield i, item
                 except StopAsyncIteration:
                     pass

From 41eaf97d7944d96e2a061212f532242d005890a3 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 17 Oct 2025 11:48:36 +0200
Subject: [PATCH 18/21] Don't use `pydantic.Field` for `get_field`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 6fcc29d3d253..5e7e7580c5a9 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -6,12 +6,12 @@
 import inspect
 import textwrap
 from collections.abc import Iterable
-from dataclasses import MISSING, fields, is_dataclass, replace
+from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar
 
 import regex as re
-from pydantic.fields import Field, FieldInfo
+from pydantic.fields import FieldInfo
 from typing_extensions import runtime_checkable
 
 if TYPE_CHECKING:
@@ -47,17 +47,17 @@ def get_field(cls: ConfigType, name: str) -> Field:
     cls_fields = {f.name: f for f in fields(cls)}
     if name not in cls_fields:
         raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
-    named_field = cls_fields[name]
+    named_field: Field = cls_fields[name]
     if (default_factory := named_field.default_factory) is not MISSING:
-        return Field(default_factory=default_factory)
+        return field(default_factory=default_factory)
     if (default := named_field.default) is not MISSING:
         if isinstance(default, FieldInfo):
             # Handle pydantic.Field defaults
             if default.default_factory is not None:
-                return Field(default_factory=default.default_factory)
+                return field(default_factory=default.default_factory)
             else:
                 default = default.default
-        return Field(default=default)
+        return field(default=default)
 
     raise ValueError(
         f"{cls.__name__}.{name} must have a default value or default factory."

From 3a4a61624882150e3b3126c07d57f218933552fa Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 17 Oct 2025 11:55:23 +0200
Subject: [PATCH 19/21] Use wrap validator to only skip validation when `None`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/cache.py     |  8 ++++++++
 vllm/config/model.py     | 15 +++++++++++----
 vllm/engine/arg_utils.py |  6 +-----
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index f35117a747ef..6a28bdb753ab 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
+from collections.abc import Callable
 from dataclasses import field
 from typing import TYPE_CHECKING, Any, Literal
 
@@ -150,6 +151,13 @@ def metrics_info(self):
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
 
+    @field_validator("block_size", mode="wrap")
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        if value is None:
+            return value
+        return handler(value)
+
     @field_validator("cache_dtype", mode="after")
     @classmethod
     def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
diff --git a/vllm/config/model.py b/vllm/config/model.py
index a9f538e1627c..4c7bf99c26b4 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -10,7 +10,7 @@
 from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 
 import torch
-from pydantic import ConfigDict, Field, SkipValidation, field_validator, model_validator
+from pydantic import ConfigDict, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 
@@ -120,7 +120,7 @@ class ModelConfig:
 
     Note that the model may support other tasks using the same model runner.
     """
-    tokenizer: str = Field(default=None)
+    tokenizer: str = None
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode = "auto"
@@ -171,7 +171,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = Field(default=None)
+    max_model_len: int = None
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
@@ -182,7 +182,7 @@ class ModelConfig:
     - 25.6k -> 25,600"""
     spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
-    quantization: SkipValidation[QuantizationMethods | None] = None
+    quantization: str | QuantizationMethods | None = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
@@ -302,6 +302,13 @@ class ModelConfig:
     skip_mm_profiling: InitVar[bool | None] = None
     video_pruning_rate: InitVar[float | None] = None
 
+    @field_validator("tokenizer", "max_model_len", mode="wrap")
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        if value is None:
+            return value
+        return handler(value)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ee183772a3ec..7ce47c581a50 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1114,16 +1114,13 @@ def create_model_config(self) -> ModelConfig:
 
             self.mm_encoder_tp_mode = "data"
 
-        kwargs = dict[str, Any]()
-        if self.tokenizer is not None:
-            kwargs["tokenizer"] = self.tokenizer
-
         return ModelConfig(
             model=self.model,
             hf_config_path=self.hf_config_path,
             runner=self.runner,
             convert=self.convert,
             task=self.task,
+            tokenizer=self.tokenizer,  # type: ignore[arg-type]
             tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
             allowed_local_media_path=self.allowed_local_media_path,
@@ -1168,7 +1165,6 @@ def create_model_config(self) -> ModelConfig:
             logits_processors=self.logits_processors,
             video_pruning_rate=self.video_pruning_rate,
             io_processor_plugin=self.io_processor_plugin,
-            **kwargs,
         )
 
     def validate_tensorizer_args(self):

From 18d254b76e982c1d70b5b4a67e59639704802d55 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 17 Oct 2025 12:04:27 +0200
Subject: [PATCH 20/21] pre-commit

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/utils.py     | 2 +-
 vllm/engine/arg_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 5e7e7580c5a9..3452315b3e2e 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -39,7 +39,7 @@ def config(cls: ConfigT) -> ConfigT:
     return cls
 
 
-def get_field(cls: ConfigType, name: str) -> Field:
+def get_field(cls: ConfigType, name: str) -> Any:
     """Get the default factory field of a dataclass by name. Used for getting
     default factory fields in `EngineArgs`."""
     if not is_dataclass(cls):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7ce47c581a50..2575665e1ad2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -426,7 +426,7 @@ class EngineArgs:
     hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: str | None = ModelConfig.tokenizer_revision
-    quantization: QuantizationMethods | None = ModelConfig.quantization
+    quantization: str | QuantizationMethods | None = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
     limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(

From f23315d92bd0654a7a3a4ab259ce903ae1830516 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 21 Oct 2025 11:41:34 +0200
Subject: [PATCH 21/21] Skip some more `None` validation

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/cache.py     |  2 +-
 vllm/config/scheduler.py | 24 +++++++++++++++++++-----
 vllm/config/vllm.py      |  2 +-
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 6a28bdb753ab..41537b56707e 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -31,7 +31,7 @@
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize = Field(default=None)
+    block_size: BlockSize = None
     """Size of a contiguous cache block in number of tokens. On CUDA devices,
     only block sizes up to 32 are supported.
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 7aa0af67c928..f4015f43de8b 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -2,10 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
+from collections.abc import Callable
 from dataclasses import InitVar, field
 from typing import Any, Literal
 
-from pydantic import Field, SkipValidation, model_validator
+from pydantic import field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
@@ -31,19 +32,19 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: int = Field(default=None)
+    max_num_batched_tokens: int = None
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: int = Field(default=None)
+    max_num_seqs: int = None
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_model_len: int = Field(default=None)
+    max_model_len: int = None
     """Maximum length of a sequence (including prompt and generated text). This
     is primarily set in `ModelConfig` and that value should be manually
     duplicated here."""
@@ -79,7 +80,7 @@ class SchedulerConfig:
     3. more than one value (e.g. 1 2 128) is provided, then the capture list
     will follow the provided list."""
 
-    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
+    enable_chunked_prefill: bool = None
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
 
@@ -169,6 +170,19 @@ def compute_hash(self) -> str:
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
+    @field_validator(
+        "max_num_batched_tokens",
+        "max_num_seqs",
+        "max_model_len",
+        "enable_chunked_prefill",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self, is_encoder_decoder: bool) -> None:
         if self.max_model_len is None:
             self.max_model_len = 8192
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index da7ec7032969..cd31fade025b 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -57,7 +57,7 @@ class VllmConfig:
 
     # TODO: use default_factory once default constructing ModelConfig doesn't
     # try to download a model
-    model_config: ModelConfig = Field(default=None)
+    model_config: ModelConfig = None
     """Model configuration."""
     cache_config: CacheConfig = Field(default_factory=CacheConfig)
     """Cache configuration."""