diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index af47531501cf..b837b830e774 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from dataclasses import InitVar, field +from collections.abc import Callable +from dataclasses import InitVar from typing import Any, Literal -from pydantic import SkipValidation, model_validator +from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self @@ -31,28 +32,28 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: SkipValidation[int] = None # type: ignore + max_num_batched_tokens: int = Field(default=None, ge=1) """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: SkipValidation[int] = None # type: ignore + max_num_seqs: int = Field(default=None, ge=1) """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: SkipValidation[int] = None # type: ignore + max_model_len: int = Field(default=None, ge=1) """Maximum length of a sequence (including prompt and generated text). This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" - max_num_partial_prefills: int = 1 + max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.""" - max_long_partial_prefills: int = 1 + max_long_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. Setting this less than max_num_partial_prefills will allow shorter prompts to jump @@ -62,7 +63,7 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = 0 + num_lookahead_slots: int = Field(default=0, ge=0) """The number of slots to allocate per sequence per step, beyond the known token ids. This is used in speculative decoding to store KV activations of tokens which may or may not be @@ -71,7 +72,7 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + enable_chunked_prefill: bool = Field(default=None) """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -86,14 +87,14 @@ class SchedulerConfig: """ # TODO (ywang96): Make this configurable. - max_num_encoder_input_tokens: int = field(init=False) + max_num_encoder_input_tokens: int = Field(init=False) """Multimodal encoder compute budget, only used in V1. NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.""" # TODO (ywang96): Make this configurable. - encoder_cache_size: int = field(init=False) + encoder_cache_size: int = Field(init=False) """Multimodal encoder cache size, only used in V1. NOTE: This is not currently configurable. It will be overridden by @@ -106,7 +107,7 @@ class SchedulerConfig: - "priority" means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).""" - chunked_prefill_enabled: bool = field(init=False) + chunked_prefill_enabled: bool = Field(init=False) """True if chunked prefill is enabled.""" disable_chunked_mm_input: bool = False @@ -155,6 +156,20 @@ def compute_hash(self) -> str: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str + @field_validator( + "max_num_batched_tokens", + "max_num_seqs", + "max_model_len", + "enable_chunked_prefill", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self, is_encoder_decoder: bool) -> None: if self.max_model_len is None: self.max_model_len = 8192 @@ -260,19 +275,7 @@ def _verify_args(self) -> Self: self.max_num_seqs * self.max_model_len, ) - if self.num_lookahead_slots < 0: - raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0." - ) - - if self.max_num_partial_prefills < 1: - raise ValueError( - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " - "must be greater than or equal to 1." - ) - elif self.max_num_partial_prefills > 1: + if self.max_num_partial_prefills > 1: if not self.chunked_prefill_enabled: raise ValueError( "Chunked prefill must be enabled to set " @@ -286,13 +289,10 @@ def _verify_args(self) -> Self: f"than the max_model_len ({self.max_model_len})." ) - if (self.max_long_partial_prefills < 1) or ( - self.max_long_partial_prefills > self.max_num_partial_prefills - ): + if self.max_long_partial_prefills > self.max_num_partial_prefills: raise ValueError( - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " - "must be greater than or equal to 1 and less than or equal to " - f"max_num_partial_prefills ({self.max_num_partial_prefills})." + f"{self.max_long_partial_prefills=} must be less than or equal to " + f"{self.max_num_partial_prefills=}." ) return self diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py index 76b565006e28..85b6e42264a4 100644 --- a/vllm/config/structured_outputs.py +++ b/vllm/config/structured_outputs.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import Any, Literal +from typing import Any, Literal, Self +from pydantic import model_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -56,7 +57,8 @@ def compute_hash(self) -> str: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str - def __post_init__(self): + @model_validator(mode="after") + def _validate_structured_output_config(self) -> Self: if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"): raise ValueError( "disable_any_whitespace is only supported for " @@ -67,3 +69,4 @@ def __post_init__(self): "disable_additional_properties is only supported " "for the guidance backend." ) + return self diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b31e4931f229..e788d2d02a3a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1813,7 +1813,7 @@ def _set_default_args( incremental_prefill_supported = ( pooling_type is not None and pooling_type.lower() == "last" - and is_causal + and bool(is_causal) ) action = "Enabling" if incremental_prefill_supported else "Disabling" diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py index 06dd336bf9cf..d083ece892d5 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -import re import uuid from collections.abc import Sequence from typing import Any +import regex as re + from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall,