vllm-project · hmellor · Jun 30, 2026 · May 29, 2026 · May 30, 2026 · May 30, 2026
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -1322,6 +1322,34 @@ def test_scheduler_config_init():
         print(SchedulerConfig.default_factory().max_model_len)
 
 
+def test_long_prefill_token_threshold_rejects_negative():
+    # 0 is the "disabled" value and must stay valid. Negatives slip past the
+    # scheduler's `0 < threshold` guard unnoticed, so reject them up front;
+    # -1 is the boundary. https://github.com/vllm-project/vllm/issues/43985
+    SchedulerConfig(
+        max_model_len=4096,
+        is_encoder_decoder=False,
+        long_prefill_token_threshold=0,
+    )
+    with pytest.raises(ValidationError):
+        SchedulerConfig(
+            max_model_len=4096,
+            is_encoder_decoder=False,
+            long_prefill_token_threshold=-1,
+        )
+
+
+def test_max_logprobs_rejects_negative_other_than_minus_one():
+    # -1 is the sentinel for "no cap, use vocab size" and 0 is a valid cap.
+    # Values below -1 are silently ineffective for logprob-free traffic and
+    # otherwise yield a confusing "max allowed: <negative>" message, so they
+    # are rejected. See https://github.com/vllm-project/vllm/issues/43985
+    ModelConfig("facebook/opt-125m", max_logprobs=-1)
+    ModelConfig("facebook/opt-125m", max_logprobs=0)
+    with pytest.raises(ValidationError):
+        ModelConfig("facebook/opt-125m", max_logprobs=-2)  # first invalid value
+
+
 @pytest.mark.parametrize(
     (
         "model_id",

diff --git a/vllm/config/model.py b/vllm/config/model.py
@@ -231,7 +231,7 @@ class ModelConfig:
     flexibility."""
     enable_return_routed_experts: bool = False
     """Whether to return routed experts."""
-    max_logprobs: int = 20
+    max_logprobs: int = Field(default=20, ge=-1)
     """Maximum number of log probabilities to return when `logprobs` is
     specified in `SamplingParams`. The default value comes the default for the
     OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length *

diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
@@ -77,9 +77,9 @@ class SchedulerConfig:
     this less than max_num_partial_prefills will allow shorter prompts to jump
     the queue in front of longer prompts in some cases, improving latency."""
 
-    long_prefill_token_threshold: int = 0
+    long_prefill_token_threshold: int = Field(default=0, ge=0)
     """For chunked prefill, a request is considered long if the prompt is
-    longer than this number of tokens."""
+    longer than this number of tokens. 0 disables the cap (default)."""
 
     enable_chunked_prefill: bool = True
     """If True, prefill requests can be chunked based