Skip to content
Merged
28 changes: 28 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1322,6 +1322,34 @@ def test_scheduler_config_init():
print(SchedulerConfig.default_factory().max_model_len)


def test_long_prefill_token_threshold_rejects_negative():
    """`long_prefill_token_threshold` accepts 0 but refuses negative values.

    Zero means "disabled" and must keep constructing cleanly. A negative
    value would be silently ineffective downstream, because the scheduler
    clamp is guarded by `0 < threshold`, so it is rejected at admission
    instead. See https://github.com/vllm-project/vllm/issues/43985
    """
    base_kwargs = {"max_model_len": 4096, "is_encoder_decoder": False}

    # Boundary value: constructing with 0 must not raise.
    SchedulerConfig(long_prefill_token_threshold=0, **base_kwargs)

    # Anything below zero has to fail validation.
    with pytest.raises(ValidationError):
        SchedulerConfig(long_prefill_token_threshold=-5, **base_kwargs)


def test_max_logprobs_rejects_negative_other_than_minus_one():
    """`max_logprobs` accepts -1 and 0 but refuses any other negative value.

    -1 is the sentinel for "no cap, use vocab size". Other negatives are
    silently ineffective for logprob-free traffic and otherwise produce a
    confusing "max allowed: <negative>" message, so they are rejected at
    admission. See https://github.com/vllm-project/vllm/issues/43985
    """
    model_name = "facebook/opt-125m"

    # The sentinel and zero must both construct without raising.
    for accepted in (-1, 0):
        ModelConfig(model_name, max_logprobs=accepted)

    # A negative value below the sentinel has to fail validation.
    with pytest.raises(ValidationError):
        ModelConfig(model_name, max_logprobs=-5)


Comment thread
jwzheng96 marked this conversation as resolved.
Outdated
@pytest.mark.parametrize(
(
"model_id",
Expand Down
2 changes: 1 addition & 1 deletion vllm/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ class ModelConfig:
flexibility."""
enable_return_routed_experts: bool = False
"""Whether to return routed experts."""
max_logprobs: int = 20
max_logprobs: int = Field(default=20, ge=-1)
"""Maximum number of log probabilities to return when `logprobs` is
specified in `SamplingParams`. The default value comes from the default for the
OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length *
Expand Down
4 changes: 2 additions & 2 deletions vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ class SchedulerConfig:
this less than max_num_partial_prefills will allow shorter prompts to jump
the queue in front of longer prompts in some cases, improving latency."""

long_prefill_token_threshold: int = 0
long_prefill_token_threshold: int = Field(default=0, ge=0)
"""For chunked prefill, a request is considered long if the prompt is
longer than this number of tokens."""
longer than this number of tokens. 0 disables the cap (default)."""

enable_chunked_prefill: bool = True
"""If True, prefill requests can be chunked based
Expand Down
Loading