Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
94b48b2
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Dec 11, 2025
1d239c6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Dec 11, 2025
d7d19f3
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
21dc976
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
87cbcd6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
17aed45
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
486df75
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
febe213
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
a4bebba
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
2eb0d20
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
52690a4
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
c98a7a6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
ff399e2
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
3f93d1f
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
142be8d
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
05128a9
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
15a6ad1
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
c3f323b
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
64c1c89
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
676dfa4
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
635f374
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
b668a6b
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
9a5417d
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
af9991f
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
1e1c8f5
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
a982302
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
9af4a95
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
4739e05
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
2b75e67
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
87822e7
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 17, 2026
b8d567a
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 17, 2026
6fc8450
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 18, 2026
5bbe4ec
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import GenerationError
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import GenerationError
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
Expand Down
42 changes: 42 additions & 0 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4140,3 +4140,45 @@ def test_eagle3_mm_encoder_cache_with_shift():
f"shifted_end={scheduled_end_with_shift}) overlapping MM at "
f"{start_pos}. The fix must schedule encoder inputs."
)


def test_max_waiting_queue_length():
    """Verify the scheduler rejects new requests once the waiting queue is full."""
    # Scheduler capped at 5 waiting requests.
    scheduler = create_scheduler(max_waiting_queue_length=5)

    # Fill the queue to capacity; every admission should succeed.
    for req in create_requests(num_requests=5):
        scheduler.add_request(req)

    assert len(scheduler.waiting) == 5
    assert len(scheduler.rejected) == 0

    # A sixth request must be turned away.
    (overflow_req,) = create_requests(num_requests=1, req_ids=["extra"])
    scheduler.add_request(overflow_req)

    # Queue length is unchanged; the overflow request lands in `rejected`.
    assert len(scheduler.waiting) == 5
    assert len(scheduler.rejected) == 1
    rejected_req = scheduler.rejected[0]
    assert rejected_req.request_id == "extra"
    assert rejected_req.status == RequestStatus.FINISHED_REJECTED

    # Scheduling drains only the 5 admitted requests; the rejected one is
    # surfaced later via update_from_output, not in finished_req_ids here.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 5
    assert "extra" not in output.finished_req_ids


def test_max_waiting_queue_length_default():
    """Test that the default max_waiting_queue_length admits typical loads.

    The scheduler is created WITHOUT overriding max_waiting_queue_length so
    the default (4096) is actually exercised; 100 requests must all be
    admitted with no rejections.
    """
    # Do not pass max_waiting_queue_length explicitly — the point of this
    # test is the default value, not a caller-supplied one.
    scheduler = create_scheduler()
    requests = create_requests(num_requests=100)
    for request in requests:
        scheduler.add_request(request)

    assert len(scheduler.waiting) == 100
    assert len(scheduler.rejected) == 0
2 changes: 2 additions & 0 deletions tests/v1/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def create_scheduler(
pipeline_parallel_size: int = 1,
use_ec_connector: bool = False,
ec_role: str | None = None,
max_waiting_queue_length: int = 4096,
) -> Scheduler | AsyncScheduler:
"""Create scheduler under test.

Expand Down Expand Up @@ -89,6 +90,7 @@ def create_scheduler(
enable_chunked_prefill=enable_chunked_prefill,
async_scheduling=async_scheduling,
is_encoder_decoder=model_config.is_encoder_decoder,
max_waiting_queue_length=max_waiting_queue_length,
)
# Cache config, optionally force APC
cache_config = CacheConfig(
Expand Down
11 changes: 10 additions & 1 deletion vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class SchedulerConfig:

DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

DEFAULT_MAX_WAITING_QUEUE_LEN: ClassVar[int] = 4096
runner_type: RunnerType = "generate"
"""The runner type to launch for the model."""

Expand All @@ -66,6 +66,15 @@ class SchedulerConfig:
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""

max_waiting_queue_length: int = Field(default=DEFAULT_MAX_WAITING_QUEUE_LEN, ge=1)
"""
The maximum number of requests allowed in the waiting queue.
If the waiting queue is full, new incoming requests will be rejected.

[Experimental] This parameter is an experimental feature and
may change or be removed in future releases.
"""

max_num_partial_prefills: int = Field(default=1, ge=1)
"""For chunked prefill, the maximum number of sequences that can be
partially prefilled concurrently."""
Expand Down
7 changes: 7 additions & 0 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,9 @@ class EngineArgs:
max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
max_num_seqs: int | None = None
max_waiting_queue_length: int = get_field(
SchedulerConfig, "max_waiting_queue_length"
)
max_logprobs: int = ModelConfig.max_logprobs
logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
disable_log_stats: bool = False
Expand Down Expand Up @@ -1196,6 +1199,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--max-long-partial-prefills",
**scheduler_kwargs["max_long_partial_prefills"],
)
scheduler_group.add_argument(
"--max-waiting-queue-length", **scheduler_kwargs["max_waiting_queue_length"]
)
scheduler_group.add_argument(
"--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"],
Expand Down Expand Up @@ -1782,6 +1788,7 @@ def create_engine_config(
runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_waiting_queue_length=self.max_waiting_queue_length,
max_model_len=model_config.max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill,
disable_chunked_mm_input=self.disable_chunked_mm_input,
Expand Down
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.server_utils import (
Expand All @@ -54,6 +53,7 @@
log_version_and_model,
process_lora_modules,
)
from vllm.exceptions import GenerationError
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import POOLING_TASKS, SupportedTask
Expand Down
9 changes: 0 additions & 9 deletions vllm/entrypoints/openai/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from http import HTTPStatus
from typing import Any, ClassVar, Literal, TypeAlias

import regex as re
Expand Down Expand Up @@ -260,11 +259,3 @@ class DeltaMessage(OpenAIBaseModel):
content: str | None = None
reasoning: str | None = None
tool_calls: list[DeltaToolCall] = Field(default_factory=list)


class GenerationError(Exception):
"""raised when finish_reason indicates internal server error (500)"""

def __init__(self, message: str = "Internal server error"):
super().__init__(message)
self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
32 changes: 20 additions & 12 deletions vllm/entrypoints/openai/engine/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
ErrorResponse,
FunctionCall,
FunctionDefinition,
GenerationError,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.responses.protocol import (
Expand Down Expand Up @@ -71,7 +70,7 @@
TokenizeResponse,
)
from vllm.entrypoints.utils import create_error_response
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import GenerationError, RequestRejectedError, VLLMValidationError
from vllm.inputs.data import (
ProcessorInputs,
PromptType,
Expand Down Expand Up @@ -575,23 +574,32 @@ def create_streaming_error_response(
return json_str

def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None:
    """
    Raise appropriate exception if finish_reason indicates an error or rejection.

    Raises:
        GenerationError: when generation failed with an internal error (500).
        RequestRejectedError: when the request was rejected, e.g. because the
            scheduler's waiting queue was full (503).
    """
    if finish_reason == "error":
        logger.error(
            "Request %s failed with an internal error during generation",
            request_id,
        )
        raise GenerationError("Internal server error")
    if finish_reason == "rejected":
        # Rejections can arrive in bursts when the queue is saturated, so
        # rate-limit the warning instead of logging every occurrence.
        logger.warning_every_n(
            "Request %s was rejected due to "
            "a full waiting queue (log every 100 requests)",
            request_id,
        )
        raise RequestRejectedError()

def _convert_generation_error_to_streaming_response(self, e: Exception) -> str:
    """
    Convert GenerationError or RequestRejectedError to streaming error response.

    GenerationError (and subclasses such as RequestRejectedError) carry their
    own err_type and status_code, which are propagated to the response; any
    other exception falls back to the default streaming error response.
    """
    if isinstance(e, GenerationError):
        return self.create_streaming_error_response(
            str(e), err_type=e.err_type, status_code=e.status_code
        )
    return self.create_streaming_error_response(str(e))

async def _check_model(
self,
Expand Down
3 changes: 1 addition & 2 deletions vllm/entrypoints/openai/server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
)
from vllm.entrypoints.utils import create_error_response, sanitize_message
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import GenerationError, VLLMValidationError
from vllm.logger import init_logger
from vllm.utils.gc_utils import freeze_gc_heap
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
Expand Down
2 changes: 1 addition & 1 deletion vllm/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
StreamOptions,
)
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.exceptions import GenerationError
from vllm.logger import current_formatter_type, init_logger
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
Expand Down
27 changes: 27 additions & 0 deletions vllm/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Custom exceptions for vLLM."""

from http import HTTPStatus
from typing import Any


Expand Down Expand Up @@ -64,3 +65,29 @@ def __init__(

def __str__(self):
return self.message


class GenerationError(Exception):
    """Error surfaced when generation finishes with an error finish_reason.

    Carries the error type and HTTP status code used when translating the
    failure into an OpenAI-style error response (500 by default).
    """

    def __init__(
        self,
        message: str = "Internal server error",
        err_type: str = "InternalServerError",
        status_code: HTTPStatus = HTTPStatus.INTERNAL_SERVER_ERROR,
    ):
        # Keep the message on the base Exception so str(e) behaves as usual.
        super().__init__(message)
        # Metadata consumed by the error-response converters.
        self.err_type, self.status_code = err_type, status_code


class RequestRejectedError(GenerationError):
    """Raised when finish_reason indicates the request was rejected
    (e.g., queue full, rate-limited, etc)."""

    def __init__(self):
        # Rejection maps to HTTP 503 so clients know to back off and retry.
        super().__init__(
            "Request was rejected",
            "ServiceUnavailableError",
            HTTPStatus.SERVICE_UNAVAILABLE,
        )
37 changes: 35 additions & 2 deletions vllm/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import logging
import os
import sys
from collections.abc import Generator, Hashable
from collections import defaultdict
from collections.abc import Callable, Generator, Hashable
from contextlib import contextmanager
from functools import lru_cache, partial
from logging import Logger
from logging.config import dictConfig
from os import path
from threading import Lock
from types import MethodType
from typing import Any, Literal, cast

Expand Down Expand Up @@ -92,6 +94,24 @@ def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:

LogScope = Literal["process", "global", "local"]

_warning_every_n_counters: defaultdict[tuple[LogScope, int, str], int] = defaultdict(
int
)
_warning_every_n_lock: Lock = Lock()


def _print_warning_every_n(
logger: Logger, msg: str, *args: Hashable, n: int = 100, scope: LogScope = "process"
) -> None:
key = (scope, id(logger), msg)

with _warning_every_n_lock:
_warning_every_n_counters[key] += 1
count = _warning_every_n_counters[key]

if count == 1 or count % n == 0:
logger.warning(msg, *args)


def _should_log_with_scope(scope: LogScope) -> bool:
"""Decide whether to log based on scope"""
Expand Down Expand Up @@ -147,12 +167,25 @@ def warning_once(
return
_print_warning_once(self, msg, *args)

def warning_every_n(
    self, msg: str, *args: Hashable, n: int = 100, scope: LogScope = "process"
) -> None:
    """
    As `warning`, but only logs the first call and then once every `n`
    calls for the same message.

    Args:
        msg: Log format string; also part of the rate-limiting key.
        *args: Lazy %-style arguments for the format string.
        n: Emit the warning once per `n` calls (plus the very first call).
        scope: Scope gating where the log is emitted; also keys the counter.
    """

    if not _should_log_with_scope(scope):
        return

    # Forward the scope so calls with different scopes keep separate
    # rate-limit counters instead of colliding on the default "process" key.
    _print_warning_every_n(self, msg, *args, n=n, scope=scope)


# Pre-defined methods mapping to avoid repeated dictionary creation.
# These unbound _VllmLogger methods are patched onto plain Logger instances.
_METHODS_TO_PATCH: dict[str, Callable[..., Any]] = {
    "debug_once": _VllmLogger.debug_once,
    "info_once": _VllmLogger.info_once,
    "warning_once": _VllmLogger.warning_once,
    "warning_every_n": _VllmLogger.warning_every_n,
}


Expand Down
Loading
Loading