Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
94b48b2
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Dec 11, 2025
1d239c6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Dec 11, 2025
d7d19f3
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
21dc976
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
87cbcd6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
17aed45
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 6, 2026
486df75
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
febe213
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
a4bebba
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Feb 24, 2026
2eb0d20
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
52690a4
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
c98a7a6
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
ff399e2
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
3f93d1f
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
142be8d
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
05128a9
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
15a6ad1
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 5, 2026
c3f323b
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
64c1c89
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
676dfa4
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
635f374
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
b668a6b
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 6, 2026
9a5417d
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
af9991f
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
1e1c8f5
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 13, 2026
a982302
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
9af4a95
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
4739e05
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
2b75e67
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 16, 2026
87822e7
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 17, 2026
b8d567a
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 17, 2026
6fc8450
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 18, 2026
5bbe4ec
[Core][Feat] Add max-waiting-queue-length parameter to reject request…
chaunceyjiang Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import GenerationError
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import GenerationError
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
Expand Down
42 changes: 42 additions & 0 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4140,3 +4140,45 @@ def test_eagle3_mm_encoder_cache_with_shift():
f"shifted_end={scheduled_end_with_shift}) overlapping MM at "
f"{start_pos}. The fix must schedule encoder inputs."
)


def test_max_waiting_queue_length():
    """Verify the scheduler rejects new requests once the waiting queue is full."""
    # Scheduler capped at 5 waiting requests.
    scheduler = create_scheduler(max_waiting_queue_length=5)

    # Fill the queue to capacity; every admission should succeed.
    for req in create_requests(num_requests=5):
        scheduler.add_request(req)

    assert len(scheduler.waiting) == 5
    assert len(scheduler.rejected) == 0

    # A sixth request must be turned away.
    (overflow_req,) = create_requests(num_requests=1, req_ids=["extra"])
    scheduler.add_request(overflow_req)

    # Queue length is unchanged; the overflow request lands in `rejected`.
    assert len(scheduler.waiting) == 5
    assert len(scheduler.rejected) == 1
    rejected_req = scheduler.rejected[0]
    assert rejected_req.request_id == "extra"
    assert rejected_req.status == RequestStatus.FINISHED_REJECTED

    # Scheduling drains only the 5 admitted requests; the rejected one is
    # surfaced later via update_from_output, not in finished_req_ids here.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 5
    assert "extra" not in output.finished_req_ids


def test_max_waiting_queue_length_default():
    """Test that the default max_waiting_queue_length admits typical loads.

    The scheduler is created WITHOUT overriding max_waiting_queue_length so
    the default (4096) is actually exercised; 100 requests must all be
    admitted with no rejections.
    """
    # Do not pass max_waiting_queue_length explicitly — the point of this
    # test is the default value, not a caller-supplied one.
    scheduler = create_scheduler()
    requests = create_requests(num_requests=100)
    for request in requests:
        scheduler.add_request(request)

    assert len(scheduler.waiting) == 100
    assert len(scheduler.rejected) == 0
2 changes: 2 additions & 0 deletions tests/v1/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def create_scheduler(
pipeline_parallel_size: int = 1,
use_ec_connector: bool = False,
ec_role: str | None = None,
max_waiting_queue_length: int = 4096,
) -> Scheduler | AsyncScheduler:
"""Create scheduler under test.

Expand Down Expand Up @@ -89,6 +90,7 @@ def create_scheduler(
enable_chunked_prefill=enable_chunked_prefill,
async_scheduling=async_scheduling,
is_encoder_decoder=model_config.is_encoder_decoder,
max_waiting_queue_length=max_waiting_queue_length,
)
# Cache config, optionally force APC
cache_config = CacheConfig(
Expand Down
11 changes: 10 additions & 1 deletion vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class SchedulerConfig:

DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

DEFAULT_MAX_WAITING_QUEUE_LEN: ClassVar[int] = 4096
runner_type: RunnerType = "generate"
"""The runner type to launch for the model."""

Expand All @@ -66,6 +66,15 @@ class SchedulerConfig:
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""

max_waiting_queue_length: int = Field(default=DEFAULT_MAX_WAITING_QUEUE_LEN, ge=1)
"""
The maximum number of requests allowed in the waiting queue.
If the waiting queue is full, new incoming requests will be rejected.

[Experimental] This parameter is an experimental feature and
may change or be removed in future releases.
"""

max_num_partial_prefills: int = Field(default=1, ge=1)
"""For chunked prefill, the maximum number of sequences that can be
partially prefilled concurrently."""
Expand Down
7 changes: 7 additions & 0 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,9 @@ class EngineArgs:
max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
max_num_seqs: int | None = None
max_waiting_queue_length: int = get_field(
SchedulerConfig, "max_waiting_queue_length"
)
max_logprobs: int = ModelConfig.max_logprobs
logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
disable_log_stats: bool = False
Expand Down Expand Up @@ -1196,6 +1199,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--max-long-partial-prefills",
**scheduler_kwargs["max_long_partial_prefills"],
)
scheduler_group.add_argument(
"--max-waiting-queue-length", **scheduler_kwargs["max_waiting_queue_length"]
)
scheduler_group.add_argument(
"--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"],
Expand Down Expand Up @@ -1782,6 +1788,7 @@ def create_engine_config(
runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_waiting_queue_length=self.max_waiting_queue_length,
max_model_len=model_config.max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill,
disable_chunked_mm_input=self.disable_chunked_mm_input,
Expand Down
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.server_utils import (
Expand All @@ -54,6 +53,7 @@
log_version_and_model,
process_lora_modules,
)
from vllm.exceptions import GenerationError
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import POOLING_TASKS, SupportedTask
Expand Down
9 changes: 0 additions & 9 deletions vllm/entrypoints/openai/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from http import HTTPStatus
from typing import Any, ClassVar, Literal, TypeAlias

import regex as re
Expand Down Expand Up @@ -260,11 +259,3 @@ class DeltaMessage(OpenAIBaseModel):
content: str | None = None
reasoning: str | None = None
tool_calls: list[DeltaToolCall] = Field(default_factory=list)


class GenerationError(Exception):
"""raised when finish_reason indicates internal server error (500)"""

def __init__(self, message: str = "Internal server error"):
super().__init__(message)
self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
32 changes: 20 additions & 12 deletions vllm/entrypoints/openai/engine/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
ErrorResponse,
FunctionCall,
FunctionDefinition,
GenerationError,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.responses.protocol import (
Expand Down Expand Up @@ -71,7 +70,7 @@
TokenizeResponse,
)
from vllm.entrypoints.utils import create_error_response
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import GenerationError, RequestRejectedError, VLLMValidationError
from vllm.inputs.data import (
ProcessorInputs,
PromptType,
Expand Down Expand Up @@ -575,23 +574,32 @@ def create_streaming_error_response(
return json_str

def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None:
    """
    Raise appropriate exception if finish_reason indicates an error or rejection.

    Raises:
        GenerationError: when generation failed with an internal error (500).
        RequestRejectedError: when the request was rejected, e.g. because the
            scheduler's waiting queue was full (503).
    """
    if finish_reason == "error":
        logger.error(
            "Request %s failed with an internal error during generation",
            request_id,
        )
        raise GenerationError("Internal server error")
    if finish_reason == "rejected":
        # Rejections can arrive in bursts when the queue is saturated, so
        # rate-limit the warning instead of logging every occurrence.
        logger.warning_every_n(
            "Request %s was rejected due to "
            "a full waiting queue (log every 100 requests)",
            request_id,
        )
        raise RequestRejectedError()

def _convert_generation_error_to_streaming_response(self, e: Exception) -> str:
    """
    Convert GenerationError or RequestRejectedError to streaming error response.

    GenerationError (and subclasses such as RequestRejectedError) carry their
    own err_type and status_code, which are propagated to the response; any
    other exception falls back to the default streaming error response.
    """
    if isinstance(e, GenerationError):
        return self.create_streaming_error_response(
            str(e), err_type=e.err_type, status_code=e.status_code
        )
    return self.create_streaming_error_response(str(e))

async def _check_model(
self,
Expand Down
3 changes: 1 addition & 2 deletions vllm/entrypoints/openai/server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
)
from vllm.entrypoints.utils import create_error_response, sanitize_message
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import GenerationError, VLLMValidationError
from vllm.logger import init_logger
from vllm.utils.gc_utils import freeze_gc_heap
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
Expand Down
2 changes: 1 addition & 1 deletion vllm/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
StreamOptions,
)
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.exceptions import GenerationError
from vllm.logger import current_formatter_type, init_logger
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
Expand Down
27 changes: 27 additions & 0 deletions vllm/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Custom exceptions for vLLM."""

from http import HTTPStatus
from typing import Any


Expand Down Expand Up @@ -64,3 +65,29 @@ def __init__(

def __str__(self):
return self.message


class GenerationError(Exception):
    """Error surfaced when generation finishes with an error finish_reason.

    Carries the error type and HTTP status code used when translating the
    failure into an OpenAI-style error response (500 by default).
    """

    def __init__(
        self,
        message: str = "Internal server error",
        err_type: str = "InternalServerError",
        status_code: HTTPStatus = HTTPStatus.INTERNAL_SERVER_ERROR,
    ):
        # Keep the message on the base Exception so str(e) behaves as usual.
        super().__init__(message)
        # Metadata consumed by the error-response converters.
        self.err_type, self.status_code = err_type, status_code


class RequestRejectedError(GenerationError):
    """Raised when finish_reason indicates the request was rejected
    (e.g., queue full, rate-limited, etc)."""

    def __init__(self):
        # Rejection maps to HTTP 503 so clients know to back off and retry.
        super().__init__(
            "Request was rejected",
            "ServiceUnavailableError",
            HTTPStatus.SERVICE_UNAVAILABLE,
        )
37 changes: 35 additions & 2 deletions vllm/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import logging
import os
import sys
from collections.abc import Generator, Hashable
from collections import defaultdict
from collections.abc import Callable, Generator, Hashable
from contextlib import contextmanager
from functools import lru_cache, partial
from logging import Logger
from logging.config import dictConfig
from os import path
from threading import Lock
from types import MethodType
from typing import Any, Literal, cast

Expand Down Expand Up @@ -92,6 +94,24 @@ def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:

LogScope = Literal["process", "global", "local"]

_warning_every_n_counters: defaultdict[tuple[LogScope, int, str], int] = defaultdict(
int
)
_warning_every_n_lock: Lock = Lock()


def _print_warning_every_n(
logger: Logger, msg: str, *args: Hashable, n: int = 100, scope: LogScope = "process"
) -> None:
key = (scope, id(logger), msg)

with _warning_every_n_lock:
_warning_every_n_counters[key] += 1
count = _warning_every_n_counters[key]

if count == 1 or count % n == 0:
logger.warning(msg, *args)


def _should_log_with_scope(scope: LogScope) -> bool:
"""Decide whether to log based on scope"""
Expand Down Expand Up @@ -147,12 +167,25 @@ def warning_once(
return
_print_warning_once(self, msg, *args)

def warning_every_n(
    self, msg: str, *args: Hashable, n: int = 100, scope: LogScope = "process"
) -> None:
    """
    As `warning`, but only logs the first call and then once every `n`
    calls for the same message.

    Args:
        msg: Log format string; also part of the rate-limiting key.
        *args: Lazy %-style arguments for the format string.
        n: Emit the warning once per `n` calls (plus the very first call).
        scope: Scope gating where the log is emitted; also keys the counter.
    """

    if not _should_log_with_scope(scope):
        return

    # Forward the scope so calls with different scopes keep separate
    # rate-limit counters instead of colliding on the default "process" key.
    _print_warning_every_n(self, msg, *args, n=n, scope=scope)


# Pre-defined methods mapping to avoid repeated dictionary creation.
# These unbound _VllmLogger methods are patched onto plain Logger instances.
_METHODS_TO_PATCH: dict[str, Callable[..., Any]] = {
    "debug_once": _VllmLogger.debug_once,
    "info_once": _VllmLogger.info_once,
    "warning_once": _VllmLogger.warning_once,
    "warning_every_n": _VllmLogger.warning_every_n,
}


Expand Down
Loading
Loading