vllm-project · DarkLight1337 · Mar 5, 2026 · Jan 7, 2026 · DarkLight1337 · Feb 27, 2026
@@ -6,7 +6,6 @@
 
 import pytest
 
-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing
 
 
@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
     serving._raise_if_error(None, "test-request-id")  # should not raise
 
 
-@pytest.mark.asyncio
-async def test_convert_generation_error_to_response():
-    """test _convert_generation_error_to_response creates proper ErrorResponse"""
-    mock_engine = MagicMock()
-    mock_engine.model_config = MagicMock()
-    mock_engine.model_config.max_model_len = 100
-    mock_models = MagicMock()
-
-    serving = OpenAIServing(
-        engine_client=mock_engine,
-        models=mock_models,
-        request_logger=None,
-    )
-
-    # create a GenerationError
-    gen_error = GenerationError("Internal server error")
-
-    # convert to ErrorResponse
-    error_response = serving._convert_generation_error_to_response(gen_error)
-
-    assert isinstance(error_response, ErrorResponse)
-    assert error_response.error.type == "InternalServerError"
-    assert error_response.error.message == "Internal server error"
-    assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
-
-
 @pytest.mark.asyncio
 async def test_convert_generation_error_to_streaming_response():
     """test _convert_generation_error_to_streaming_response output"""

@@ -13,7 +13,7 @@
 import pytest
 import pytest_asyncio
 import requests
-from openai import BadRequestError, NotFoundError, OpenAI
+from openai import InternalServerError, NotFoundError, OpenAI
 from openai_harmony import Message
 
 from ....utils import RemoteOpenAIServer
@@ -698,7 +698,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
 async def test_function_calling_required(client: OpenAI, model_name: str):
     tools = [GET_WEATHER_SCHEMA]
 
-    with pytest.raises(BadRequestError):
+    with pytest.raises(InternalServerError):
         await client.responses.create(
             model=model_name,
             input="What's the weather like in Paris today?",

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
-from http import HTTPStatus
 from typing import Any
 from unittest.mock import AsyncMock, MagicMock, patch
 
@@ -11,7 +10,7 @@
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -145,12 +144,8 @@ async def mock_generate(*args, **kwargs):
         stream=False,
     )
 
-    response = await serving_chat.create_chat_completion(request)
-
-    assert isinstance(response, ErrorResponse)
-    assert response.error.type == "InternalServerError"
-    assert response.error.message == "Internal server error"
-    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
+    with pytest.raises(GenerationError):
+        await serving_chat.create_chat_completion(request)
 
 
 @pytest.mark.asyncio

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
-from http import HTTPStatus
 from typing import Any
 from unittest.mock import MagicMock
 
@@ -11,7 +10,7 @@
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.completion.protocol import CompletionRequest
 from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -131,12 +130,8 @@ async def mock_generate(*args, **kwargs):
         stream=False,
     )
 
-    response = await serving_completion.create_completion(request)
-
-    assert isinstance(response, ErrorResponse)
-    assert response.error.type == "InternalServerError"
-    assert response.error.message == "Internal server error"
-    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
+    with pytest.raises(GenerationError):
+        await serving_completion.create_completion(request)
 
 
 @pytest.mark.asyncio

@@ -1,12 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
+from http import HTTPStatus
 from typing import Final
 
 import pytest
 import schemathesis
+from httpx import URL
 from hypothesis import settings
 from schemathesis import GenerationConfig
+from schemathesis.checks import not_a_server_error
+from schemathesis.internal.checks import CheckContext
+from schemathesis.models import Case
+from schemathesis.transports.responses import GenericResponse
 
 from ...utils import RemoteOpenAIServer
 
@@ -127,10 +133,25 @@ def no_invalid_types(case: schemathesis.models.Case):
     return strategy.filter(no_invalid_types)
 
 
+def customized_not_a_server_error(
+    ctx: CheckContext, response: GenericResponse, case: Case
+) -> bool | None:
+    try:
+        return not_a_server_error(ctx, response, case)
+    except Exception:
+        if (
+            URL(response.request.url).path
+            in ["/v1/chat/completions/render", "/v1/chat/completions"]
+            and response.status_code == HTTPStatus.NOT_IMPLEMENTED.value
+        ):
+            return True
+        raise
+
+
 @schema.parametrize()
 @schema.override(headers={"Content-Type": "application/json"})
 @settings(deadline=LONG_TIMEOUT_SECONDS * 1000, max_examples=50)
-def test_openapi_stateless(case: schemathesis.Case):
+def test_openapi_stateless(case: Case):
     key = (
         case.operation.method.upper(),
         case.operation.path,
@@ -155,4 +176,9 @@ def test_openapi_stateless(case: schemathesis.Case):
     }.get(key, DEFAULT_TIMEOUT_SECONDS)
 
     # No need to verify SSL certificate for localhost
-    case.call_and_validate(verify=False, timeout=timeout)
+    case.call_and_validate(
+        verify=False,
+        timeout=timeout,
+        additional_checks=(customized_not_a_server_error,),
+        excluded_checks=(not_a_server_error,),
+    )
@@ -23,6 +23,7 @@
 )
 from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
 from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import TokensPrompt
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.renderers.hf import HfRenderer
@@ -818,9 +819,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
         max_tokens=10,
     )
 
-    resp = await serving_chat.create_chat_completion(req)
-    assert isinstance(resp, ErrorResponse)
-    assert "context length is only" in resp.error.message
+    with pytest.raises(VLLMValidationError):
+        await serving_chat.create_chat_completion(req)
 
 
 @pytest.mark.asyncio
@@ -860,9 +860,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
         max_tokens=1,
     )
 
-    resp = await serving_chat.create_chat_completion(req)
-    assert isinstance(resp, ErrorResponse)
-    assert "context length is only" in resp.error.message
+    with pytest.raises(VLLMValidationError):
+        await serving_chat.create_chat_completion(req)
 
 
 @pytest.mark.asyncio

diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
@@ -17,9 +17,6 @@
     ChatCompletionResponse,
 )
 from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
-from vllm.entrypoints.openai.engine.protocol import (
-    ErrorResponse,
-)
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.inputs import PromptType
@@ -542,11 +539,9 @@ async def test_header_dp_rank_argument():
         # Test 2: Out-of-range DP rank (1)
         mock_raw_request.headers = {"X-data-parallel-rank": "1"}
 
-        # should return ErrorResponse for out-of-range rank
-        response2 = await serving_chat.create_chat_completion(req, mock_raw_request)
-        assert isinstance(response2, ErrorResponse), (
-            "Expected an ErrorResponse for out-of-range DP rank"
-        )
+        # should raise ValueError for out-of-range rank
+        with pytest.raises(ValueError):
+            await serving_chat.create_chat_completion(req, mock_raw_request)
 
 
 @pytest.mark.asyncio

@@ -4,11 +4,10 @@
 import asyncio
 import signal
 import socket
-from http import HTTPStatus
 from typing import Any
 
 import uvicorn
-from fastapi import FastAPI, Request, Response
+from fastapi import FastAPI
 
 from vllm import envs
 from vllm.engine.protocol import EngineClient
@@ -19,7 +18,6 @@
 from vllm.entrypoints.ssl import SSLCertRefresher
 from vllm.logger import init_logger
 from vllm.utils.network_utils import find_process_using_port
-from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
 logger = init_logger(__name__)
 
@@ -75,7 +73,7 @@ async def serve_http(
     config.h11_max_header_count = h11_max_header_count
     config.load()
     server = uvicorn.Server(config)
-    _add_shutdown_handlers(app, server)
+    app.state.server = server
 
     loop = asyncio.get_running_loop()
 
@@ -148,40 +146,3 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient):
     engine_errored = engine.errored and not engine.is_running
     if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored:
         server.should_exit = True
-
-
-def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
-    """
-    VLLM V1 AsyncLLM catches exceptions and returns
-    only two types: EngineGenerateError and EngineDeadError.
-
-    EngineGenerateError is raised by the per request generate()
-    method. This error could be request specific (and therefore
-    recoverable - e.g. if there is an error in input processing).
-
-    EngineDeadError is raised by the background output_handler
-    method. This error is global and therefore not recoverable.
-
-    We register these @app.exception_handlers to return nice
-    responses to the end user if they occur and shut down if needed.
-    See https://fastapi.tiangolo.com/tutorial/handling-errors/
-    for more details on how exception handlers work.
-
-    If an exception is encountered in a StreamingResponse
-    generator, the exception is not raised, since we already sent
-    a 200 status. Rather, we send an error message as the next chunk.
-    Since the exception is not raised, this means that the server
-    will not automatically shut down. Instead, we use the watchdog
-    background task for check for errored state.
-    """
-
-    @app.exception_handler(RuntimeError)
-    @app.exception_handler(EngineDeadError)
-    @app.exception_handler(EngineGenerateError)
-    async def runtime_exception_handler(request: Request, __):
-        terminate_if_errored(
-            server=server,
-            engine=request.app.state.engine_client,
-        )
-
-        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
@@ -31,6 +31,8 @@
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.server_utils import (
+    engine_error_handler,
+    exception_handler,
     get_uvicorn_log_config,
     http_exception_handler,
     lifespan,
@@ -57,6 +59,7 @@
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs, set_ulimit
+from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 from vllm.version import __version__ as VLLM_VERSION
 
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
@@ -250,6 +253,9 @@ def build_app(
 
     app.exception_handler(HTTPException)(http_exception_handler)
     app.exception_handler(RequestValidationError)(validation_exception_handler)
+    app.exception_handler(EngineGenerateError)(engine_error_handler)
+    app.exception_handler(EngineDeadError)(engine_error_handler)
+    app.exception_handler(Exception)(exception_handler)
 
     # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
     if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
@@ -355,7 +361,6 @@ async def init_app_state(
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
         trust_request_chat_template=args.trust_request_chat_template,
-        log_error_stack=args.log_error_stack,
     )
 
     if any(task in supported_tasks for task in ("generate", "render")):

@@ -39,6 +39,7 @@ def chat(request: Request) -> OpenAIServingChat | None:
         HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
         HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
         HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
     },
 )
 @with_cancellation
@@ -54,10 +55,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
             message="The model does not support Chat Completions API"
         )
 
-    try:
-        generator = await handler.create_chat_completion(request, raw_request)
-    except Exception as e:
-        generator = handler.create_error_response(e)
+    generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
@@ -81,6 +79,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
         HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
         HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
         HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
     },
 )
 async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
@@ -93,10 +92,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
             message="The model does not support Chat Completions API"
         )
 
-    try:
-        result = await handler.render_chat_request(request)
-    except Exception as e:
-        result = handler.create_error_response(e)
+    result = await handler.render_chat_request(request)
 
     if isinstance(result, ErrorResponse):
         return JSONResponse(content=result.model_dump(), status_code=result.error.code)