Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 0 additions & 27 deletions tests/entrypoints/openai/responses/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import pytest

from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing


Expand Down Expand Up @@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
serving._raise_if_error(None, "test-request-id") # should not raise


@pytest.mark.asyncio
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you make an e2e version of this test?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still need some time to add an e2e test for this case. Can we postpone it and merge this PR first? I have tested this manually.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ask claude or codex to add it?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ask claude or codex to add it?

Will add this test in the next PR.

async def test_convert_generation_error_to_response():
    """A GenerationError becomes an InternalServerError ErrorResponse."""
    # The engine is a stand-in; the test only configures max_model_len on it.
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100

    handler = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    # Build the error inline and convert it in one step.
    converted = handler._convert_generation_error_to_response(
        GenerationError("Internal server error")
    )

    assert isinstance(converted, ErrorResponse)
    payload = converted.error
    assert payload.type == "InternalServerError"
    assert payload.message == "Internal server error"
    assert payload.code == HTTPStatus.INTERNAL_SERVER_ERROR


@pytest.mark.asyncio
async def test_convert_generation_error_to_streaming_response():
"""test _convert_generation_error_to_streaming_response output"""
Expand Down
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/responses/test_harmony.py
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change from BadRequestError to InternalServerError is needed because NotImplementedError inherits from RuntimeError. RuntimeError has been removed from create_error_response, and thus NotImplementedError will result in a 501 HTTP response status code.

Image

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pytest
import pytest_asyncio
import requests
from openai import BadRequestError, NotFoundError, OpenAI
from openai import InternalServerError, NotFoundError, OpenAI
from openai_harmony import Message

from ....utils import RemoteOpenAIServer
Expand Down Expand Up @@ -698,7 +698,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
async def test_function_calling_required(client: OpenAI, model_name: str):
tools = [GET_WEATHER_SCHEMA]

with pytest.raises(BadRequestError):
with pytest.raises(InternalServerError):
await client.responses.create(
model=model_name,
input="What's the weather like in Paris today?",
Expand Down
11 changes: 3 additions & 8 deletions tests/entrypoints/openai/test_chat_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

Expand All @@ -11,7 +10,7 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
Expand Down Expand Up @@ -145,12 +144,8 @@ async def mock_generate(*args, **kwargs):
stream=False,
)

response = await serving_chat.create_chat_completion(request)

assert isinstance(response, ErrorResponse)
assert response.error.type == "InternalServerError"
assert response.error.message == "Internal server error"
assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
with pytest.raises(GenerationError):
await serving_chat.create_chat_completion(request)


@pytest.mark.asyncio
Expand Down
11 changes: 3 additions & 8 deletions tests/entrypoints/openai/test_completion_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import MagicMock

Expand All @@ -11,7 +10,7 @@
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
Expand Down Expand Up @@ -131,12 +130,8 @@ async def mock_generate(*args, **kwargs):
stream=False,
)

response = await serving_completion.create_completion(request)

assert isinstance(response, ErrorResponse)
assert response.error.type == "InternalServerError"
assert response.error.message == "Internal server error"
assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
with pytest.raises(GenerationError):
await serving_completion.create_completion(request)


@pytest.mark.asyncio
Expand Down
30 changes: 28 additions & 2 deletions tests/entrypoints/openai/test_openai_schema.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from http import HTTPStatus
from typing import Final

import pytest
import schemathesis
from httpx import URL
from hypothesis import settings
from schemathesis import GenerationConfig
from schemathesis.checks import not_a_server_error
from schemathesis.internal.checks import CheckContext
from schemathesis.models import Case
from schemathesis.transports.responses import GenericResponse

from ...utils import RemoteOpenAIServer

Expand Down Expand Up @@ -127,10 +133,25 @@ def no_invalid_types(case: schemathesis.models.Case):
return strategy.filter(no_invalid_types)


def customized_not_a_server_error(
Copy link
Copy Markdown
Contributor Author

@andyxning andyxning Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New changes. By default, any server error causes an exception or failure in schemathesis. We need to customize this by allowing the /v1/chat/completions/render and /v1/chat/completions OpenAPI endpoints to return a 501 status code.

ctx: CheckContext, response: GenericResponse, case: Case
) -> bool | None:
try:
return not_a_server_error(ctx, response, case)
except Exception:
if (
URL(response.request.url).path
in ["/v1/chat/completions/render", "/v1/chat/completions"]
and response.status_code == HTTPStatus.NOT_IMPLEMENTED.value
):
return True
raise


@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000, max_examples=50)
def test_openapi_stateless(case: schemathesis.Case):
def test_openapi_stateless(case: Case):
key = (
case.operation.method.upper(),
case.operation.path,
Expand All @@ -155,4 +176,9 @@ def test_openapi_stateless(case: schemathesis.Case):
}.get(key, DEFAULT_TIMEOUT_SECONDS)

# No need to verify SSL certificate for localhost
case.call_and_validate(verify=False, timeout=timeout)
case.call_and_validate(
verify=False,
timeout=timeout,
additional_checks=(customized_not_a_server_error,),
excluded_checks=(not_a_server_error,),
)
11 changes: 5 additions & 6 deletions tests/entrypoints/openai/test_serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.exceptions import VLLMValidationError
from vllm.inputs import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
Expand Down Expand Up @@ -818,9 +819,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
max_tokens=10,
)

resp = await serving_chat.create_chat_completion(req)
assert isinstance(resp, ErrorResponse)
assert "context length is only" in resp.error.message
with pytest.raises(VLLMValidationError):
await serving_chat.create_chat_completion(req)


@pytest.mark.asyncio
Expand Down Expand Up @@ -860,9 +860,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
max_tokens=1,
)

resp = await serving_chat.create_chat_completion(req)
assert isinstance(resp, ErrorResponse)
assert "context length is only" in resp.error.message
with pytest.raises(VLLMValidationError):
await serving_chat.create_chat_completion(req)


@pytest.mark.asyncio
Expand Down
11 changes: 3 additions & 8 deletions tests/v1/engine/test_async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@
ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.inputs import PromptType
Expand Down Expand Up @@ -542,11 +539,9 @@ async def test_header_dp_rank_argument():
# Test 2: Out-of-range DP rank (1)
mock_raw_request.headers = {"X-data-parallel-rank": "1"}

# should return ErrorResponse for out-of-range rank
response2 = await serving_chat.create_chat_completion(req, mock_raw_request)
assert isinstance(response2, ErrorResponse), (
"Expected an ErrorResponse for out-of-range DP rank"
)
# should raise ValueError for out-of-range rank
with pytest.raises(ValueError):
await serving_chat.create_chat_completion(req, mock_raw_request)


@pytest.mark.asyncio
Expand Down
43 changes: 2 additions & 41 deletions vllm/entrypoints/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
import asyncio
import signal
import socket
from http import HTTPStatus
from typing import Any

import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi import FastAPI

from vllm import envs
from vllm.engine.protocol import EngineClient
Expand All @@ -19,7 +18,6 @@
from vllm.entrypoints.ssl import SSLCertRefresher
from vllm.logger import init_logger
from vllm.utils.network_utils import find_process_using_port
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError

logger = init_logger(__name__)

Expand Down Expand Up @@ -75,7 +73,7 @@ async def serve_http(
config.h11_max_header_count = h11_max_header_count
config.load()
server = uvicorn.Server(config)
_add_shutdown_handlers(app, server)
app.state.server = server

loop = asyncio.get_running_loop()

Expand Down Expand Up @@ -148,40 +146,3 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient):
engine_errored = engine.errored and not engine.is_running
if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored:
server.should_exit = True


def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
    """
    Register exception handlers that answer with HTTP 500 and, when
    appropriate, ask the server to shut down.

    VLLM V1 AsyncLLM catches exceptions and surfaces only two types:

    - EngineGenerateError, raised by the per-request generate() method.
      It may be request specific (for example an error in input
      processing) and is therefore recoverable.
    - EngineDeadError, raised by the background output_handler method.
      It is global and therefore not recoverable.

    The same handler is also registered for RuntimeError. See
    https://fastapi.tiangolo.com/tutorial/handling-errors/ for details on
    how exception handlers work.

    An exception raised inside a StreamingResponse generator is not
    propagated, because a 200 status has already been sent; an error
    message is sent as the next chunk instead. Since nothing is raised,
    the server does not shut down automatically in that case, and the
    watchdog background task is used to check for the errored state.
    """

    async def runtime_exception_handler(request: Request, __):
        # Let the shared helper decide whether the server must exit.
        terminate_if_errored(
            server=server,
            engine=request.app.state.engine_client,
        )

        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

    # Same registration order as stacked decorators, which apply bottom-up.
    for exc_type in (EngineGenerateError, EngineDeadError, RuntimeError):
        app.exception_handler(exc_type)(runtime_exception_handler)
7 changes: 6 additions & 1 deletion vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.server_utils import (
engine_error_handler,
exception_handler,
get_uvicorn_log_config,
http_exception_handler,
lifespan,
Expand All @@ -57,6 +59,7 @@
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import is_valid_ipv6_address
from vllm.utils.system_utils import decorate_logs, set_ulimit
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
from vllm.version import __version__ as VLLM_VERSION

prometheus_multiproc_dir: tempfile.TemporaryDirectory
Expand Down Expand Up @@ -250,6 +253,9 @@ def build_app(

app.exception_handler(HTTPException)(http_exception_handler)
app.exception_handler(RequestValidationError)(validation_exception_handler)
app.exception_handler(EngineGenerateError)(engine_error_handler)
app.exception_handler(EngineDeadError)(engine_error_handler)
app.exception_handler(Exception)(exception_handler)

# Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
Expand Down Expand Up @@ -355,7 +361,6 @@ async def init_app_state(
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
trust_request_chat_template=args.trust_request_chat_template,
log_error_stack=args.log_error_stack,
)

if any(task in supported_tasks for task in ("generate", "render")):
Expand Down
12 changes: 4 additions & 8 deletions vllm/entrypoints/openai/chat_completion/api_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def chat(request: Request) -> OpenAIServingChat | None:
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not Implemented is added because NotImplementedError inherits from RuntimeError. RuntimeError has been removed from create_error_response, and thus NotImplementedError will result in a 501 HTTP response status code, which is NOT_IMPLEMENTED.

Image

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

},
)
@with_cancellation
Expand All @@ -54,10 +55,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
message="The model does not support Chat Completions API"
)

try:
generator = await handler.create_chat_completion(request, raw_request)
except Exception as e:
generator = handler.create_error_response(e)
generator = await handler.create_chat_completion(request, raw_request)

if isinstance(generator, ErrorResponse):
return JSONResponse(
Expand All @@ -81,6 +79,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not Implemented is added because NotImplementedError inherits from RuntimeError. RuntimeError has been removed from create_error_response, and thus NotImplementedError will result in a 501 HTTP response status code, which is NOT_IMPLEMENTED.

Image

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

},
)
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
Expand All @@ -93,10 +92,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
message="The model does not support Chat Completions API"
)

try:
result = await handler.render_chat_request(request)
except Exception as e:
result = handler.create_error_response(e)
result = await handler.render_chat_request(request)

if isinstance(result, ErrorResponse):
return JSONResponse(content=result.model_dump(), status_code=result.error.code)
Expand Down
Loading