From aedff6c26233bcf969cc04606c412592f2eb9a93 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 14 Jan 2026 09:06:43 +0000
Subject: [PATCH 1/2] [Frontend] Standardize use of `create_error_response`

Signed-off-by: DarkLight1337
---
 vllm/entrypoints/openai/api_server.py            |  8 +-------
 .../openai/chat_completion/api_router.py         |  8 ++++----
 vllm/entrypoints/openai/engine/serving.py        | 11 ++++++++---
 vllm/entrypoints/openai/responses/api_router.py  | 14 ++++----------
 vllm/entrypoints/openai/serving_models.py        |  7 ++++++-
 vllm/entrypoints/pooling/classify/api_router.py  |  8 +++-----
 vllm/entrypoints/pooling/embed/api_router.py     |  6 ++----
 vllm/entrypoints/pooling/pooling/api_router.py   |  7 +++----
 vllm/entrypoints/pooling/score/api_router.py     | 12 +++++-------
 vllm/entrypoints/serve/disagg/api_router.py      |  5 ++---
 vllm/entrypoints/serve/tokenize/api_router.py    |  8 +-------
 11 files changed, 39 insertions(+), 55 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index acaf5f8d446c..58d950bedd2c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -388,14 +388,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
     try:
         generator = await handler.create_completion(request, raw_request)
-    except OverflowError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py
index e4010c5256a0..748250cbc9e3 100644
--- a/vllm/entrypoints/openai/chat_completion/api_router.py
+++ b/vllm/entrypoints/openai/chat_completion/api_router.py
@@ -4,7 +4,7 @@
 
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi import APIRouter, Depends, FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 
 from vllm.entrypoints.openai.chat_completion.protocol import (
@@ -53,12 +53,12 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
         return base_server.create_error_response(
             message="The model does not support Chat Completions API"
         )
+
     try:
         generator = await handler.create_chat_completion(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 045e2d0682c4..802fde3598c9 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -92,7 +92,7 @@
     TokenizeCompletionRequest,
     TokenizeResponse,
 )
-from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (
     PromptComponents,
@@ -766,11 +766,15 @@ def create_error_response(
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = exc.parameter
-        elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
+        elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
             # Common validation errors from user input
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = None
+        elif isinstance(exc, NotImplementedError):
+            err_type = "NotImplementedError"
+            status_code = HTTPStatus.NOT_IMPLEMENTED
+            param = None
         elif exc.__class__.__name__ == "TemplateError":
             # jinja2.TemplateError (avoid importing jinja2)
             err_type = "BadRequestError"
@@ -789,9 +793,10 @@
                 traceback.print_exc()
             else:
                 traceback.print_stack()
+
         return ErrorResponse(
             error=ErrorInfo(
-                message=message,
+                message=sanitize_message(message),
                 type=err_type,
                 code=status_code.value,
                 param=param,
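The serving.py hunks above are the heart of the change: every router now funnels exceptions through a single `create_error_response`, which picks the error type and HTTP status from the exception class. A standalone sketch of the mapping those branches imply (illustrative only; `map_exception` is not a vLLM function, and the real method also handles request-validation errors and builds the full `ErrorResponse`):

```python
from http import HTTPStatus


def map_exception(exc: Exception) -> tuple[str, HTTPStatus]:
    """Simplified mirror of the branches visible in create_error_response."""
    if isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
        # Common validation errors from user input -> 400
        return "BadRequestError", HTTPStatus.BAD_REQUEST
    if isinstance(exc, NotImplementedError):
        # Unsupported features -> 501
        return "NotImplementedError", HTTPStatus.NOT_IMPLEMENTED
    if exc.__class__.__name__ == "TemplateError":
        # jinja2.TemplateError, matched by name to avoid importing jinja2 -> 400
        return "BadRequestError", HTTPStatus.BAD_REQUEST
    # Anything unexpected stays an internal error -> 500
    return "InternalServerError", HTTPStatus.INTERNAL_SERVER_ERROR


if __name__ == "__main__":
    print(map_exception(OverflowError("prompt is longer than the model context")))
    print(map_exception(NotImplementedError("tokenization is not supported")))
```

This is also why the dedicated `except OverflowError` branch in api_server.py above and the `except NotImplementedError` branch in the tokenize router further below can be deleted without changing the status codes clients see.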
diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py
index 958cd3fec1a1..5eca91179dbc 100644
--- a/vllm/entrypoints/openai/responses/api_router.py
+++ b/vllm/entrypoints/openai/responses/api_router.py
@@ -5,7 +5,7 @@
 from collections.abc import AsyncGenerator
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi import APIRouter, Depends, FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
@@ -64,9 +64,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
     try:
         generator = await handler.create_responses(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
@@ -101,9 +99,7 @@ async def retrieve_responses(
             stream=stream,
         )
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(response, ErrorResponse):
         return JSONResponse(
@@ -128,9 +124,7 @@ async def cancel_responses(response_id: str, raw_request: Request):
     try:
         response = await handler.cancel_responses(response_id)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(response, ErrorResponse):
         return JSONResponse(
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 4fed2a8274e2..614a6fc32d19 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -18,6 +18,7 @@
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,
 )
+from vllm.entrypoints.utils import sanitize_message
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -302,5 +303,9 @@ def create_error_response(
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
     ) -> ErrorResponse:
         return ErrorResponse(
-            error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+            error=ErrorInfo(
+                message=sanitize_message(message),
+                type=err_type,
+                code=status_code.value,
+            )
         )
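serving_models.py keeps its own message-based `create_error_response` signature but now routes the text through the same `sanitize_message` helper, so both construction paths emit the same `ErrorInfo` envelope. Assuming the Pydantic models serialize field-for-field (their definitions are outside this patch), a rejected request carries a body shaped roughly like the following, with the embedded `code` reused by the routers as the HTTP status:

```python
from http import HTTPStatus

# Illustrative payload only; the message text is made up, and `param` is
# populated by the engine-side create_error_response, not by serving_models.
error_body = {
    "error": {
        "message": "input length exceeds the model context window",
        "type": "BadRequestError",
        "code": HTTPStatus.BAD_REQUEST.value,
        "param": None,
    }
}

print(error_body["error"]["code"])  # 400
```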
diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py
index 7bd170a9f144..f4afec7fe33a 100644
--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never
 
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
     try:
         generator = await handler.create_classify(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py
index d8e5cf64127e..50a4018857aa 100644
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
@@ -47,9 +47,7 @@ async def create_embedding(
     try:
         generator = await handler.create_embedding(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py
index 223d6e3b89be..bfff97daadb8 100644
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     try:
         generator = await handler.create_pooling(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
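The classify, embed, and pooling routers above (and the remaining routers below) all converge on the same shape: call the handler, let `create_error_response` translate anything raised, and serialize an `ErrorResponse` that the handler returned with its embedded status code. A generic sketch of that pattern outside vLLM (the route path, handler lookup, and `hasattr` check are placeholders, not vLLM APIs):

```python
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse

router = APIRouter()


@router.post("/v1/example")  # hypothetical route, for illustration only
async def create_example(raw_request: Request):
    handler = raw_request.app.state.example_handler  # placeholder handler lookup
    try:
        result = await handler.create_example(raw_request)
    except Exception as e:
        # One line replaces the per-route HTTPException boilerplate: the
        # handler itself maps the exception to an ErrorResponse.
        return handler.create_error_response(e)

    # Errors the handler chose to return (rather than raise) carry their own
    # HTTP status code inside the payload.
    if hasattr(result, "error"):  # stands in for isinstance(result, ErrorResponse)
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    return JSONResponse(content=result.model_dump())
```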
diff --git a/vllm/entrypoints/pooling/score/api_router.py b/vllm/entrypoints/pooling/score/api_router.py
index bd9b5c425b05..006403239656 100644
--- a/vllm/entrypoints/pooling/score/api_router.py
+++ b/vllm/entrypoints/pooling/score/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never
 
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
     try:
         generator = await handler.create_score(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
     try:
         generator = await handler.do_rerank(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py
index 6924dc83882f..0b1d1e50ab56 100644
--- a/vllm/entrypoints/serve/disagg/api_router.py
+++ b/vllm/entrypoints/serve/disagg/api_router.py
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
     try:
         generator = await handler.serve_tokens(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py
index ec486cf410d6..7b0b466abaf4 100644
--- a/vllm/entrypoints/serve/tokenize/api_router.py
+++ b/vllm/entrypoints/serve/tokenize/api_router.py
@@ -51,14 +51,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
 
     try:
         generator = await handler.create_tokenize(request, raw_request)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
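That closes out the first patch. For API clients the visible difference is the failure body: where these routes previously surfaced FastAPI's `{"detail": ...}` from a raised HTTPException (typically with status 500), they now return the OpenAI-style `{"error": ...}` envelope with the mapped status code. A quick way to observe it against a locally running server (the URL, model name, oversized prompt, and resulting message are all illustrative and depend on the deployment):

```python
import requests  # assumes a vLLM server is already running on localhost:8000

resp = requests.post(
    "http://localhost:8000/v1/completions",
    # Deliberately oversized prompt: the kind of input that used to surface
    # as OverflowError in api_server.py and should now map to a 400.
    json={"model": "my-model", "prompt": "word " * 1_000_000, "max_tokens": 16},
    timeout=60,
)
print(resp.status_code)
body = resp.json()
# Expected shape after this patch: {"error": {"message": ..., "type": ..., "code": ...}}
print(body.get("error", body))
```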
From 1ebeac31a78680760d8bb8bf8a1e7a9b8a8ec3d7 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 14 Jan 2026 09:37:03 +0000
Subject: [PATCH 2/2] Fix

Signed-off-by: DarkLight1337
---
 vllm/entrypoints/utils.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 0d6fea36fd8a..1134d49cf123 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -7,7 +7,7 @@
 import os
 from argparse import Namespace
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import regex as re
 from fastapi import Request
@@ -22,20 +22,27 @@
     resolve_hf_chat_template,
     resolve_mistral_chat_template,
 )
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.engine.protocol import (
-    CompletionRequest,
-    StreamOptions,
-)
-from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+    )
+    from vllm.entrypoints.openai.engine.protocol import (
+        CompletionRequest,
+        StreamOptions,
+    )
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+else:
+    ChatCompletionRequest = object
+    CompletionRequest = object
+    StreamOptions = object
+    LoRAModulePath = object
+
+
 logger = init_logger(__name__)
 
 VLLM_SUBCMD_PARSER_EPILOG = (
@@ -208,7 +215,7 @@ def _validate_truncation_size(
 
 def get_max_tokens(
     max_model_len: int,
-    request: ChatCompletionRequest | CompletionRequest,
+    request: "ChatCompletionRequest | CompletionRequest",
    input_length: int,
     default_sampling_params: dict,
 ) -> int:
@@ -229,6 +236,8 @@ def get_max_tokens(
 
 
 def log_non_default_args(args: Namespace | EngineArgs):
+    from vllm.entrypoints.openai.cli_args import make_arg_parser
+
     non_default_args = {}
 
     # Handle Namespace
@@ -257,7 +266,7 @@
 
 
 def should_include_usage(
-    stream_options: StreamOptions | None, enable_force_include_usage: bool
+    stream_options: "StreamOptions | None", enable_force_include_usage: bool
 ) -> tuple[bool, bool]:
     if stream_options:
         include_usage = stream_options.include_usage or enable_force_include_usage
@@ -272,6 +281,8 @@ def should_include_usage(
 def process_lora_modules(
     args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
 ) -> list[LoRAModulePath]:
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+
     lora_modules = args_lora_modules
     if default_mm_loras:
         default_mm_lora_paths = [
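The apparent motivation for this second patch is an import cycle: patch 1 makes engine/serving.py and serving_models.py import `sanitize_message` from `vllm/entrypoints/utils.py`, which itself imported those modules at the top level. The fix is the standard lazy-import pattern: the names are imported only under `typing.TYPE_CHECKING`, bound to `object` at runtime so module-level references still resolve, annotations become strings, and the two helpers that genuinely need the objects import them locally. A self-contained sketch of the same pattern with a stand-in module:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; skipped at runtime, which is what breaks
    # a circular-import chain.
    from decimal import Decimal  # stand-in for the heavy protocol imports
else:
    # Runtime placeholder so module-level references to the name still resolve.
    Decimal = object


def describe(value: "Decimal | None") -> str:  # string annotation, evaluated lazily
    # Deferred import at the point of use, mirroring the local
    # make_arg_parser / LoRAModulePath imports in the patch.
    from decimal import Decimal

    return "a Decimal" if isinstance(value, Decimal) else "something else"


if __name__ == "__main__":
    from decimal import Decimal as D

    print(describe(D("1.5")))  # a Decimal
    print(describe(None))      # something else
```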