8 changes: 1 addition & 7 deletions vllm/entrypoints/openai/api_server.py
@@ -388,14 +388,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):

     try:
         generator = await handler.create_completion(request, raw_request)
-    except OverflowError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
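Note: every router touched in this PR makes the same substitution. The route no longer translates exceptions into an `HTTPException`; it hands them to the serving handler's `create_error_response`, and the existing `isinstance(generator, ErrorResponse)` branch turns the result into a JSON body whose status code comes from the error payload. Below is a minimal sketch of that flow using simplified stand-in types rather than vLLM's actual handler and protocol classes; the 400/500 mapping mirrors the `create_error_response` hunk further down, and the route funnels the error through the same `isinstance` branch for illustration (the routes in the diff return the error response directly).

```python
from http import HTTPStatus

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel

app = FastAPI()


class ErrorInfo(BaseModel):
    message: str
    type: str
    code: int
    param: str | None = None


class ErrorResponse(BaseModel):
    error: ErrorInfo


class Handler:
    """Stand-in for an OpenAI serving handler."""

    async def create_completion(self, request: dict) -> dict:
        prompt = request.get("prompt", "")
        if len(prompt) > 4096:  # crude stand-in for a context-length check
            raise OverflowError("prompt is longer than the model context window")
        return {"object": "text_completion", "choices": [{"text": prompt[::-1]}]}

    def create_error_response(self, exc: Exception) -> ErrorResponse:
        # User-input style errors map to 400; anything else stays a 500.
        if isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
            err_type, status = "BadRequestError", HTTPStatus.BAD_REQUEST
        else:
            err_type, status = "InternalServerError", HTTPStatus.INTERNAL_SERVER_ERROR
        return ErrorResponse(
            error=ErrorInfo(message=str(exc), type=err_type, code=status.value)
        )


handler = Handler()


@app.post("/v1/completions")
async def create_completion(raw_request: Request):
    try:
        generator = await handler.create_completion(await raw_request.json())
    except Exception as e:
        # No HTTPException: hand the exception to the handler and return its payload.
        generator = handler.create_error_response(e)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    return JSONResponse(content=generator)
```

Keeping the exception-to-status mapping inside `create_error_response` means the OpenAI-style error body and its status code are defined in one place instead of being duplicated in every route.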
8 changes: 4 additions & 4 deletions vllm/entrypoints/openai/chat_completion/api_router.py
@@ -4,7 +4,7 @@

 from http import HTTPStatus

-from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi import APIRouter, Depends, FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse

 from vllm.entrypoints.openai.chat_completion.protocol import (
@@ -53,12 +53,12 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
         return base_server.create_error_response(
             message="The model does not support Chat Completions API"
         )

     try:
         generator = await handler.create_chat_completion(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
11 changes: 8 additions & 3 deletions vllm/entrypoints/openai/engine/serving.py
@@ -92,7 +92,7 @@
     TokenizeCompletionRequest,
     TokenizeResponse,
 )
-from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (
     PromptComponents,
@@ -766,11 +766,15 @@ def create_error_response(
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = exc.parameter
-        elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
+        elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
             # Common validation errors from user input
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = None
+        elif isinstance(exc, NotImplementedError):
+            err_type = "NotImplementedError"
+            status_code = HTTPStatus.NOT_IMPLEMENTED
+            param = None
         elif exc.__class__.__name__ == "TemplateError":
             # jinja2.TemplateError (avoid importing jinja2)
             err_type = "BadRequestError"
@@ -789,9 +793,10 @@
                 traceback.print_exc()
             else:
                 traceback.print_stack()

         return ErrorResponse(
             error=ErrorInfo(
-                message=message,
+                message=sanitize_message(message),
                 type=err_type,
                 code=status_code.value,
                 param=param,
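Note: with the widened mapping above, an `OverflowError` raised while validating a request now surfaces as a 400 `BadRequestError`, and a `NotImplementedError` as a 501, instead of falling through to a generic 500. Roughly the error body a client receives; the message text below is illustrative only, and it passes through `sanitize_message` before being returned.

```python
# Illustrative payload only: the shape follows the ErrorInfo fields used above;
# the actual message depends on the exception that was raised.
{
    "error": {
        "message": "out-of-range integer value in sampling parameters",
        "type": "BadRequestError",
        "code": 400,
        "param": None,
    }
}
```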
14 changes: 4 additions & 10 deletions vllm/entrypoints/openai/responses/api_router.py
@@ -5,7 +5,7 @@
 from collections.abc import AsyncGenerator
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi import APIRouter, Depends, FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse

 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
@@ -64,9 +64,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
     try:
         generator = await handler.create_responses(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
@@ -101,9 +99,7 @@ async def retrieve_responses(
             stream=stream,
         )
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(response, ErrorResponse):
         return JSONResponse(
@@ -128,9 +124,7 @@ async def cancel_responses(response_id: str, raw_request: Request):
     try:
         response = await handler.cancel_responses(response_id)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(response, ErrorResponse):
         return JSONResponse(
7 changes: 6 additions & 1 deletion vllm/entrypoints/openai/serving_models.py
@@ -18,6 +18,7 @@
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,
 )
+from vllm.entrypoints.utils import sanitize_message
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -302,5 +303,9 @@ def create_error_response(
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
     ) -> ErrorResponse:
         return ErrorResponse(
-            error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+            error=ErrorInfo(
+                message=sanitize_message(message),
+                type=err_type,
+                code=status_code.value,
+            )
         )
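Note: both `create_error_response` implementations now pass every outgoing message through `sanitize_message` from `vllm.entrypoints.utils`, but the helper's body sits outside the hunks shown in this diff. The only thing the diff establishes is its signature (it takes the message string and returns a string). A purely hypothetical sketch of what such a sanitizer might do; the redaction rules below are assumptions for illustration, not vLLM's actual implementation.

```python
import re

# Hypothetical sketch: scrub absolute filesystem paths and cap the length of
# error messages before they are echoed back to API clients.
_PATH_RE = re.compile(r"(/[\w.\-]+){2,}")
_MAX_LEN = 2048


def sanitize_message(message: str) -> str:
    sanitized = _PATH_RE.sub("<path>", message)
    if len(sanitized) > _MAX_LEN:
        sanitized = sanitized[:_MAX_LEN] + "..."
    return sanitized
```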
8 changes: 3 additions & 5 deletions vllm/entrypoints/pooling/classify/api_router.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never

@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
     try:
         generator = await handler.create_classify(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
6 changes: 2 additions & 4 deletions vllm/entrypoints/pooling/embed/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never

@@ -47,9 +47,7 @@ async def create_embedding(
     try:
         generator = await handler.create_embedding(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
7 changes: 3 additions & 4 deletions vllm/entrypoints/pooling/pooling/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never

@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     try:
         generator = await handler.create_pooling(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
12 changes: 5 additions & 7 deletions vllm/entrypoints/pooling/score/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never

@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
     try:
         generator = await handler.create_score(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
     try:
         generator = await handler.do_rerank(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
5 changes: 2 additions & 3 deletions vllm/entrypoints/serve/disagg/api_router.py
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
     try:
         generator = await handler.serve_tokens(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
8 changes: 1 addition & 7 deletions vllm/entrypoints/serve/tokenize/api_router.py
@@ -51,14 +51,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):

     try:
         generator = await handler.create_tokenize(request, raw_request)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
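Note: the tokenize route previously special-cased `NotImplementedError` to return a 501. That mapping now lives in the `NotImplementedError` branch added to `create_error_response` in `vllm/entrypoints/openai/engine/serving.py` above, so this route collapses to the same single `except Exception` used by every other router in this PR.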
35 changes: 23 additions & 12 deletions vllm/entrypoints/utils.py
@@ -7,7 +7,7 @@
 import os
 from argparse import Namespace
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import regex as re
 from fastapi import Request
@@ -22,20 +22,27 @@
     resolve_hf_chat_template,
     resolve_mistral_chat_template,
 )
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.engine.protocol import (
-    CompletionRequest,
-    StreamOptions,
-)
-from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser

+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+    )
+    from vllm.entrypoints.openai.engine.protocol import (
+        CompletionRequest,
+        StreamOptions,
+    )
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+else:
+    ChatCompletionRequest = object
+    CompletionRequest = object
+    StreamOptions = object
+    LoRAModulePath = object


 logger = init_logger(__name__)

 VLLM_SUBCMD_PARSER_EPILOG = (
@@ -208,7 +215,7 @@ def _validate_truncation_size(

 def get_max_tokens(
     max_model_len: int,
-    request: ChatCompletionRequest | CompletionRequest,
+    request: "ChatCompletionRequest | CompletionRequest",
     input_length: int,
     default_sampling_params: dict,
 ) -> int:
@@ -229,6 +236,8 @@


 def log_non_default_args(args: Namespace | EngineArgs):
+    from vllm.entrypoints.openai.cli_args import make_arg_parser

     non_default_args = {}

     # Handle Namespace
@@ -257,7 +266,7 @@ def log_non_default_args(args: Namespace | EngineArgs):


 def should_include_usage(
-    stream_options: StreamOptions | None, enable_force_include_usage: bool
+    stream_options: "StreamOptions | None", enable_force_include_usage: bool
 ) -> tuple[bool, bool]:
     if stream_options:
         include_usage = stream_options.include_usage or enable_force_include_usage
@@ -272,6 +281,8 @@ def should_include_usage(
 def process_lora_modules(
     args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
 ) -> list[LoRAModulePath]:
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath

     lora_modules = args_lora_modules
     if default_mm_loras:
         default_mm_lora_paths = [
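Note: the `utils.py` changes follow a standard recipe for keeping type annotations without importing the annotated types at module load time: the imports move under `if TYPE_CHECKING:` with `object` placeholders bound at runtime, the annotations that used them become strings, and the imports needed for actual work move into the functions that call them, presumably to avoid circular imports between `vllm.entrypoints.utils` and the OpenAI entrypoint modules. A self-contained sketch of the same pattern, using `decimal.Decimal` as a stand-in for those heavier modules:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by the type checker; at runtime this import never runs,
    # so loading this module does not pull in the dependency (or its cycle).
    from decimal import Decimal
else:
    # Runtime placeholder so the name still resolves if something evaluates it.
    Decimal = object


def parse_price(text: str) -> "Decimal":
    # Deferred import inside the function, mirroring how the diff moves the
    # make_arg_parser and LoRAModulePath imports into the functions that use them.
    from decimal import Decimal

    return Decimal(text)


if __name__ == "__main__":
    print(parse_price("19.99"))  # Decimal('19.99')
```

The string annotations matter because the runtime placeholders are plain `object`; type checkers still resolve the quoted names to the real classes imported under `TYPE_CHECKING`.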