diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 002ae62b8ee8..126e2b4024e8 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -29,11 +29,13 @@
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
+from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.server_utils import (
     engine_error_handler,
     exception_handler,
+    generation_error_handler,
     get_uvicorn_log_config,
     http_exception_handler,
     lifespan,
@@ -263,6 +265,7 @@ def build_app(
     app.exception_handler(RequestValidationError)(validation_exception_handler)
     app.exception_handler(EngineGenerateError)(engine_error_handler)
     app.exception_handler(EngineDeadError)(engine_error_handler)
+    app.exception_handler(GenerationError)(generation_error_handler)
     app.exception_handler(Exception)(exception_handler)
 
     # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
diff --git a/vllm/entrypoints/openai/server_utils.py b/vllm/entrypoints/openai/server_utils.py
index 1453d8083c80..7e9e9a0290e3 100644
--- a/vllm/entrypoints/openai/server_utils.py
+++ b/vllm/entrypoints/openai/server_utils.py
@@ -21,7 +21,11 @@
 from vllm import envs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.launcher import terminate_if_errored
-from vllm.entrypoints.openai.engine.protocol import ErrorInfo, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import (
+    ErrorInfo,
+    ErrorResponse,
+    GenerationError,
+)
 from vllm.entrypoints.utils import create_error_response, sanitize_message
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
@@ -354,6 +358,17 @@ async def engine_error_handler(
     return JSONResponse(err.model_dump(), status_code=err.error.code)
 
 
+async def generation_error_handler(req: Request, exc: GenerationError):
+    """Handle GenerationError without logging stack traces.
+
+    GenerationError is a known, expected error (e.g. KV cache load failure)
+    that should be returned to the client as a 500 response without polluting
+    server logs with stack traces.
+    """
+    err = create_error_response(exc)
+    return JSONResponse(err.model_dump(), status_code=err.error.code)
+
+
 async def exception_handler(req: Request, exc: Exception):
     if req.app.state.args.log_error_stack:
         logger.exception(