diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index aa664f6d77f7..d5aa730ddced 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
     invalid_files.mkdir()
     (invalid_files / "adapter_config.json").write_text("this is not json")
 
-    with pytest.raises(openai.BadRequestError):
+    with pytest.raises(openai.InternalServerError):
         await client.post(
             "load_lora_adapter",
             cast_to=str,
@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
         json.dump(adapter_config, f)
 
     # Test loading the adapter
-    with pytest.raises(openai.BadRequestError, match=expected_error):
+    with pytest.raises(openai.InternalServerError, match=expected_error):
         await client.post(
             "load_lora_adapter",
             cast_to=str,
@@ -312,7 +312,7 @@ async def run_good_requests(client):
         body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
     )
     for _ in range(25):
-        with suppress(openai.BadRequestError):
+        with suppress(openai.InternalServerError):
             await client.post(
                 "load_lora_adapter",
                 cast_to=str,
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
index a2867efdc584..01b3e6502222 100644
--- a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
+++ b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
         basic_server_with_lora.url_for("adapters"),
         json={"name": "invalid-adapter", "src": str(invalid_files)},
     )
-    assert load_response.status_code == 400
+    assert load_response.status_code == 500
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/anthropic/api_router.py b/vllm/entrypoints/anthropic/api_router.py
index 2b65fff50384..1fe2be899626 100644
--- a/vllm/entrypoints/anthropic/api_router.py
+++ b/vllm/entrypoints/anthropic/api_router.py
@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
     if handler is None:
         base_server = raw_request.app.state.openai_serving_tokenization
         error = base_server.create_error_response(
-            message="The model does not support Messages API"
+            NotImplementedError("The model does not support Messages API")
         )
         return translate_error_response(error)
 
@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
     if handler is None:
         base_server = raw_request.app.state.openai_serving_tokenization
         error = base_server.create_error_response(
-            message="The model does not support Messages API"
+            NotImplementedError("The model does not support Messages API")
         )
         return translate_error_response(error)
diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py
index f5569f5aba3e..28a2eab679c0 100644
--- a/vllm/entrypoints/openai/chat_completion/api_router.py
+++ b/vllm/entrypoints/openai/chat_completion/api_router.py
@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
     )
     handler = chat(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Chat Completions API"
-        )
+        raise NotImplementedError("The model does not support Chat Completions API")
 
     generator = await handler.create_chat_completion(request, raw_request)
diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py
index 56e961bef408..4d8e0f885837 100644
--- a/vllm/entrypoints/openai/completion/api_router.py
+++ b/vllm/entrypoints/openai/completion/api_router.py
@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     )
     handler = completion(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Completions API"
-        )
+        raise NotImplementedError("The model does not support Completions API")
 
     generator = await handler.create_completion(request, raw_request)
diff --git a/vllm/entrypoints/openai/models/serving.py b/vllm/entrypoints/openai/models/serving.py
index e99d8f7ac767..1db0eccea0ed 100644
--- a/vllm/entrypoints/openai/models/serving.py
+++ b/vllm/entrypoints/openai/models/serving.py
@@ -7,7 +7,6 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.engine.protocol import (
-    ErrorInfo,
     ErrorResponse,
     ModelCard,
     ModelList,
@@ -18,7 +17,8 @@
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,
 )
-from vllm.entrypoints.utils import sanitize_message
+from vllm.entrypoints.utils import create_error_response
+from vllm.exceptions import LoRAAdapterNotFoundError
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -152,15 +152,15 @@ async def load_lora_adapter(
         try:
             await self.engine_client.add_lora(lora_request)
         except Exception as e:
-            error_type = "BadRequestError"
-            status_code = HTTPStatus.BAD_REQUEST
-            if "No adapter found" in str(e):
-                error_type = "NotFoundError"
-                status_code = HTTPStatus.NOT_FOUND
-
-            return create_error_response(
-                message=str(e), err_type=error_type, status_code=status_code
-            )
+            if str(
+                LoRAAdapterNotFoundError(
+                    lora_request.lora_name, lora_request.lora_path
+                )
+            ) in str(e):
+                raise LoRAAdapterNotFoundError(
+                    lora_request.lora_name, lora_request.lora_path
+                ) from e
+            raise
 
         self.lora_requests[lora_name] = lora_request
         logger.info(
@@ -292,17 +292,3 @@ async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
             err_type="NotFoundError",
             status_code=HTTPStatus.NOT_FOUND,
         )
-
-
-def create_error_response(
-    message: str,
-    err_type: str = "BadRequestError",
-    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
-) -> ErrorResponse:
-    return ErrorResponse(
-        error=ErrorInfo(
-            message=sanitize_message(message),
-            type=err_type,
-            code=status_code.value,
-        )
-    )
diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py
index 0c6b4a73801f..88d821260940 100644
--- a/vllm/entrypoints/openai/responses/api_router.py
+++ b/vllm/entrypoints/openai/responses/api_router.py
@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
 async def create_responses(request: ResponsesRequest, raw_request: Request):
     handler = responses(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Responses API"
-        )
+        raise NotImplementedError("The model does not support Responses API")
 
     generator = await handler.create_responses(request, raw_request)
@@ -88,10 +85,7 @@ async def retrieve_responses(
 ):
     handler = responses(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Responses API"
-        )
+        raise NotImplementedError("The model does not support Responses API")
 
     response = await handler.retrieve_responses(
         response_id,
@@ -115,10 +109,7 @@ async def retrieve_responses(
 async def cancel_responses(response_id: str, raw_request: Request):
     handler = responses(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Responses API"
-        )
+        raise NotImplementedError("The model does not support Responses API")
 
     response = await handler.cancel_responses(response_id)
diff --git a/vllm/entrypoints/openai/speech_to_text/api_router.py b/vllm/entrypoints/openai/speech_to_text/api_router.py
index 2c4f6bc9a1ce..b940a97e4dff 100644
--- a/vllm/entrypoints/openai/speech_to_text/api_router.py
+++ b/vllm/entrypoints/openai/speech_to_text/api_router.py
@@ -65,10 +65,7 @@ async def create_transcriptions(
 ):
     handler = transcription(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Transcriptions API"
-        )
+        raise NotImplementedError("The model does not support Transcriptions API")
 
     audio_data = await request.file.read()
@@ -101,10 +98,7 @@ async def create_translations(
 ):
     handler = translation(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Translations API"
-        )
+        raise NotImplementedError("The model does not support Translations API")
 
     audio_data = await request.file.read()
diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py
index 1c364a84a469..f254a6c2b399 100644
--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -2,13 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fastapi import APIRouter, Depends, Request
-from fastapi.responses import JSONResponse, Response
+from fastapi.responses import Response
 
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
 from vllm.entrypoints.pooling.classify.serving import ServingClassification
 from vllm.entrypoints.utils import (
-    create_error_response,
     load_aware_call,
     with_cancellation,
 )
@@ -28,12 +27,6 @@ async def create_classify(
 ) -> Response:
     handler = classify(raw_request)
     if handler is None:
-        error_response = create_error_response(
-            message="The model does not support Classification API"
-        )
-        return JSONResponse(
-            content=error_response.model_dump(),
-            status_code=error_response.error.code,
-        )
+        raise NotImplementedError("The model does not support Classification API")
 
     return await handler(request, raw_request)
diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py
index d5e4028b73f2..f88999468692 100644
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -4,14 +4,12 @@
 from http import HTTPStatus
 
 from fastapi import APIRouter, Depends, Request
-from fastapi.responses import JSONResponse
 
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
 from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
 from vllm.entrypoints.utils import (
-    create_error_response,
     load_aware_call,
     with_cancellation,
 )
@@ -39,11 +37,6 @@ async def create_embedding(
 ):
     handler = embedding(raw_request)
     if handler is None:
-        error_response = create_error_response(
-            message="The model does not support Embeddings API"
-        )
-        return JSONResponse(
-            content=error_response.model_dump(),
-            status_code=error_response.error.code,
-        )
+        raise NotImplementedError("The model does not support Embeddings API")
+
     return await handler(request, raw_request)
diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py
index 6cac91b7c1b7..f63a8edf6ca8 100644
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
 async def create_pooling(request: PoolingRequest, raw_request: Request):
     handler = pooling(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Pooling API"
-        )
+        raise NotImplementedError("The model does not support Pooling API")
 
     generator = await handler.create_pooling(request, raw_request)
diff --git a/vllm/entrypoints/pooling/score/api_router.py b/vllm/entrypoints/pooling/score/api_router.py
index 64c6b496bbeb..a9a8641e9214 100644
--- a/vllm/entrypoints/pooling/score/api_router.py
+++ b/vllm/entrypoints/pooling/score/api_router.py
@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
 async def create_score(request: ScoreRequest, raw_request: Request):
     handler = score(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Score API"
-        )
+        raise NotImplementedError("The model does not support Score API")
 
     generator = await handler.create_score(request, raw_request)
@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
 async def do_rerank(request: RerankRequest, raw_request: Request):
     handler = rerank(raw_request)
     if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Rerank (Score) API"
-        )
+        raise NotImplementedError("The model does not support Rerank (Score) API")
 
     generator = await handler.do_rerank(request, raw_request)
diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py
index a9c6d3cdcbb7..e7c18a0914a2 100644
--- a/vllm/entrypoints/serve/disagg/api_router.py
+++ b/vllm/entrypoints/serve/disagg/api_router.py
@@ -61,9 +61,7 @@ def engine_client(request: Request) -> EngineClient:
 async def generate(request: GenerateRequest, raw_request: Request):
     handler = generate_tokens(raw_request)
     if handler is None:
-        return tokenization(raw_request).create_error_response(
-            message="The model does not support generate tokens API"
-        )
+        raise NotImplementedError("The model does not support generate tokens API")
 
     generator = await handler.serve_tokens(request, raw_request)
diff --git a/vllm/entrypoints/serve/render/api_router.py b/vllm/entrypoints/serve/render/api_router.py
index a9f62e450ad7..dd782a97fe24 100644
--- a/vllm/entrypoints/serve/render/api_router.py
+++ b/vllm/entrypoints/serve/render/api_router.py
@@ -10,7 +10,6 @@
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.serve.render.serving import OpenAIServingRender
-from vllm.entrypoints.utils import create_error_response
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
 async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
     handler = render(raw_request)
     if handler is None:
-        error = create_error_response(
-            message="The model does not support Chat Completions Render API",
-            err_type="NotFoundError",
-            status_code=HTTPStatus.NOT_FOUND,
-        )
-        return JSONResponse(
-            status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
-        )
+        raise NotImplementedError(
+            "The model does not support Chat Completions Render API"
+        )
 
     result = await handler.render_chat_request(request)
@@ -66,14 +60,7 @@ async def render_completion(request: CompletionRequest, raw_request: Request):
     handler = render(raw_request)
     if handler is None:
-        error = create_error_response(
-            message="The model does not support Completions Render API",
-            err_type="NotFoundError",
-            status_code=HTTPStatus.NOT_FOUND,
-        )
-        return JSONResponse(
-            status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
-        )
+        raise NotImplementedError("The model does not support Completions Render API")
 
     result = await handler.render_completion_request(request)
diff --git a/vllm/exceptions.py b/vllm/exceptions.py
index 5baf45619f25..931040b8ceb0 100644
--- a/vllm/exceptions.py
+++ b/vllm/exceptions.py
@@ -36,7 +36,31 @@ def __str__(self):
         return f"{base} ({', '.join(extras)})" if extras else base
 
 
-class VLLMNotFoundError(ValueError):
+class VLLMNotFoundError(Exception):
     """vLLM-specific NotFoundError"""
 
     pass
+
+
+class LoRAAdapterNotFoundError(VLLMNotFoundError):
+    """Exception raised when a LoRA adapter is not found.
+
+    This exception is thrown when a requested LoRA adapter does not exist
+    in the system.
+
+    Attributes:
+        message: The error message string describing the exception
+    """
+
+    message: str
+
+    def __init__(
+        self,
+        lora_name: str,
+        lora_path: str,
+    ) -> None:
+        message = f"Loading lora {lora_name} failed: No adapter found for {lora_path}"
+        self.message = message
+
+    def __str__(self):
+        return self.message
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index b8916f7875ce..c5c0b7d33c4d 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -7,6 +7,7 @@
 import torch
 
 from vllm.config import VllmConfig
+from vllm.exceptions import LoRAAdapterNotFoundError
 from vllm.logger import init_logger
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.model_manager import (
@@ -147,12 +148,10 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             # offline mode)
             # - No local adapter files found at `lora_request.lora_path`
             # For NotFoundError
-            raise ValueError(
-                f"Loading lora {lora_request.lora_name} failed: No adapter "
-                f"found for {lora_request.lora_path}"
+            raise LoRAAdapterNotFoundError(
+                lora_request.lora_name, lora_request.lora_path
             ) from e
         except Exception as e:
-            # For BadRequestError
             raise e
         return lora
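
Reviewer note: a minimal sketch of the behavioral contract this patch introduces, based only on the code added above. The exception construction and the string-match mirror vllm/exceptions.py and models/serving.py from this diff; the adapter name/path and engine_error value are made-up illustration values, not anything the engine actually emits.

# Sketch: assumes the patched vllm/exceptions.py is importable.
from vllm.exceptions import LoRAAdapterNotFoundError, VLLMNotFoundError

err = LoRAAdapterNotFoundError("my-adapter", "/not/an/adapter")

# __str__ reproduces the message previously built inline in lora/worker_manager.py.
assert str(err) == "Loading lora my-adapter failed: No adapter found for /not/an/adapter"

# VLLMNotFoundError now derives from Exception rather than ValueError.
assert isinstance(err, VLLMNotFoundError) and not isinstance(err, ValueError)

# load_lora_adapter in models/serving.py detects the not-found case by checking
# whether this canonical message appears in the engine-side error text; any other
# failure propagates unchanged, which is why the tests above now expect
# openai.InternalServerError / HTTP 500 instead of BadRequestError / 400.
engine_error = str(err)  # hypothetical stand-in for the text surfaced by add_lora
if str(LoRAAdapterNotFoundError("my-adapter", "/not/an/adapter")) in engine_error:
    pass  # serving.py re-raises LoRAAdapterNotFoundError from the original error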