Merged
6 changes: 3 additions & 3 deletions tests/entrypoints/openai/test_lora_adapters.py
@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
invalid_files.mkdir()
(invalid_files / "adapter_config.json").write_text("this is not json")

with pytest.raises(openai.BadRequestError):
with pytest.raises(openai.InternalServerError):
await client.post(
"load_lora_adapter",
cast_to=str,
@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
json.dump(adapter_config, f)

# Test loading the adapter
with pytest.raises(openai.BadRequestError, match=expected_error):
with pytest.raises(openai.InternalServerError, match=expected_error):
await client.post(
"load_lora_adapter",
cast_to=str,
@@ -312,7 +312,7 @@ async def run_good_requests(client):
body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
)
for _ in range(25):
with suppress(openai.BadRequestError):
with suppress(openai.InternalServerError):
await client.post(
"load_lora_adapter",
cast_to=str,
@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
basic_server_with_lora.url_for("adapters"),
json={"name": "invalid-adapter", "src": str(invalid_files)},
)
assert load_response.status_code == 400
assert load_response.status_code == 500


@pytest.mark.asyncio
4 changes: 2 additions & 2 deletions vllm/entrypoints/anthropic/api_router.py
@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
error = base_server.create_error_response(
message="The model does not support Messages API"
NotImplementedError("The model does not support Messages API")
)
return translate_error_response(error)

@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
error = base_server.create_error_response(
message="The model does not support Messages API"
NotImplementedError("The model does not support Messages API")
)
return translate_error_response(error)

5 changes: 1 addition & 4 deletions vllm/entrypoints/openai/chat_completion/api_router.py
@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
)
handler = chat(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Chat Completions API"
)
raise NotImplementedError("The model does not support Chat Completions API")

generator = await handler.create_chat_completion(request, raw_request)

5 changes: 1 addition & 4 deletions vllm/entrypoints/openai/completion/api_router.py
@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
)
handler = completion(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Completions API"
)
raise NotImplementedError("The model does not support Completions API")

generator = await handler.create_completion(request, raw_request)

36 changes: 11 additions & 25 deletions vllm/entrypoints/openai/models/serving.py
@@ -7,7 +7,6 @@

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
ModelCard,
ModelList,
@@ -18,7 +17,8 @@
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.utils import sanitize_message
from vllm.entrypoints.utils import create_error_response
from vllm.exceptions import LoRAAdapterNotFoundError
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -152,15 +152,15 @@ async def load_lora_adapter(
try:
await self.engine_client.add_lora(lora_request)
except Exception as e:
error_type = "BadRequestError"
Contributor Author commented:

A plain Exception now maps to InternalServerError instead of BadRequestError, i.e., the HTTP response status code changes from 400 to 500.

Contributor Author commented:

For example, if the --enable-lora arg is not set, the error returned when loading a LoRA adapter will be:

{"error":{"message":"Call to add_lora method failed: LoRA is not enabled. Use --enable-lora to enable LoRA.","type":"InternalServerError","param":null,"code":500}}

status_code = HTTPStatus.BAD_REQUEST
if "No adapter found" in str(e):
error_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND

return create_error_response(
message=str(e), err_type=error_type, status_code=status_code
)
if str(
LoRAAdapterNotFoundError(
lora_request.lora_name, lora_request.lora_path
)
) in str(e):
raise LoRAAdapterNotFoundError(
lora_request.lora_name, lora_request.lora_path
) from e
raise

self.lora_requests[lora_name] = lora_request
logger.info(
@@ -292,17 +292,3 @@ async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)


def create_error_response(
message: str,
err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
return ErrorResponse(
error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
)
)
15 changes: 3 additions & 12 deletions vllm/entrypoints/openai/responses/api_router.py
@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
async def create_responses(request: ResponsesRequest, raw_request: Request):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
raise NotImplementedError("The model does not support Responses API")

generator = await handler.create_responses(request, raw_request)

@@ -88,10 +85,7 @@ async def retrieve_responses(
):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
raise NotImplementedError("The model does not support Responses API")

response = await handler.retrieve_responses(
response_id,
@@ -115,10 +109,7 @@ async def retrieve_responses(
async def cancel_responses(response_id: str, raw_request: Request):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
raise NotImplementedError("The model does not support Responses API")

response = await handler.cancel_responses(response_id)

10 changes: 2 additions & 8 deletions vllm/entrypoints/openai/speech_to_text/api_router.py
@@ -65,10 +65,7 @@ async def create_transcriptions(
):
handler = transcription(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Transcriptions API"
)
raise NotImplementedError("The model does not support Transcriptions API")

audio_data = await request.file.read()

@@ -101,10 +98,7 @@ async def create_translations(
):
handler = translation(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Translations API"
)
raise NotImplementedError("The model does not support Translations API")

audio_data = await request.file.read()

11 changes: 2 additions & 9 deletions vllm/entrypoints/pooling/classify/api_router.py
@@ -2,13 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, Response
from fastapi.responses import Response

from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.utils import (
create_error_response,
load_aware_call,
with_cancellation,
)
@@ -28,12 +27,6 @@ async def create_classify(
) -> Response:
handler = classify(raw_request)
if handler is None:
error_response = create_error_response(
message="The model does not support Classification API"
)
return JSONResponse(
content=error_response.model_dump(),
status_code=error_response.error.code,
)
raise NotImplementedError("The model does not support Classification API")

return await handler(request, raw_request)
11 changes: 2 additions & 9 deletions vllm/entrypoints/pooling/embed/api_router.py
@@ -4,14 +4,12 @@
from http import HTTPStatus

from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse

from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import (
create_error_response,
load_aware_call,
with_cancellation,
)
@@ -39,11 +37,6 @@
):
handler = embedding(raw_request)
if handler is None:
error_response = create_error_response(
message="The model does not support Embeddings API"
)
return JSONResponse(
content=error_response.model_dump(),
status_code=error_response.error.code,
)
raise NotImplementedError("The model does not support Embeddings API")

return await handler(request, raw_request)
5 changes: 1 addition & 4 deletions vllm/entrypoints/pooling/pooling/api_router.py
@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
async def create_pooling(request: PoolingRequest, raw_request: Request):
handler = pooling(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Pooling API"
)
raise NotImplementedError("The model does not support Pooling API")

generator = await handler.create_pooling(request, raw_request)

10 changes: 2 additions & 8 deletions vllm/entrypoints/pooling/score/api_router.py
@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
async def create_score(request: ScoreRequest, raw_request: Request):
handler = score(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Score API"
)
raise NotImplementedError("The model does not support Score API")

generator = await handler.create_score(request, raw_request)

@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
async def do_rerank(request: RerankRequest, raw_request: Request):
handler = rerank(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Rerank (Score) API"
)
raise NotImplementedError("The model does not support Rerank (Score) API")

generator = await handler.do_rerank(request, raw_request)

4 changes: 1 addition & 3 deletions vllm/entrypoints/serve/disagg/api_router.py
@@ -61,9 +61,7 @@ def engine_client(request: Request) -> EngineClient:
async def generate(request: GenerateRequest, raw_request: Request):
handler = generate_tokens(raw_request)
if handler is None:
return tokenization(raw_request).create_error_response(
message="The model does not support generate tokens API"
)
raise NotImplementedError("The model does not support generate tokens API")

generator = await handler.serve_tokens(request, raw_request)

19 changes: 3 additions & 16 deletions vllm/entrypoints/serve/render/api_router.py
@@ -10,7 +10,6 @@
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.utils import create_error_response
from vllm.logger import init_logger

logger = init_logger(__name__)
@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
handler = render(raw_request)
if handler is None:
error = create_error_response(
message="The model does not support Chat Completions Render API",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)
return JSONResponse(
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
raise NotImplementedError(
"The model does not support Chat Completions Render API"
)

result = await handler.render_chat_request(request)
@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
async def render_completion(request: CompletionRequest, raw_request: Request):
handler = render(raw_request)
if handler is None:
error = create_error_response(
message="The model does not support Completions Render API",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)
return JSONResponse(
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
)
raise NotImplementedError("The model does not support Completions Render API")

result = await handler.render_completion_request(request)

26 changes: 25 additions & 1 deletion vllm/exceptions.py
@@ -36,7 +36,31 @@ def __str__(self):
return f"{base} ({', '.join(extras)})" if extras else base


class VLLMNotFoundError(ValueError):
class VLLMNotFoundError(Exception):
"""vLLM-specific NotFoundError"""

pass


class LoRAAdapterNotFoundError(VLLMNotFoundError):
"""Exception raised when a LoRA adapter is not found.

This exception is thrown when a requested LoRA adapter does not exist
in the system.

Attributes:
message: The error message string describing the exception
"""

message: str

def __init__(
self,
lora_name: str,
lora_path: str,
) -> None:
message = f"Loading lora {lora_name} failed: No adapter found for {lora_path}"
self.message = message

def __str__(self):
return self.message
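
A quick standalone sketch of how the new exception behaves; the adapter name and path below are made-up examples:

from vllm.exceptions import LoRAAdapterNotFoundError

err = LoRAAdapterNotFoundError("my-adapter", "/tmp/does-not-exist")

# __str__ returns the stored message; this is also the string that the
# load_lora_adapter handler in vllm/entrypoints/openai/models/serving.py
# (above) matches against before re-raising an engine-side failure as
# LoRAAdapterNotFoundError.
print(str(err))
# Loading lora my-adapter failed: No adapter found for /tmp/does-not-exist

# VLLMNotFoundError now derives from Exception rather than ValueError.
assert not isinstance(err, ValueError)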
7 changes: 3 additions & 4 deletions vllm/lora/worker_manager.py
@@ -7,6 +7,7 @@
import torch

from vllm.config import VllmConfig
from vllm.exceptions import LoRAAdapterNotFoundError
from vllm.logger import init_logger
from vllm.lora.lora_model import LoRAModel
from vllm.lora.model_manager import (
@@ -147,12 +148,10 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
# offline mode)
# - No local adapter files found at `lora_request.lora_path`
# For NotFoundError
raise ValueError(
f"Loading lora {lora_request.lora_name} failed: No adapter "
f"found for {lora_request.lora_path}"
raise LoRAAdapterNotFoundError(
lora_request.lora_name, lora_request.lora_path
) from e
except Exception as e:
# For BadRequestError
raise e

return lora