Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,11 +414,16 @@ async def init_render_app_state(
directly from the :class:`~vllm.config.VllmConfig`.
"""
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.plugins.io_processors import get_io_processor
from vllm.renderers import renderer_from_config

served_model_names = args.served_model_name or [args.model]
base_model_paths = [
BaseModelPath(name=name, model_path=args.model) for name in served_model_names
]

if args.enable_log_requests:
request_logger = RequestLogger(max_log_len=args.max_log_len)
Expand All @@ -431,11 +436,17 @@ async def init_render_app_state(
)
resolved_chat_template = load_chat_template(args.chat_template)

state.openai_serving_models = OpenAIServingModels(
engine_client=None,
base_model_paths=base_model_paths,
model_config=vllm_config.model_config,
)

state.openai_serving_render = OpenAIServingRender(
model_config=vllm_config.model_config,
renderer=renderer,
io_processor=io_processor,
served_model_names=served_model_names,
models=state.openai_serving_models,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
Expand All @@ -447,9 +458,6 @@ async def init_render_app_state(
log_error_stack=args.log_error_stack,
)

# Expose models endpoint via the render handler.
state.openai_serving_models = state.openai_serving_render

state.vllm_config = vllm_config
# Disable stats logging — there is no engine to poll.
state.log_stats = False
Expand Down
29 changes: 1 addition & 28 deletions vllm/entrypoints/openai/engine/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pydantic import ConfigDict, TypeAdapter
from starlette.datastructures import Headers

import vllm.envs as envs
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
Expand Down Expand Up @@ -613,31 +612,7 @@ async def _check_model(
self,
request: AnyRequest,
) -> ErrorResponse | None:
error_response = None

if self._is_model_supported(request.model):
return None
if request.model in self.models.lora_requests:
return None
if (
envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
and request.model
and (load_result := await self.models.resolve_lora(request.model))
):
if isinstance(load_result, LoRARequest):
return None
if (
isinstance(load_result, ErrorResponse)
and load_result.error.code == HTTPStatus.BAD_REQUEST.value
):
error_response = load_result

return error_response or self.create_error_response(
message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
param="model",
)
return await self.models.check_model(request.model)

def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
"""Determine if there are any active default multimodal loras."""
Expand Down Expand Up @@ -1197,8 +1172,6 @@ def _get_decoded_token(
return tokenizer.decode([token_id])

def _is_model_supported(self, model_name: str | None) -> bool:
if not model_name:
return True
return self.models.is_base_model(model_name)


Expand Down
4 changes: 1 addition & 3 deletions vllm/entrypoints/openai/generate/api_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,7 @@ async def init_generate_state(
model_config=engine_client.model_config,
renderer=engine_client.renderer,
io_processor=engine_client.io_processor,
served_model_names=[
mp.name for mp in state.openai_serving_models.base_model_paths
],
models=state.openai_serving_models,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
Expand Down
70 changes: 61 additions & 9 deletions vllm/entrypoints/openai/models/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections import defaultdict
from http import HTTPStatus

import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
Expand Down Expand Up @@ -38,15 +40,20 @@ class OpenAIServingModels:

def __init__(
self,
engine_client: EngineClient,
engine_client: EngineClient | None,
base_model_paths: list[BaseModelPath],
*,
model_config: ModelConfig | None = None,
lora_modules: list[LoRAModulePath] | None = None,
):
super().__init__()

self.engine_client = engine_client
self.base_model_paths = base_model_paths
if model_config is not None:
self.model_config = model_config
elif engine_client is not None:
self.model_config = engine_client.model_config
else:
raise ValueError("model_config must be provided when engine_client is None")

self.static_lora_modules = lora_modules
self.lora_requests: dict[str, LoRARequest] = {}
Expand All @@ -59,11 +66,6 @@ def __init__(
)
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

self.model_config = self.engine_client.model_config
self.renderer = self.engine_client.renderer
self.io_processor = self.engine_client.io_processor
self.input_processor = self.engine_client.input_processor

async def init_static_loras(self):
"""Loads all static LoRA modules.
Raises if any fail to load"""
Expand All @@ -79,7 +81,9 @@ async def init_static_loras(self):
if isinstance(load_result, ErrorResponse):
raise ValueError(load_result.error.message)

def is_base_model(self, model_name) -> bool:
def is_base_model(self, model_name: str | None) -> bool:
if not model_name:
return True
return any(model.name == model_name for model in self.base_model_paths)

def model_name(self, lora_request: LoRARequest | None = None) -> str:
Expand All @@ -94,6 +98,38 @@ def model_name(self, lora_request: LoRARequest | None = None) -> str:
return lora_request.lora_name
return self.base_model_paths[0].name

async def check_model(self, model_name: str | None) -> ErrorResponse | None:
"""Return an ErrorResponse if model_name is not served, else None.

When VLLM_ALLOW_RUNTIME_LORA_UPDATING is set and the model is not
already known, attempts to resolve and load it as a LoRA adapter.
"""
error_response = None

if self.is_base_model(model_name):
return None
if model_name in self.lora_requests:
return None
if (
envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
and model_name
and (load_result := await self.resolve_lora(model_name))
):
if isinstance(load_result, LoRARequest):
return None
if (
isinstance(load_result, ErrorResponse)
and load_result.error.code == HTTPStatus.BAD_REQUEST.value
):
error_response = load_result

return error_response or create_error_response(
message=f"The model `{model_name}` does not exist.",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
param="model",
)

async def show_available_models(self) -> ModelList:
"""Show available models. This includes the base model and all adapters."""
max_model_len = self.model_config.max_model_len
Expand Down Expand Up @@ -124,6 +160,13 @@ async def show_available_models(self) -> ModelList:
async def load_lora_adapter(
self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
) -> ErrorResponse | str:
if self.engine_client is None:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this somewhat goes against #36536 (comment)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DarkLight1337 Oh that wasn't my intention, I tried to unify the engine-free path with the default one, since the model check in the original openai models can dynamically load loras...

What I'm trying to achieve here and in the previous PR is the OpenAI render sharing the same OpenAI models object as the rest of the serving classes. Since the full model check can include engine operations, I came up with inheritance/a shared object for both paths, since composition would cause the renderer to have an incomplete model check. There's also a chance I didn't fully get your intentions; in that case I'd love a clarification here :)

Thanks for taking the time to review these

Copy link
Copy Markdown
Member

@DarkLight1337 DarkLight1337 Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to achieve a cleaner separation between the code paths with vs. without engine client. In that case I prefer the previous PR #36536

return create_error_response(
message="LoRA adapters are not supported in render-only mode.",
err_type="BadRequestError",
status_code=HTTPStatus.BAD_REQUEST,
)
Comment on lines +163 to +168
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

To avoid duplicating this check for engine_client is None in both load_lora_adapter and resolve_lora, consider extracting it into a helper method. This improves maintainability and ensures consistency if more LoRA-related methods are added in the future. This is a critical guard for the new render-only mode, and centralizing it reduces the risk of errors.

For example, you could add a private method:

    def _check_lora_supported(self) -> ErrorResponse | None:
        """Return an error if LoRA adapters are not supported, else None."""
        if self.engine_client is None:
            return create_error_response(
                message="LoRA adapters are not supported in render-only mode.",
                err_type="BadRequestError",
                status_code=HTTPStatus.BAD_REQUEST,
            )
        return None

And then call it from both load_lora_adapter and resolve_lora:

        if (error := self._check_lora_supported()) is not None:
            return error


lora_name = request.lora_name

# Ensure atomicity based on the lora name
Expand Down Expand Up @@ -240,6 +283,13 @@ async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
ErrorResponse (404) if no resolver finds the adapter.
ErrorResponse (400) if adapter(s) are found but none load.
"""
if self.engine_client is None:
return create_error_response(
message="LoRA adapters are not supported in render-only mode.",
err_type="BadRequestError",
status_code=HTTPStatus.BAD_REQUEST,
)

async with self.lora_resolver_lock[lora_name]:
# First check if this LoRA is already loaded
if lora_name in self.lora_requests:
Expand Down Expand Up @@ -298,11 +348,13 @@ def create_error_response(
message: str,
err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
param: str | None = None,
) -> ErrorResponse:
return ErrorResponse(
error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
param=param,
)
)
2 changes: 1 addition & 1 deletion vllm/entrypoints/pooling/base/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __init__(
)
self.io_processor = self.init_io_processor(
model_config=models.model_config,
renderer=models.renderer,
renderer=engine_client.renderer,
chat_template_config=self.chat_template_config,
)

Expand Down
34 changes: 5 additions & 29 deletions vllm/entrypoints/serve/render/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,9 @@
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
ModelCard,
ModelList,
ModelPermission,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_system_message,
Expand All @@ -50,7 +49,7 @@ def __init__(
model_config: ModelConfig,
renderer: BaseRenderer,
io_processor: Any,
served_model_names: list[str],
models: OpenAIServingModels,
*,
request_logger: RequestLogger | None,
chat_template: str | None,
Expand All @@ -65,7 +64,7 @@ def __init__(
self.model_config = model_config
self.renderer = renderer
self.io_processor = io_processor
self.served_model_names = served_model_names
self.models = models
self.request_logger = request_logger
self.chat_template = chat_template
self.chat_template_content_format: ChatTemplateContentFormatOption = (
Expand Down Expand Up @@ -264,18 +263,7 @@ def _make_request_with_harmony(

async def show_available_models(self) -> ModelList:
"""Returns the models served by this render server."""
max_model_len = self.model_config.max_model_len
return ModelList(
data=[
ModelCard(
id=name,
max_model_len=max_model_len,
root=self.model_config.model,
permission=[ModelPermission()],
)
for name in self.served_model_names
]
)
return await self.models.show_available_models()

def create_error_response(
self,
Expand Down Expand Up @@ -333,23 +321,11 @@ def create_error_response(
)
)

def _is_model_supported(self, model_name: str) -> bool:
"""Simplified from OpenAIServing._is_model_supported (no LoRA support)."""
return model_name in self.served_model_names

async def _check_model(
self,
request: Any,
) -> ErrorResponse | None:
"""Simplified from OpenAIServing._check_model (no LoRA support)."""
if self._is_model_supported(request.model):
return None
return self.create_error_response(
message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
param="model",
)
return await self.models.check_model(request.model)

def _validate_chat_template(
self,
Expand Down