diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 7961daf160b4..29c4b2b37841 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -414,11 +414,16 @@ async def init_render_app_state(
     directly from the :class:`~vllm.config.VllmConfig`.
     """
     from vllm.entrypoints.chat_utils import load_chat_template
+    from vllm.entrypoints.openai.models.protocol import BaseModelPath
+    from vllm.entrypoints.openai.models.serving import OpenAIServingModels
     from vllm.entrypoints.serve.render.serving import OpenAIServingRender
     from vllm.plugins.io_processors import get_io_processor
     from vllm.renderers import renderer_from_config

     served_model_names = args.served_model_name or [args.model]
+    base_model_paths = [
+        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
+    ]

     if args.enable_log_requests:
         request_logger = RequestLogger(max_log_len=args.max_log_len)
@@ -431,11 +436,17 @@ async def init_render_app_state(
     )
     resolved_chat_template = load_chat_template(args.chat_template)

+    state.openai_serving_models = OpenAIServingModels(
+        engine_client=None,
+        base_model_paths=base_model_paths,
+        model_config=vllm_config.model_config,
+    )
+
     state.openai_serving_render = OpenAIServingRender(
         model_config=vllm_config.model_config,
         renderer=renderer,
         io_processor=io_processor,
-        served_model_names=served_model_names,
+        models=state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -447,9 +458,6 @@ async def init_render_app_state(
         log_error_stack=args.log_error_stack,
     )

-    # Expose models endpoint via the render handler.
-    state.openai_serving_models = state.openai_serving_render
-
     state.vllm_config = vllm_config
     # Disable stats logging — there is no engine to poll.
     state.log_stats = False
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index fad2a7f8c2eb..60fdfe5329b6 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -16,7 +16,6 @@
 from pydantic import ConfigDict, TypeAdapter
 from starlette.datastructures import Headers

-import vllm.envs as envs
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
@@ -613,31 +612,7 @@ async def _check_model(
         self,
         request: AnyRequest,
     ) -> ErrorResponse | None:
-        error_response = None
-
-        if self._is_model_supported(request.model):
-            return None
-        if request.model in self.models.lora_requests:
-            return None
-        if (
-            envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
-            and request.model
-            and (load_result := await self.models.resolve_lora(request.model))
-        ):
-            if isinstance(load_result, LoRARequest):
-                return None
-            if (
-                isinstance(load_result, ErrorResponse)
-                and load_result.error.code == HTTPStatus.BAD_REQUEST.value
-            ):
-                error_response = load_result
-
-        return error_response or self.create_error_response(
-            message=f"The model `{request.model}` does not exist.",
-            err_type="NotFoundError",
-            status_code=HTTPStatus.NOT_FOUND,
-            param="model",
-        )
+        return await self.models.check_model(request.model)

     def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
         """Determine if there are any active default multimodal loras."""
@@ -1197,8 +1172,6 @@ def _get_decoded_token(
         return tokenizer.decode([token_id])

     def _is_model_supported(self, model_name: str | None) -> bool:
-        if not model_name:
-            return True
         return self.models.is_base_model(model_name)
diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py
index dedaf108f98b..fd6b32ad635f 100644
--- a/vllm/entrypoints/openai/generate/api_router.py
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -169,9 +169,7 @@ async def init_generate_state(
         model_config=engine_client.model_config,
         renderer=engine_client.renderer,
         io_processor=engine_client.io_processor,
-        served_model_names=[
-            mp.name for mp in state.openai_serving_models.base_model_paths
-        ],
+        models=state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
diff --git a/vllm/entrypoints/openai/models/serving.py b/vllm/entrypoints/openai/models/serving.py
index e99d8f7ac767..f83b0778d598 100644
--- a/vllm/entrypoints/openai/models/serving.py
+++ b/vllm/entrypoints/openai/models/serving.py
@@ -5,6 +5,8 @@
 from collections import defaultdict
 from http import HTTPStatus

+import vllm.envs as envs
+from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.engine.protocol import (
     ErrorInfo,
@@ -38,15 +40,20 @@ class OpenAIServingModels:

     def __init__(
         self,
-        engine_client: EngineClient,
+        engine_client: EngineClient | None,
         base_model_paths: list[BaseModelPath],
         *,
+        model_config: ModelConfig | None = None,
         lora_modules: list[LoRAModulePath] | None = None,
     ):
-        super().__init__()
-
         self.engine_client = engine_client
         self.base_model_paths = base_model_paths
+        if model_config is not None:
+            self.model_config = model_config
+        elif engine_client is not None:
+            self.model_config = engine_client.model_config
+        else:
+            raise ValueError("model_config must be provided when engine_client is None")

         self.static_lora_modules = lora_modules
         self.lora_requests: dict[str, LoRARequest] = {}
@@ -59,11 +66,6 @@ def __init__(
         )
         self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

-        self.model_config = self.engine_client.model_config
-        self.renderer = self.engine_client.renderer
-        self.io_processor = self.engine_client.io_processor
-        self.input_processor = self.engine_client.input_processor
-
     async def init_static_loras(self):
         """Loads all static LoRA modules. Raises if any fail to load"""
@@ -79,7 +81,9 @@ async def init_static_loras(self):
             if isinstance(load_result, ErrorResponse):
                 raise ValueError(load_result.error.message)

-    def is_base_model(self, model_name) -> bool:
+    def is_base_model(self, model_name: str | None) -> bool:
+        if not model_name:
+            return True
         return any(model.name == model_name for model in self.base_model_paths)

     def model_name(self, lora_request: LoRARequest | None = None) -> str:
@@ -94,6 +98,38 @@ def model_name(self, lora_request: LoRARequest | None = None) -> str:
             return lora_request.lora_name
         return self.base_model_paths[0].name

+    async def check_model(self, model_name: str | None) -> ErrorResponse | None:
+        """Return an ErrorResponse if model_name is not served, else None.
+
+        When VLLM_ALLOW_RUNTIME_LORA_UPDATING is set and the model is not
+        already known, attempts to resolve and load it as a LoRA adapter.
+        """
+        error_response = None
+
+        if self.is_base_model(model_name):
+            return None
+        if model_name in self.lora_requests:
+            return None
+        if (
+            envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
+            and model_name
+            and (load_result := await self.resolve_lora(model_name))
+        ):
+            if isinstance(load_result, LoRARequest):
+                return None
+            if (
+                isinstance(load_result, ErrorResponse)
+                and load_result.error.code == HTTPStatus.BAD_REQUEST.value
+            ):
+                error_response = load_result
+
+        return error_response or create_error_response(
+            message=f"The model `{model_name}` does not exist.",
+            err_type="NotFoundError",
+            status_code=HTTPStatus.NOT_FOUND,
+            param="model",
+        )
+
     async def show_available_models(self) -> ModelList:
         """Show available models. This includes the base model and all adapters."""
         max_model_len = self.model_config.max_model_len
@@ -124,6 +160,13 @@
     async def load_lora_adapter(
         self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
     ) -> ErrorResponse | str:
+        if self.engine_client is None:
+            return create_error_response(
+                message="LoRA adapters are not supported in render-only mode.",
+                err_type="BadRequestError",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+
         lora_name = request.lora_name

         # Ensure atomicity based on the lora name
@@ -240,6 +283,13 @@ async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
            ErrorResponse (404) if no resolver finds the adapter.
            ErrorResponse (400) if adapter(s) are found but none load.
        """
+        if self.engine_client is None:
+            return create_error_response(
+                message="LoRA adapters are not supported in render-only mode.",
+                err_type="BadRequestError",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+
         async with self.lora_resolver_lock[lora_name]:
             # First check if this LoRA is already loaded
             if lora_name in self.lora_requests:
@@ -298,11 +348,13 @@ def create_error_response(
     message: str,
     err_type: str = "BadRequestError",
     status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+    param: str | None = None,
 ) -> ErrorResponse:
     return ErrorResponse(
         error=ErrorInfo(
             message=sanitize_message(message),
             type=err_type,
             code=status_code.value,
+            param=param,
         )
     )
diff --git a/vllm/entrypoints/pooling/base/serving.py b/vllm/entrypoints/pooling/base/serving.py
index 9bbdde5bbc80..d13ab78a7552 100644
--- a/vllm/entrypoints/pooling/base/serving.py
+++ b/vllm/entrypoints/pooling/base/serving.py
@@ -65,7 +65,7 @@ def __init__(
         )
         self.io_processor = self.init_io_processor(
             model_config=models.model_config,
-            renderer=models.renderer,
+            renderer=engine_client.renderer,
             chat_template_config=self.chat_template_config,
         )
diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py
index c0e32be7ea5e..d816ccbc0b75 100644
--- a/vllm/entrypoints/serve/render/serving.py
+++ b/vllm/entrypoints/serve/render/serving.py
@@ -20,10 +20,9 @@
 from vllm.entrypoints.openai.engine.protocol import (
     ErrorInfo,
     ErrorResponse,
-    ModelCard,
     ModelList,
-    ModelPermission,
 )
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.parser.harmony_utils import (
     get_developer_message,
     get_system_message,
@@ -50,7 +49,7 @@ def __init__(
         model_config: ModelConfig,
         renderer: BaseRenderer,
         io_processor: Any,
-        served_model_names: list[str],
+        models: OpenAIServingModels,
         *,
         request_logger: RequestLogger | None,
         chat_template: str | None,
@@ -65,7 +64,7 @@
         self.model_config = model_config
         self.renderer = renderer
         self.io_processor = io_processor
-        self.served_model_names = served_model_names
+        self.models = models
         self.request_logger = request_logger
         self.chat_template = chat_template
         self.chat_template_content_format: ChatTemplateContentFormatOption = (
@@ -264,18 +263,7 @@ def _make_request_with_harmony(

     async def show_available_models(self) -> ModelList:
         """Returns the models served by this render server."""
-        max_model_len = self.model_config.max_model_len
-        return ModelList(
-            data=[
-                ModelCard(
-                    id=name,
-                    max_model_len=max_model_len,
-                    root=self.model_config.model,
-                    permission=[ModelPermission()],
-                )
-                for name in self.served_model_names
-            ]
-        )
+        return await self.models.show_available_models()

     def create_error_response(
         self,
@@ -333,23 +321,11 @@
             )
         )

-    def _is_model_supported(self, model_name: str) -> bool:
-        """Simplified from OpenAIServing._is_model_supported (no LoRA support)."""
-        return model_name in self.served_model_names
-
     async def _check_model(
         self,
         request: Any,
     ) -> ErrorResponse | None:
-        """Simplified from OpenAIServing._check_model (no LoRA support)."""
-        if self._is_model_supported(request.model):
-            return None
-        return self.create_error_response(
-            message=f"The model `{request.model}` does not exist.",
-            err_type="NotFoundError",
-            status_code=HTTPStatus.NOT_FOUND,
-            param="model",
-        )
+        return await self.models.check_model(request.model)

     def _validate_chat_template(
         self,