-
-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[Frontend] Allow engine_client=None in OpenAIServingModels
#36655
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,8 @@ | |
| from collections import defaultdict | ||
| from http import HTTPStatus | ||
|
|
||
| import vllm.envs as envs | ||
| from vllm.config import ModelConfig | ||
| from vllm.engine.protocol import EngineClient | ||
| from vllm.entrypoints.openai.engine.protocol import ( | ||
| ErrorInfo, | ||
|
|
@@ -38,15 +40,20 @@ class OpenAIServingModels: | |
|
|
||
| def __init__( | ||
| self, | ||
| engine_client: EngineClient, | ||
| engine_client: EngineClient | None, | ||
| base_model_paths: list[BaseModelPath], | ||
| *, | ||
| model_config: ModelConfig | None = None, | ||
| lora_modules: list[LoRAModulePath] | None = None, | ||
| ): | ||
| super().__init__() | ||
|
|
||
| self.engine_client = engine_client | ||
| self.base_model_paths = base_model_paths | ||
| if model_config is not None: | ||
| self.model_config = model_config | ||
| elif engine_client is not None: | ||
| self.model_config = engine_client.model_config | ||
| else: | ||
| raise ValueError("model_config must be provided when engine_client is None") | ||
|
|
||
| self.static_lora_modules = lora_modules | ||
| self.lora_requests: dict[str, LoRARequest] = {} | ||
|
|
@@ -59,11 +66,6 @@ def __init__( | |
| ) | ||
| self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock) | ||
|
|
||
| self.model_config = self.engine_client.model_config | ||
| self.renderer = self.engine_client.renderer | ||
| self.io_processor = self.engine_client.io_processor | ||
| self.input_processor = self.engine_client.input_processor | ||
|
|
||
| async def init_static_loras(self): | ||
| """Loads all static LoRA modules. | ||
| Raises if any fail to load""" | ||
|
|
@@ -79,7 +81,9 @@ async def init_static_loras(self): | |
| if isinstance(load_result, ErrorResponse): | ||
| raise ValueError(load_result.error.message) | ||
|
|
||
| def is_base_model(self, model_name) -> bool: | ||
| def is_base_model(self, model_name: str | None) -> bool: | ||
| if not model_name: | ||
| return True | ||
| return any(model.name == model_name for model in self.base_model_paths) | ||
|
|
||
| def model_name(self, lora_request: LoRARequest | None = None) -> str: | ||
|
|
@@ -94,6 +98,38 @@ def model_name(self, lora_request: LoRARequest | None = None) -> str: | |
| return lora_request.lora_name | ||
| return self.base_model_paths[0].name | ||
|
|
||
| async def check_model(self, model_name: str | None) -> ErrorResponse | None: | ||
| """Return an ErrorResponse if model_name is not served, else None. | ||
|
|
||
| When VLLM_ALLOW_RUNTIME_LORA_UPDATING is set and the model is not | ||
| already known, attempts to resolve and load it as a LoRA adapter. | ||
| """ | ||
| error_response = None | ||
|
|
||
| if self.is_base_model(model_name): | ||
| return None | ||
| if model_name in self.lora_requests: | ||
| return None | ||
| if ( | ||
| envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING | ||
| and model_name | ||
| and (load_result := await self.resolve_lora(model_name)) | ||
| ): | ||
| if isinstance(load_result, LoRARequest): | ||
| return None | ||
| if ( | ||
| isinstance(load_result, ErrorResponse) | ||
| and load_result.error.code == HTTPStatus.BAD_REQUEST.value | ||
| ): | ||
| error_response = load_result | ||
|
|
||
| return error_response or create_error_response( | ||
| message=f"The model `{model_name}` does not exist.", | ||
| err_type="NotFoundError", | ||
| status_code=HTTPStatus.NOT_FOUND, | ||
| param="model", | ||
| ) | ||
|
|
||
| async def show_available_models(self) -> ModelList: | ||
| """Show available models. This includes the base model and all adapters.""" | ||
| max_model_len = self.model_config.max_model_len | ||
|
|
@@ -124,6 +160,13 @@ async def show_available_models(self) -> ModelList: | |
| async def load_lora_adapter( | ||
| self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None | ||
| ) -> ErrorResponse | str: | ||
| if self.engine_client is None: | ||
| return create_error_response( | ||
| message="LoRA adapters are not supported in render-only mode.", | ||
| err_type="BadRequestError", | ||
| status_code=HTTPStatus.BAD_REQUEST, | ||
| ) | ||
|
Comment on lines
+163
to
+168
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Contributor: To avoid duplicating this check, you could add a private method. For example: def _check_lora_supported(self) -> ErrorResponse | None:
"""Return an error if LoRA adapters are not supported, else None."""
if self.engine_client is None:
return create_error_response(
message="LoRA adapters are not supported in render-only mode.",
err_type="BadRequestError",
status_code=HTTPStatus.BAD_REQUEST,
)
    return None

And then call it from both methods:

if (error := self._check_lora_supported()) is not None:
    return error |
||
|
|
||
| lora_name = request.lora_name | ||
|
|
||
| # Ensure atomicity based on the lora name | ||
|
|
@@ -240,6 +283,13 @@ async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse: | |
| ErrorResponse (404) if no resolver finds the adapter. | ||
| ErrorResponse (400) if adapter(s) are found but none load. | ||
| """ | ||
| if self.engine_client is None: | ||
| return create_error_response( | ||
| message="LoRA adapters are not supported in render-only mode.", | ||
| err_type="BadRequestError", | ||
| status_code=HTTPStatus.BAD_REQUEST, | ||
| ) | ||
|
|
||
| async with self.lora_resolver_lock[lora_name]: | ||
| # First check if this LoRA is already loaded | ||
| if lora_name in self.lora_requests: | ||
|
|
@@ -298,11 +348,13 @@ def create_error_response( | |
| message: str, | ||
| err_type: str = "BadRequestError", | ||
| status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, | ||
| param: str | None = None, | ||
| ) -> ErrorResponse: | ||
| return ErrorResponse( | ||
| error=ErrorInfo( | ||
| message=sanitize_message(message), | ||
| type=err_type, | ||
| code=status_code.value, | ||
| param=param, | ||
| ) | ||
| ) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this somewhat goes against #36536 (comment)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@DarkLight1337 Oh, that wasn't my intention. I tried to unify the engine-free path with the default one, since the model check in the original OpenAI models class can dynamically load LoRAs.
What I'm trying to achieve here and in the previous PR is for the OpenAI render path to share the same OpenAI models class as the rest of the serving classes. Since the full model check can include engine operations, I came up with inheritance/a shared object for both paths, because composition would leave the renderer with an incomplete model check. There's also a chance I didn't fully get your intentions — in that case I'd love a clarification here :)
Thanks for taking the time to review these
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like to achieve a cleaner separation between the code paths with vs. without engine client. In that case I prefer the previous PR #36536