[serve.llm] Rename APIs to LLMRouter and VLLMService (#50775)
Talked to @richardliaw. The suggestion is to use more concise, easier-to-digest names instead of
VLLMDeploymentImpl and LLMModelRouterDeploymentImpl.

---------

Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
kouroshHakha authored Feb 21, 2025
1 parent 0abb724 commit c99ed82
Showing 8 changed files with 33 additions and 43 deletions.
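
At the call-site level, the rename amounts to an import and class-name swap along these lines (a sketch; the public import path ray.serve.llm.deployments is taken from the docs and tests changed below):

    # Before this commit
    from ray.serve.llm.deployments import VLLMDeploymentImpl, LLMModelRouterDeploymentImpl

    # After this commit
    from ray.serve.llm.deployments import VLLMService, LLMRouter
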
4 changes: 2 additions & 2 deletions doc/source/serve/llm/api.rst
@@ -43,8 +43,8 @@ Deployments
:nosignatures:
:toctree: doc/

- VLLMDeploymentImpl
- LLMModelRouterDeploymentImpl
+ VLLMService
+ LLMRouter


.. currentmodule:: ray.serve.llm.openai_api_models
22 changes: 11 additions & 11 deletions doc/source/serve/llm/overview.rst
@@ -16,12 +16,12 @@ Key Components

The ``ray.serve.llm`` module provides two key deployment types for serving LLMs:

- VLLMDeploymentImpl
+ VLLMService
~~~~~~~~~~~~~~~~~~

- The VLLMDeploymentImpl sets up and manages the vLLM engine for model serving. It can be used standalone or combined with your own custom Ray Serve deployments.
+ The VLLMService sets up and manages the vLLM engine for model serving. It can be used standalone or combined with your own custom Ray Serve deployments.

- LLMModelRouterDeploymentImpl
+ LLMRouter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This deployment provides an OpenAI-compatible FastAPI ingress and routes traffic to the appropriate model for multi-model services. The following endpoints are supported:

@@ -46,14 +46,14 @@ The ``LLMConfig`` class specifies model details such as:
Quickstart Examples
-------------------

- Single Model Deployment through ``VLLMDeploymentImpl``
+ Single Model Deployment through ``VLLMService``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python
from ray import serve
from ray.serve.llm.configs import LLMConfig
- from ray.serve.llm.deployments import VLLMDeploymentImpl
+ from ray.serve.llm.deployments import VLLMService
# Configure the model
llm_config = LLMConfig(
@@ -71,7 +71,7 @@ Single Model Deployment through ``VLLMDeploymentImpl``
)
# Build the deployment directly
- vllm_deployment = VLLMDeploymentImpl.as_deployment()
+ vllm_deployment = VLLMService.as_deployment()
# You can also use .options() to configure the deployment
# e.g. vllm_deployment.options(**llm_config.get_serve_options()).bind(llm_config)
@@ -80,14 +80,14 @@ Single Model Deployment through ``VLLMDeploymentImpl``
model_handle = serve.run(vllm_app)
- Multi-Model Deployment through ``LLMModelRouterDeploymentImpl``
+ Multi-Model Deployment through ``LLMRouter``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python
from ray import serve
from ray.serve.llm.configs import LLMConfig
- from ray.serve.llm.deployments import VLLMDeploymentImpl, LLMModelRouterDeploymentImpl
+ from ray.serve.llm.deployments import VLLMService, LLMRouter
llm_config1 = LLMConfig(
model_loading_config=dict(
@@ -116,9 +116,9 @@ Multi-Model Deployment through ``LLMModelRouterDeploymentImpl``
)
# Deploy the application
- deployment1 = VLLMDeploymentImpl.as_deployment().bind(llm_config1)
- deployment2 = VLLMDeploymentImpl.as_deployment().bind(llm_config2)
- llm_app = LLMModelRouterDeploymentImpl.as_deployment().bind([deployment1, deployment2])
+ deployment1 = VLLMService.as_deployment().bind(llm_config1)
+ deployment2 = VLLMService.as_deployment().bind(llm_config2)
+ llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
serve.run(llm_app)
Querying Models
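
The "Querying Models" section is collapsed in this view. As a rough sketch (not part of this change), a client might query the OpenAI-compatible ingress exposed by LLMRouter like this, assuming the default Serve HTTP address and a hypothetical model id "qwen-0.5b":

    from openai import OpenAI

    # Point the standard OpenAI client at the Serve ingress (address assumed).
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",  # hypothetical model id configured via LLMConfig
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)
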
@@ -12,7 +12,7 @@
LLMEngine,
)
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl,
+ LLMRouter,
)
from ray.llm._internal.serve.configs.constants import (
ENABLE_WORKER_PROCESS_SETUP_HOOK,
@@ -151,6 +151,4 @@ def build_openai_app(llm_serving_args: LLMServingArgs) -> Application:

llm_deployments = _get_llm_deployments(llm_configs)

- return LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ return LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
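
For reference, build_openai_app is the config-driven way to get the same LLMRouter-fronted application; a hedged sketch of calling it (the exact shape of LLMServingArgs is assumed here, not shown in this diff):

    from ray import serve
    from ray.serve.llm.configs import LLMConfig
    from ray.serve.llm.builders import build_openai_app

    # Minimal config; the quickstart above shows additional fields you may need.
    llm_config = LLMConfig(
        model_loading_config=dict(model_id="qwen-0.5b"),  # hypothetical model id
    )

    # Assumption: the builder accepts LLMServingArgs (or an equivalent dict)
    # with an ``llm_configs`` list and returns a Serve Application.
    app = build_openai_app(dict(llm_configs=[llm_config]))
    serve.run(app)
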
@@ -372,7 +372,7 @@ async def process_completions(
)


- class VLLMDeploymentImpl(LLMDeployment):
+ class VLLMService(LLMDeployment):
_default_engine_cls = VLLMEngine
_default_image_retriever_cls = ImageRetriever

@@ -573,7 +573,7 @@ async def _disk_lora_model(self, lora_model_id: str) -> DiskMultiplexConfig:
def as_deployment(
cls, deployment_options: Dict[str, Any] = None
) -> serve.Deployment:
"""Convert the VLLMDeploymentImpl to a Ray Serve deployment.
"""Convert the VLLMService to a Ray Serve deployment.
Args:
deployment_options: A dictionary of deployment options.
@@ -604,8 +604,8 @@ def as_deployment(
health_check_period_s=DEFAULT_HEALTH_CHECK_PERIOD_S,
health_check_timeout_s=DEFAULT_HEALTH_CHECK_TIMEOUT_S,
)
- class VLLMDeployment(VLLMDeploymentImpl):
- # Note (genesu): We are separating the VLLMDeploymentImpl and VLLMDeployment just
+ class VLLMDeployment(VLLMService):
+ # Note (genesu): We are separating the VLLMService and VLLMDeployment just
# to give developers an ability to test the implementation outside the Ray Serve.
# But in practice we should always test the VLLMDeployment class as a Serve
# deployment to ensure all functionalities can be run remotely asynchronously.
@@ -156,7 +156,7 @@ async def _peek_at_openai_json_generator(
return first_response, _openai_json_wrapper(generator, first_response)


- class LLMModelRouterDeploymentImpl:
+ class LLMRouter:
def __init__(
self,
llm_deployments: List[DeploymentHandle],
@@ -10,7 +10,7 @@
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import VLLMDeployment
from ray.llm._internal.serve.configs.server_models import LLMConfig, LoraConfig
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl,
+ LLMRouter,
)
from ray.llm._internal.serve.builders.application_builders import (
get_serve_deployment_args,
@@ -81,9 +81,7 @@ async def test_lora_unavailable_base_model(shutdown_ray_and_serve):
"""Getting the handle for an unavailable model should return a 404."""
llm_config = VLLM_APP.model_copy(deep=True)
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

with pytest.raises(HTTPException) as e:
@@ -101,9 +99,7 @@ async def test_lora_get_model(shutdown_ray_and_serve):
llm_config = VLLM_APP.model_copy(deep=True)
llm_config.model_loading_config.model_id = base_model_id
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

# Case 1: model does not exist.
@@ -130,7 +126,7 @@ async def fake_get_lora_model_metadata(*args, **kwargs):
"max_request_context_length": 4096,
}

- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
+ router_deployment = LLMRouter.as_deployment().bind(
llm_deployments=llm_deployments,
_get_lora_model_metadata_func=fake_get_lora_model_metadata,
)
@@ -153,9 +149,7 @@ async def test_lora_list_base_model(shutdown_ray_and_serve):
llm_config = VLLM_APP.model_copy(deep=True)
llm_config.model_loading_config.model_id = base_model_id
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

models = (await router_handle.models.remote()).data
@@ -221,9 +215,7 @@ async def test_lora_include_adapters_in_list_models(
app.lora_config = LoraConfig(dynamic_lora_loading_path=dynamic_lora_loading_path)

llm_deployments = get_mocked_llm_deployments([app])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

models = (await router_handle.models.remote()).data
10 changes: 5 additions & 5 deletions python/ray/serve/llm/deployments.py
@@ -1,16 +1,16 @@
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import (
- VLLMDeploymentImpl as _VLLMDeploymentImpl,
+ VLLMService as _VLLMService,
)
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl as _LLMModelRouterDeploymentImpl,
+ LLMRouter as _LLMRouter,
)


from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
- class VLLMDeploymentImpl(_VLLMDeploymentImpl):
+ class VLLMService(_VLLMService):
"""The implementation of the VLLM engine deployment.
To build a VLLMDeployment object you should use `build_vllm_deployment` function.
@@ -42,7 +42,7 @@ class VLLMDeploymentImpl(_VLLMDeploymentImpl):
)
# Build the deployment directly
- VLLMDeployment = VLLMDeploymentImpl.as_deployment(llm_config.get_serve_options())
+ VLLMDeployment = VLLMService.as_deployment(llm_config.get_serve_options())
vllm_app = VLLMDeployment.bind(llm_config)
model_handle = serve.run(vllm_app)
@@ -66,7 +66,7 @@ class VLLMDeploymentImpl(_VLLMDeploymentImpl):


@PublicAPI(stability="alpha")
- class LLMModelRouterDeploymentImpl(_LLMModelRouterDeploymentImpl):
+ class LLMRouter(_LLMRouter):

"""The implementation of the OpenAI compatiple model router.
4 changes: 2 additions & 2 deletions python/ray/serve/tests/unit/test_llm_imports.py
@@ -32,8 +32,8 @@ def test_serve_llm_import_does_not_error():
)
with pytest.raises(ImportError):
from ray.serve.llm.deployments import (
- VLLMDeploymentImpl, # noqa: F401
- LLMModelRouterDeploymentImpl, # noqa: F401
+ VLLMService, # noqa: F401
+ LLMRouter, # noqa: F401
)
with pytest.raises(ImportError):
from ray.serve.llm.builders import (

