[serve.llm] Rename APIs to LLMRouter and VLLMService (#50775)
Talked to @richardliaw. The suggestion is to use more concise, easier-to-digest names instead of
VLLMDeploymentImpl and LLMModelRouterDeploymentImpl.

---------

Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
kouroshHakha authored Feb 21, 2025
1 parent 0abb724 commit c99ed82
Showing 8 changed files with 33 additions and 43 deletions.
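
At the call-site level, the rename amounts to an import and class-name swap along these lines (a sketch; the public import path ray.serve.llm.deployments is taken from the docs and tests changed below):

    # Before this commit
    from ray.serve.llm.deployments import VLLMDeploymentImpl, LLMModelRouterDeploymentImpl

    # After this commit
    from ray.serve.llm.deployments import VLLMService, LLMRouter
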
4 changes: 2 additions & 2 deletions doc/source/serve/llm/api.rst
@@ -43,8 +43,8 @@ Deployments
:nosignatures:
:toctree: doc/

- VLLMDeploymentImpl
- LLMModelRouterDeploymentImpl
+ VLLMService
+ LLMRouter


.. currentmodule:: ray.serve.llm.openai_api_models
22 changes: 11 additions & 11 deletions doc/source/serve/llm/overview.rst
@@ -16,12 +16,12 @@ Key Components

The ``ray.serve.llm`` module provides two key deployment types for serving LLMs:

- VLLMDeploymentImpl
+ VLLMService
~~~~~~~~~~~~~~~~~~

- The VLLMDeploymentImpl sets up and manages the vLLM engine for model serving. It can be used standalone or combined with your own custom Ray Serve deployments.
+ The VLLMService sets up and manages the vLLM engine for model serving. It can be used standalone or combined with your own custom Ray Serve deployments.

- LLMModelRouterDeploymentImpl
+ LLMRouter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This deployment provides an OpenAI-compatible FastAPI ingress and routes traffic to the appropriate model for multi-model services. The following endpoints are supported:

@@ -46,14 +46,14 @@ The ``LLMConfig`` class specifies model details such as:
Quickstart Examples
-------------------

- Single Model Deployment through ``VLLMDeploymentImpl``
+ Single Model Deployment through ``VLLMService``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python
from ray import serve
from ray.serve.llm.configs import LLMConfig
- from ray.serve.llm.deployments import VLLMDeploymentImpl
+ from ray.serve.llm.deployments import VLLMService
# Configure the model
llm_config = LLMConfig(
@@ -71,7 +71,7 @@ Single Model Deployment through ``VLLMDeploymentImpl``
)
# Build the deployment directly
- vllm_deployment = VLLMDeploymentImpl.as_deployment()
+ vllm_deployment = VLLMService.as_deployment()
# You can also use .options() to configure the deployment
# e.g. vllm_deployment.options(**llm_config.get_serve_options()).bind(llm_config)
@@ -80,14 +80,14 @@ Single Model Deployment through ``VLLMDeploymentImpl``
model_handle = serve.run(vllm_app)
- Multi-Model Deployment through ``LLMModelRouterDeploymentImpl``
+ Multi-Model Deployment through ``LLMRouter``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python
from ray import serve
from ray.serve.llm.configs import LLMConfig
- from ray.serve.llm.deployments import VLLMDeploymentImpl, LLMModelRouterDeploymentImpl
+ from ray.serve.llm.deployments import VLLMService, LLMRouter
llm_config1 = LLMConfig(
model_loading_config=dict(
@@ -116,9 +116,9 @@ Multi-Model Deployment through ``LLMModelRouterDeploymentImpl``
)
# Deploy the application
- deployment1 = VLLMDeploymentImpl.as_deployment().bind(llm_config1)
- deployment2 = VLLMDeploymentImpl.as_deployment().bind(llm_config2)
- llm_app = LLMModelRouterDeploymentImpl.as_deployment().bind([deployment1, deployment2])
+ deployment1 = VLLMService.as_deployment().bind(llm_config1)
+ deployment2 = VLLMService.as_deployment().bind(llm_config2)
+ llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
serve.run(llm_app)
Querying Models
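
The "Querying Models" section is collapsed in this view. As a rough sketch (not part of this change), a client might query the OpenAI-compatible ingress exposed by LLMRouter like this, assuming the default Serve HTTP address and a hypothetical model id "qwen-0.5b":

    from openai import OpenAI

    # Point the standard OpenAI client at the Serve ingress (address assumed).
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",  # hypothetical model id configured via LLMConfig
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)
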
@@ -12,7 +12,7 @@
LLMEngine,
)
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl,
+ LLMRouter,
)
from ray.llm._internal.serve.configs.constants import (
ENABLE_WORKER_PROCESS_SETUP_HOOK,
@@ -151,6 +151,4 @@ def build_openai_app(llm_serving_args: LLMServingArgs) -> Application:

llm_deployments = _get_llm_deployments(llm_configs)

- return LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ return LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
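
For reference, build_openai_app is the config-driven way to get the same LLMRouter-fronted application; a hedged sketch of calling it (the exact shape of LLMServingArgs is assumed here, not shown in this diff):

    from ray import serve
    from ray.serve.llm.configs import LLMConfig
    from ray.serve.llm.builders import build_openai_app

    # Minimal config; the quickstart above shows additional fields you may need.
    llm_config = LLMConfig(
        model_loading_config=dict(model_id="qwen-0.5b"),  # hypothetical model id
    )

    # Assumption: the builder accepts LLMServingArgs (or an equivalent dict)
    # with an ``llm_configs`` list and returns a Serve Application.
    app = build_openai_app(dict(llm_configs=[llm_config]))
    serve.run(app)
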
@@ -372,7 +372,7 @@ async def process_completions(
)


- class VLLMDeploymentImpl(LLMDeployment):
+ class VLLMService(LLMDeployment):
_default_engine_cls = VLLMEngine
_default_image_retriever_cls = ImageRetriever

@@ -573,7 +573,7 @@ async def _disk_lora_model(self, lora_model_id: str) -> DiskMultiplexConfig:
def as_deployment(
cls, deployment_options: Dict[str, Any] = None
) -> serve.Deployment:
"""Convert the VLLMDeploymentImpl to a Ray Serve deployment.
"""Convert the VLLMService to a Ray Serve deployment.
Args:
deployment_options: A dictionary of deployment options.
@@ -604,8 +604,8 @@ def as_deployment(
health_check_period_s=DEFAULT_HEALTH_CHECK_PERIOD_S,
health_check_timeout_s=DEFAULT_HEALTH_CHECK_TIMEOUT_S,
)
- class VLLMDeployment(VLLMDeploymentImpl):
- # Note (genesu): We are separating the VLLMDeploymentImpl and VLLMDeployment just
+ class VLLMDeployment(VLLMService):
+ # Note (genesu): We are separating the VLLMService and VLLMDeployment just
# to give developers an ability to test the implementation outside the Ray Serve.
# But in practice we should always test the VLLMDeployment class as a Serve
# deployment to ensure all functionalities can be run remotely asynchronously.
@@ -156,7 +156,7 @@ async def _peek_at_openai_json_generator(
return first_response, _openai_json_wrapper(generator, first_response)


- class LLMModelRouterDeploymentImpl:
+ class LLMRouter:
def __init__(
self,
llm_deployments: List[DeploymentHandle],
@@ -10,7 +10,7 @@
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import VLLMDeployment
from ray.llm._internal.serve.configs.server_models import LLMConfig, LoraConfig
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl,
+ LLMRouter,
)
from ray.llm._internal.serve.builders.application_builders import (
get_serve_deployment_args,
@@ -81,9 +81,7 @@ async def test_lora_unavailable_base_model(shutdown_ray_and_serve):
"""Getting the handle for an unavailable model should return a 404."""
llm_config = VLLM_APP.model_copy(deep=True)
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

with pytest.raises(HTTPException) as e:
@@ -101,9 +99,7 @@ async def test_lora_get_model(shutdown_ray_and_serve):
llm_config = VLLM_APP.model_copy(deep=True)
llm_config.model_loading_config.model_id = base_model_id
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

# Case 1: model does not exist.
@@ -130,7 +126,7 @@ async def fake_get_lora_model_metadata(*args, **kwargs):
"max_request_context_length": 4096,
}

- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
+ router_deployment = LLMRouter.as_deployment().bind(
llm_deployments=llm_deployments,
_get_lora_model_metadata_func=fake_get_lora_model_metadata,
)
@@ -153,9 +149,7 @@ async def test_lora_list_base_model(shutdown_ray_and_serve):
llm_config = VLLM_APP.model_copy(deep=True)
llm_config.model_loading_config.model_id = base_model_id
llm_deployments = get_mocked_llm_deployments([llm_config])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

models = (await router_handle.models.remote()).data
@@ -221,9 +215,7 @@ async def test_lora_include_adapters_in_list_models(
app.lora_config = LoraConfig(dynamic_lora_loading_path=dynamic_lora_loading_path)

llm_deployments = get_mocked_llm_deployments([app])
- router_deployment = LLMModelRouterDeploymentImpl.as_deployment().bind(
-     llm_deployments=llm_deployments
- )
+ router_deployment = LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
router_handle = serve.run(router_deployment)

models = (await router_handle.models.remote()).data
10 changes: 5 additions & 5 deletions python/ray/serve/llm/deployments.py
@@ -1,16 +1,16 @@
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import (
- VLLMDeploymentImpl as _VLLMDeploymentImpl,
+ VLLMService as _VLLMService,
)
from ray.llm._internal.serve.deployments.routers.router import (
- LLMModelRouterDeploymentImpl as _LLMModelRouterDeploymentImpl,
+ LLMRouter as _LLMRouter,
)


from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
- class VLLMDeploymentImpl(_VLLMDeploymentImpl):
+ class VLLMService(_VLLMService):
"""The implementation of the VLLM engine deployment.
To build a VLLMDeployment object you should use `build_vllm_deployment` function.
@@ -42,7 +42,7 @@ class VLLMDeploymentImpl(_VLLMDeploymentImpl):
)
# Build the deployment directly
- VLLMDeployment = VLLMDeploymentImpl.as_deployment(llm_config.get_serve_options())
+ VLLMDeployment = VLLMService.as_deployment(llm_config.get_serve_options())
vllm_app = VLLMDeployment.bind(llm_config)
model_handle = serve.run(vllm_app)
@@ -66,7 +66,7 @@ class VLLMDeploymentImpl(_VLLMDeploymentImpl):


@PublicAPI(stability="alpha")
- class LLMModelRouterDeploymentImpl(_LLMModelRouterDeploymentImpl):
+ class LLMRouter(_LLMRouter):

"""The implementation of the OpenAI compatiple model router.
4 changes: 2 additions & 2 deletions python/ray/serve/tests/unit/test_llm_imports.py
@@ -32,8 +32,8 @@ def test_serve_llm_import_does_not_error():
)
with pytest.raises(ImportError):
from ray.serve.llm.deployments import (
- VLLMDeploymentImpl, # noqa: F401
- LLMModelRouterDeploymentImpl, # noqa: F401
+ VLLMService, # noqa: F401
+ LLMRouter, # noqa: F401
)
with pytest.raises(ImportError):
from ray.serve.llm.builders import (

