[docs][serve][llm] examples and doc for cross-node TP/PP in Serve #57715
File: `cross_node_parallelism_example.py`

```python
# flake8: noqa
"""
Cross-node parallelism examples for Ray Serve LLM.

TP / PP / custom placement group strategies for multi-node LLM deployments.
"""
```
```python
# __cross_node_tp_example_start__
import vllm
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with tensor parallelism across 2 GPUs.
# Tensor parallelism splits model weights across GPUs.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_example_end__
```
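Once `serve.run` is up, the deployment can be exercised through its OpenAI-compatible HTTP endpoint. The snippet below is a minimal client sketch, assuming the default Serve address `http://localhost:8000/v1` and the `model_id` configured above (`llama-3.1-8b`); adjust both for your cluster.

```python
# Minimal client sketch (assumes the default Serve HTTP port 8000; the API key
# is a placeholder unless you configured authentication).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

response = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[{"role": "user", "content": "Hello from a cross-node TP deployment!"}],
)
print(response.choices[0].message.content)
```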
```python
# __cross_node_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with pipeline parallelism across 2 GPUs.
# Pipeline parallelism splits model layers across GPUs.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_pp_example_end__
```
```python
# __cross_node_tp_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with both tensor and pipeline parallelism.
# This example uses 4 GPUs total (2 TP * 2 PP).
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
        enable_chunked_prefill=True,
        max_num_batched_tokens=4096,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_pp_example_end__
```
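The 2 TP x 2 PP configuration above needs 4 GPUs per replica, which may span multiple nodes. Before calling `serve.run`, it can help to sanity-check that the cluster exposes enough GPUs in aggregate; this is a hedged sketch using the standard Ray API and isn't part of the example file.

```python
# Sketch: confirm the cluster has at least TP * PP = 4 GPUs in total.
import ray

ray.init(address="auto")  # attach to the running Ray cluster
gpus_available = ray.cluster_resources().get("GPU", 0)
assert gpus_available >= 4, f"Need at least 4 GPUs, found {gpus_available}"
```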
```python
# __custom_placement_group_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the PACK strategy.
# PACK tries to place workers on as few nodes as possible for locality.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 2,
        strategy="PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_pack_example_end__
```
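For intuition, `placement_group_config` corresponds to a Ray placement group that Serve creates and manages on your behalf. The snippet below is an illustrative sketch of the equivalent group built directly with `ray.util.placement_group`; you wouldn't normally create it yourself.

```python
# Illustrative only: the raw placement group equivalent of the config above
# (Serve creates and manages this internally).
import ray
from ray.util.placement_group import placement_group

ray.init(address="auto")
pg = placement_group(bundles=[{"GPU": 1}] * 2, strategy="PACK")
ray.get(pg.ready())  # blocks until both single-GPU bundles are scheduled
```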
```python
# __custom_placement_group_spread_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the SPREAD strategy.
# SPREAD distributes workers across nodes for fault tolerance.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=4,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 4,
        strategy="SPREAD",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_spread_example_end__
```
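To confirm that SPREAD actually distributed the replica's workers, you can list the cluster's placement groups after deployment. This is a hedged sketch using `ray.util.placement_group_table`; field names may vary slightly across Ray versions, so it reads them defensively.

```python
# Sketch: print each placement group's strategy and state after deployment.
import ray
from ray.util import placement_group_table

ray.init(address="auto")
for pg_id, info in placement_group_table().items():
    print(pg_id, info.get("strategy"), info.get("state"))
```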
```python
# __custom_placement_group_strict_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the STRICT_PACK strategy.
# STRICT_PACK ensures all workers are placed on the same node.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="A100",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 2,
        strategy="STRICT_PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_strict_pack_example_end__
```
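STRICT_PACK only schedules if a single node can host every bundle, so with two single-GPU bundles each replica needs a node with at least two GPUs of the requested accelerator type. Below is a hedged sketch for checking per-node capacity with `ray.nodes()`; note it reports each node's total GPUs, not currently free GPUs.

```python
# Sketch: check that at least one node can satisfy a STRICT_PACK group of 2 GPUs.
# ray.nodes() reports total (not free) resources per node.
import ray

ray.init(address="auto")
per_node_gpus = [n["Resources"].get("GPU", 0) for n in ray.nodes() if n["Alive"]]
assert any(g >= 2 for g in per_node_gpus), "No single node exposes 2 GPUs for STRICT_PACK"
```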
Bug: Unnecessary Import in Documentation Example

The `import vllm` in `cross_node_parallelism_example.py` is unused. Its presence in the documentation examples could mislead users into thinking they need to import `vllm` directly, even though it's an internal dependency of Ray Serve LLM.