Merged
Changes from 4 commits
1 change: 1 addition & 0 deletions doc/BUILD.bazel
@@ -299,6 +299,7 @@ py_test_run_all_subdirectory(
"source/serve/doc_code/stable_diffusion.py",
"source/serve/doc_code/object_detection.py",
"source/serve/doc_code/vllm_example.py",
"source/serve/doc_code/cross_node_parallelism_example.py",
"source/serve/doc_code/llm/llm_yaml_config_example.py",
"source/serve/doc_code/llm/qwen_example.py",
],
203 changes: 203 additions & 0 deletions doc/source/serve/doc_code/cross_node_parallelism_example.py
@@ -0,0 +1,203 @@
# flake8: noqa
"""
Cross-node parallelism examples for Ray Serve LLM.

TP / PP / custom placement group strategies
for multi-node LLM deployments.
"""

# __cross_node_tp_example_start__
import vllm
Bug: Unnecessary Import in Documentation Example

The import vllm in cross_node_parallelism_example.py is unused. Its presence in the documentation examples could mislead users into thinking they need to import vllm directly, even though it's an internal dependency of Ray Serve LLM.


from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with tensor parallelism across 2 GPUs
# Tensor parallelism splits model weights across GPUs
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
Contributor
users don't need to specify "ray" as the backend.

        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_example_end__
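
# A minimal sketch, assuming Ray Serve LLM selects the Ray executor on its own
# (per the contributor comment above, users don't need to pass
# distributed_executor_backend="ray"): the same tensor-parallel config with the
# backend setting omitted. The llm_config_tp_default name is illustrative.
llm_config_tp_default = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,  # executor backend left to the library default
        max_model_len=8192,
    ),
)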

# __cross_node_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with pipeline parallelism across 2 GPUs
# Pipeline parallelism splits model layers across GPUs
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
Contributor
here

        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_pp_example_end__

# __cross_node_tp_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with both tensor and pipeline parallelism
# This example uses 4 GPUs total (2 TP * 2 PP)
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
Contributor
here

        max_model_len=8192,
        enable_chunked_prefill=True,
        max_num_batched_tokens=4096,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_pp_example_end__

# __custom_placement_group_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with custom placement group using PACK strategy
# PACK tries to place workers on as few nodes as possible for locality
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
Contributor
here

        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 2,
        strategy="PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_pack_example_end__

# __custom_placement_group_spread_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with custom placement group using SPREAD strategy
# SPREAD distributes workers across nodes for fault tolerance
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=4,
        distributed_executor_backend="ray",
Contributor
here

        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 4,
        strategy="SPREAD",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_spread_example_end__

# __custom_placement_group_strict_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with custom placement group using STRICT_PACK strategy
# STRICT_PACK ensures all workers are placed on the same node
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="A100",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
Contributor
here
Contributor Author
done

        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 2,
        strategy="STRICT_PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_strict_pack_example_end__
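
Once one of the configurations above is running under serve.run, the application exposes an OpenAI-compatible HTTP API. A minimal client sketch follows, assuming the Serve default address http://localhost:8000 with the /v1 route prefix and the llama-3.1-8b model_id from the examples; the api_key value is a placeholder.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")
response = client.chat.completions.create(
    model="llama-3.1-8b",  # model_id configured in the examples above
    messages=[{"role": "user", "content": "Hello from a cross-node deployment!"}],
)
print(response.choices[0].message.content)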