diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 5f458750b4..8ba09c2262 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -158,10 +158,23 @@ The default yaml configuration deploys Thinker and DiT on the same GPU. You can For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). -1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`). -2. **Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`). +In multi-stage omni models, LLM stages and diffusion stages use different TP config fields: -Example configuration for TP=2 on GPUs 0 and 1: +1. **LLM stage**: set `tensor_parallel_size` directly under `engine_args` (i.e., `engine_args.tensor_parallel_size`). +2. **Diffusion stage**: set `tensor_parallel_size` under `engine_args.parallel_config` (i.e., `engine_args.parallel_config.tensor_parallel_size`). +3. **Either stage**: set `runtime.devices` to the comma-separated GPU IDs used by that stage (e.g., `"0,1"`). + +Example configuration for the diffusion stage with TP=2 on GPUs 0 and 1: +```yaml + engine_args: + parallel_config: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" +``` + +Example configuration for the LLM stage with TP=2 on GPUs 0 and 1: ```yaml engine_args: tensor_parallel_size: 2 diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index 4a6094c089..d8df0a71e0 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -35,6 +35,25 @@ For larger models or multi-GPU environments, you can enable Tensor Parallelism ( 1. 
**Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`). +In multi-stage omni models, LLM stages and diffusion stages use different TP config fields: + +1. **LLM stage**: set `tensor_parallel_size` directly under `engine_args` (i.e., `engine_args.tensor_parallel_size`). +2. **Diffusion stage**: set `tensor_parallel_size` under `engine_args.parallel_config` (i.e., `engine_args.parallel_config.tensor_parallel_size`). +3. **Either stage**: set `runtime.devices` to the comma-separated GPU IDs used by that stage (e.g., `"0,1"`). + +Example configuration for the diffusion stage with TP=2 on GPUs 0 and 1: + +```yaml + engine_args: + parallel_config: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" +``` + +Example configuration for the LLM stage with TP=2 on GPUs 0 and 1: + ```yaml engine_args: tensor_parallel_size: 2 diff --git a/vllm_omni/model_executor/stage_configs/bagel.yaml b/vllm_omni/model_executor/stage_configs/bagel.yaml index d1031b574a..48fd08c590 100644 --- a/vllm_omni/model_executor/stage_configs/bagel.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel.yaml @@ -59,7 +59,8 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - tensor_parallel_size: 1 + parallel_config: + tensor_parallel_size: 1 omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml index 4919395cad..a56f695629 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml @@ -52,7 +52,8 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - tensor_parallel_size: 1 + parallel_config: + tensor_parallel_size: 
1 omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml index 632c227f36..f01fdfd533 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml @@ -52,8 +52,8 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - tensor_parallel_size: 1 parallel_config: + tensor_parallel_size: 1 ulysses_degree: 2 # ring_degree: 2 omni_kv_config: diff --git a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml b/vllm_omni/platforms/xpu/stage_configs/bagel.yaml index 0fc8a25ea5..3ae4edb772 100644 --- a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml +++ b/vllm_omni/platforms/xpu/stage_configs/bagel.yaml @@ -53,7 +53,8 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - tensor_parallel_size: 1 + parallel_config: + tensor_parallel_size: 1 omni_kv_config: need_recv_cache: true engine_input_source: [0]