diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md
index 5f458750b4..8ba09c2262 100644
--- a/docs/user_guide/examples/offline_inference/bagel.md
+++ b/docs/user_guide/examples/offline_inference/bagel.md
@@ -158,10 +158,23 @@ The default yaml configuration deploys Thinker and DiT on the same GPU. You can
 
 For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)).
 
-1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`).
-2. **Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`).
+In multi-stage omni models, LLM stages and diffusion stages use different TP config fields:
 
-Example configuration for TP=2 on GPUs 0 and 1:
+1. **LLM stage**: set top-level `engine_args.tensor_parallel_size`.
+2. **Diffusion stage**: set `engine_args.parallel_config.tensor_parallel_size`.
+3. **Devices**: set `runtime.devices` to the comma-separated GPU IDs used by the target stage (e.g., `"0,1"`).
+
+Example configuration for the diffusion stage with TP=2 on GPUs 0 and 1:
+```yaml
+    engine_args:
+      parallel_config:
+        tensor_parallel_size: 2
+      # ... (other engine_args fields)
+    runtime:
+      devices: "0,1"
+```
+
+Example configuration for the LLM stage with TP=2 on GPUs 0 and 1:
 ```yaml
     engine_args:
       tensor_parallel_size: 2
diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md
index 4a6094c089..d8df0a71e0 100644
--- a/docs/user_guide/examples/online_serving/bagel.md
+++ b/docs/user_guide/examples/online_serving/bagel.md
@@ -35,6 +35,25 @@ For larger models or multi-GPU environments, you can enable Tensor Parallelism (
 
 1. **Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`).
 
+In multi-stage omni models, LLM stages and diffusion stages use different TP config fields:
+
+- **LLM stage**: set top-level `engine_args.tensor_parallel_size`.
+- **Diffusion stage**: set `engine_args.parallel_config.tensor_parallel_size`.
+- **Devices**: set `runtime.devices` to the comma-separated GPU IDs used by the target stage (e.g., `"0,1"`).
+
+Example configuration for the diffusion stage with TP=2 on GPUs 0 and 1:
+
+```yaml
+    engine_args:
+      parallel_config:
+        tensor_parallel_size: 2
+      # ... (other engine_args fields)
+    runtime:
+      devices: "0,1"
+```
+
+Example configuration for the LLM stage with TP=2 on GPUs 0 and 1:
+
 ```yaml
     engine_args:
       tensor_parallel_size: 2
diff --git a/vllm_omni/model_executor/stage_configs/bagel.yaml b/vllm_omni/model_executor/stage_configs/bagel.yaml
index d1031b574a..48fd08c590 100644
--- a/vllm_omni/model_executor/stage_configs/bagel.yaml
+++ b/vllm_omni/model_executor/stage_configs/bagel.yaml
@@ -59,7 +59,8 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      tensor_parallel_size: 1
+      parallel_config:
+        tensor_parallel_size: 1
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]
diff --git a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml
index 4919395cad..a56f695629 100644
--- a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml
@@ -52,7 +52,8 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      tensor_parallel_size: 1
+      parallel_config:
+        tensor_parallel_size: 1
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]
diff --git a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml
index 632c227f36..f01fdfd533 100644
--- a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml
+++ b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml
@@ -52,8 +52,8 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      tensor_parallel_size: 1
       parallel_config:
+        tensor_parallel_size: 1
         ulysses_degree: 2
         # ring_degree: 2
       omni_kv_config:
diff --git a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml b/vllm_omni/platforms/xpu/stage_configs/bagel.yaml
index 0fc8a25ea5..3ae4edb772 100644
--- a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml
+++ b/vllm_omni/platforms/xpu/stage_configs/bagel.yaml
@@ -53,7 +53,8 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      tensor_parallel_size: 1
+      parallel_config:
+        tensor_parallel_size: 1
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]