From 2daa5731e73f18b32f329ec47d1ae250d0c5a821 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Fri, 17 Apr 2026 03:09:19 +0000
Subject: [PATCH 01/12] pass tp size to diffusion config

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 vllm_omni/diffusion/data.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index b9307657f5c..7347edd5127 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -776,6 +776,15 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig":
         if "diffusers_call_kwargs" in kwargs and kwargs["diffusers_call_kwargs"] is None:
             kwargs["diffusers_call_kwargs"] = {}
 
+        # Forward top-level parallel knobs (e.g. --tensor-parallel-size from CLI)
+        # into parallel_config so the diffusion engine sees them.
+        par = kwargs.get("parallel_config", {})
+        if isinstance(par, Mapping):
+            par = dict(par)
+            if "tensor_parallel_size" in kwargs and "tensor_parallel_size" not in par:
+                par["tensor_parallel_size"] = kwargs["tensor_parallel_size"]
+            kwargs["parallel_config"] = par
+
         # Filter kwargs to only include valid fields
         valid_fields = {f.name for f in fields(cls)}
         filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}

From a41e8f078f56a62b70369d292b748ebc9061ecaa Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Fri, 17 Apr 2026 07:52:46 +0000
Subject: [PATCH 02/12] update bagel CI to use real tp devices

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 .../online_serving/test_bagel_expansion.py    | 54 ++++++++++++++-----
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index 21142439bee..df0b8da8ba0 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -14,13 +14,21 @@
 512x512 resolution.
 """
 
+from pathlib import Path
+
 import pytest
 
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
+from tests.helpers.stage_config import modify_stage_config
 
 pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
+# This test uses the default Bagel YAML, and CLI does not control devices.We modify yaml file directly.
+_BAGEL_DEFAULT_YAML = str(
+    Path(__file__).resolve().parents[3] / "vllm_omni" / "model_executor" / "stage_configs" / "bagel.yaml"
+)
+
 PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution."
 NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark"
 
@@ -28,6 +36,36 @@
 PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2)
 
 
+def _make_tp_cases(model: str, tp_size: int):
+    """Build Bagel TP test cases with devices auto-derived from tp_size.
+    Devices can not be set through CLI args, so we set them in the YAML.
+    """
+    # Dit devices start from 0, due to CI GPU usage constraint,
+    # for those GPUs that encountered OOM, adjust the offset accordingly.
+    devices = ",".join(str(i) for i in range(tp_size))
+    return [
+        pytest.param(
+            OmniServerParams(
+                model=model,
+                stage_config_path=modify_stage_config(
+                    _BAGEL_DEFAULT_YAML,
+                    updates={
+                        "stage_args": {
+                            1: {
+                                "runtime.devices": devices,
+                                "engine_args.parallel_config.tensor_parallel_size": tp_size,
+                            },
+                        },
+                    },
+                ),
+                server_args=["--cache-backend", "cache_dit"],
+            ),
+            id=f"parallel_tp_{tp_size}",
+            marks=PARALLEL_FEATURE_MARKS,
+        ),
+    ]
+
+
 def _get_diffusion_feature_cases(model: str):
     """Return L4 diffusion feature cases for Bagel.
     TeaCache, Cache-DiT, CFG-Parallel, Tensor-Parallel,
@@ -74,19 +112,9 @@ def _get_diffusion_feature_cases(model: str):
             marks=PARALLEL_FEATURE_MARKS,
         ),
         # Tensor-Parallel size 2 (2 GPUs, Cache-DiT backend)
-        pytest.param(
-            OmniServerParams(
-                model=model,
-                server_args=[
-                    "--cache-backend",
-                    "cache_dit",
-                    "--tensor-parallel-size",
-                    "2",
-                ],
-            ),
-            id="parallel_tp_2",
-            marks=[*PARALLEL_FEATURE_MARKS, pytest.mark.skip(reason="issue: #2862")],
-        ),
+        # Stage 1 (DiT) needs visible GPUs matching TP size; the default YAML
+        # only exposes device "0", so we patch it here.
+        *_make_tp_cases(model, tp_size=2),
         # Ulysses-SP degree=2 (2 GPUs)
         pytest.param(
             OmniServerParams(

From 26352d1e819c490ee6117da819c96dcb7a96ccb6 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Mon, 27 Apr 2026 03:56:21 +0000
Subject: [PATCH 03/12] pass TP size via CLI; keep parallel knobs for per-stage merge

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/e2e/online_serving/test_bagel_expansion.py | 11 +++--------
 vllm_omni/engine/async_omni_engine.py            | 12 ++++++++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index df0b8da8ba0..ce60db18741 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -14,8 +14,6 @@
 512x512 resolution.
 """
 
-from pathlib import Path
-
 import pytest
 
 from tests.helpers.mark import hardware_marks
@@ -24,10 +22,8 @@
 
 pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
-# This test uses the default Bagel YAML, and CLI does not control devices.We modify yaml file directly.
-_BAGEL_DEFAULT_YAML = str(
-    Path(__file__).resolve().parents[3] / "vllm_omni" / "model_executor" / "stage_configs" / "bagel.yaml"
-)
+# This test uses the Bagel stage-config YAML under model_executor; CLI still carries TP.
+_BAGEL_DEFAULT_YAML = "vllm_omni/model_executor/stage_configs/bagel.yaml"
 
 PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution."
 NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark"
@@ -53,12 +49,11 @@ def _make_tp_cases(model: str, tp_size: int):
                         "stage_args": {
                             1: {
                                 "runtime.devices": devices,
-                                "engine_args.parallel_config.tensor_parallel_size": tp_size,
                             },
                         },
                     },
                 ),
-                server_args=["--cache-backend", "cache_dit"],
+                server_args=["--cache-backend", "cache_dit", "--tensor-parallel-size", str(tp_size)],
             ),
             id=f"parallel_tp_{tp_size}",
             marks=PARALLEL_FEATURE_MARKS,
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 61da4388be0..76870571164 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -110,6 +110,18 @@
         "worker_extension_cls",
         "allowed_local_media_path",
         "allowed_media_domains",
+        # Legacy stage-config YAMLs may intentionally leave parallel or
+        # distributed knobs unspecified at the stage level and rely on
+        # top-level CLI values to fill them in during the per-stage merge.
+        # Keep these fields so stages that omit them can inherit CLI values,
+        # while stages with explicit YAML values still win because the legacy
+        # stage-config loader prefers stage-local engine args.
+        "tensor_parallel_size",
+        "pipeline_parallel_size",
+        "data_parallel_size",
+        "data_parallel_size_local",
+        "data_parallel_backend",
+        "distributed_executor_backend",
     }
 )
 

From 029109f290816451affa3da667299768185900f5 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Mon, 27 Apr 2026 06:19:37 +0000
Subject: [PATCH 04/12] update strip-args unit tests for kept tensor_parallel_size

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/engine/test_arg_utils.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 0d61f6a675b..26ce07b73b7 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -230,7 +230,7 @@ def test_voxcpm_model_arch_injects_model_type_override(mocker):
 def test_strip_single_engine_args():
     """_strip_single_engine_args should remove EngineArgs fields but keep omni fields."""
     kwargs = {
-        # Parent EngineArgs fields — should be stripped
+        # Parent EngineArgs fields — stripped unless explicitly allowlisted
         "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}',
         "tensor_parallel_size": 4,
         "gpu_memory_utilization": 0.9,
@@ -248,7 +248,7 @@ def test_strip_single_engine_args():
 
     # Stripped — parent EngineArgs fields
     assert "compilation_config" not in filtered
-    assert "tensor_parallel_size" not in filtered
+    assert filtered["tensor_parallel_size"] == 4
     assert "gpu_memory_utilization" not in filtered
     assert "model" not in filtered
 
@@ -278,15 +278,18 @@ def test_strip_single_engine_args_model_does_not_trigger_warning(mocker):
     mock_warn.assert_not_called()
 
     # When there *are* genuinely surprising overrides alongside model,
-    # the warning should mention them but not model.
+    # the warning should mention them but not model. Keep-listed fields such as
+    # tensor_parallel_size are intentionally passed through and should not warn.
     AsyncOmniEngine._strip_single_engine_args(
         {
             "model": "some/model",
+            "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}',
             "tensor_parallel_size": 4,
             "custom_pipeline_args": {"pipeline_class": "my.Pipeline"},
         }
     )
     mock_warn.assert_called_once()
     warned_args = mock_warn.call_args[0][-1]  # the formatted arg list
-    assert "tensor_parallel_size" in warned_args
+    assert "compilation_config" in warned_args
+    assert "tensor_parallel_size" not in warned_args
     assert "model" not in warned_args

From c7b0c69cef6842db5a427b7bdebb1ea2753fbfbb Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Mon, 27 Apr 2026 10:02:56 +0000
Subject: [PATCH 05/12] keep only tensor_parallel_size among the parallel knobs

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 vllm_omni/engine/async_omni_engine.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 76870571164..dbd5db3d92f 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -117,11 +117,6 @@
         # while stages with explicit YAML values still win because the legacy
         # stage-config loader prefers stage-local engine args.
         "tensor_parallel_size",
-        "pipeline_parallel_size",
-        "data_parallel_size",
-        "data_parallel_size_local",
-        "data_parallel_backend",
-        "distributed_executor_backend",
     }
 )
 

From 30882e70b87a0ea336c395673b9f941d76bbbbef Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Wed, 29 Apr 2026 03:24:02 +0000
Subject: [PATCH 06/12] use deployment yaml

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/e2e/online_serving/test_bagel_expansion.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index 6b3d8caa3f6..7f141918278 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -19,12 +19,11 @@
 
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
-from tests.helpers.stage_config import modify_stage_config
+from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
 
 pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
-# This test uses the Bagel stage-config YAML under model_executor; CLI still carries TP.
-_BAGEL_DEFAULT_YAML = "vllm_omni/model_executor/stage_configs/bagel.yaml"
+_BAGEL_DEFAULT_YAML = get_deploy_config_path("ci/bagel.yaml")
 
 PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution."
 NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark"

From f99b0ed2000d96bae231c1669cb74ef610f56673 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Wed, 29 Apr 2026 06:15:42 +0000
Subject: [PATCH 07/12] fix device name

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/e2e/online_serving/test_bagel_expansion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index 7f141918278..8ac049b2b01 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -48,7 +48,7 @@ def _make_tp_cases(model: str, tp_size: int):
                     updates={
                         "stage_args": {
                             1: {
-                                "runtime.devices": devices,
+                                "devices": devices,
                             },
                         },
                     },

From f8766a01452ed5a40f40be59c8310200353228da Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Wed, 29 Apr 2026 08:09:50 +0000
Subject: [PATCH 08/12] set a default tp value so that not all stages share
 the same tp size

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/e2e/online_serving/test_bagel_expansion.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index 8ac049b2b01..544f82f9a5d 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -47,6 +47,9 @@ def _make_tp_cases(model: str, tp_size: int):
                     _BAGEL_DEFAULT_YAML,
                     updates={
                         "stage_args": {
+                            0: {
+                                "tensor_parallel_size": 1,
+                            },
                             1: {
                                 "devices": devices,
                             },

From 250a2fee215f52f3b50f4ae47c7c2f32827afb06 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Wed, 29 Apr 2026 13:55:18 +0000
Subject: [PATCH 09/12] pass bagel stage overrides via --stage-overrides CLI arg

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 .../online_serving/test_bagel_expansion.py    | 36 +++++++++++--------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index 544f82f9a5d..f4b5d80e945 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -15,11 +15,13 @@
 512x512 resolution.
 """
 
+import json
+
 import pytest
 
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from tests.helpers.stage_config import get_deploy_config_path
 
 pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
@@ -39,24 +41,28 @@ def _make_tp_cases(model: str, tp_size: int):
     # Dit devices start from 0, due to CI GPU usage constraint,
     # for those GPUs that encountered OOM, adjust the offset accordingly.
     devices = ",".join(str(i) for i in range(tp_size))
+    stage_overrides = json.dumps(
+        {
+            "0": {
+                "tensor_parallel_size": 1,
+                "gpu_memory_utilization": 0.95,
+            },
+            "1": {"devices": devices},
+        }
+    )
     return [
         pytest.param(
             OmniServerParams(
                 model=model,
-                stage_config_path=modify_stage_config(
-                    _BAGEL_DEFAULT_YAML,
-                    updates={
-                        "stage_args": {
-                            0: {
-                                "tensor_parallel_size": 1,
-                            },
-                            1: {
-                                "devices": devices,
-                            },
-                        },
-                    },
-                ),
-                server_args=["--cache-backend", "cache_dit", "--tensor-parallel-size", str(tp_size)],
+                stage_config_path=_BAGEL_DEFAULT_YAML,
+                server_args=[
+                    "--stage-overrides",
+                    stage_overrides,
+                    "--cache-backend",
+                    "cache_dit",
+                    "--tensor-parallel-size",
+                    str(tp_size),
+                ],
             ),
             id=f"parallel_tp_{tp_size}",
             marks=PARALLEL_FEATURE_MARKS,

From 27e7f75e8b4a6774602eed49d9a014edfc3ec225 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Wed, 29 Apr 2026 16:27:21 +0000
Subject: [PATCH 10/12] lower stage 0 gpu_memory_utilization from 0.95 to 0.75

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 tests/e2e/online_serving/test_bagel_expansion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index f4b5d80e945..cf93f9d7a0f 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -45,7 +45,7 @@ def _make_tp_cases(model: str, tp_size: int):
         {
             "0": {
                 "tensor_parallel_size": 1,
-                "gpu_memory_utilization": 0.95,
+                "gpu_memory_utilization": 0.75,
             },
             "1": {"devices": devices},
         }

From 04b54c23ee0104b6ca93b5660296086e2cb9ffc5 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Thu, 30 Apr 2026 00:42:14 +0000
Subject: [PATCH 11/12] run bagel tests in own 3-GPU CI step; DiT TP on GPUs 1..N

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 .buildkite/test-nightly.yml                   | 42 ++++++++++++++++++-
 .../online_serving/test_bagel_expansion.py    | 14 +++----
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index c68215db519..6f206422a4b 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -297,7 +297,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -332,10 +332,48 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
+      - label: ":full_moon: Diffusion X2I(&A&T) · Bagel Function Test with H100"
+        timeout_in_minutes: 120
+        commands:
+          - pytest -sv tests/e2e/online_serving/test_bagel_expansion.py -m "full_model and diffusion and H100" --run-level "full_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 3
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
+
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index cf93f9d7a0f..ecefc1c6beb 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -32,21 +32,19 @@
 
 SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"})
 PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2)
+TP_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=3)
 
 
 def _make_tp_cases(model: str, tp_size: int):
     """Build Bagel TP test cases with devices auto-derived from tp_size.
     Devices can not be set through CLI args, so we set them in the YAML.
     """
-    # Dit devices start from 0, due to CI GPU usage constraint,
-    # for those GPUs that encountered OOM, adjust the offset accordingly.
-    devices = ",".join(str(i) for i in range(tp_size))
+    # Stage 0 uses GPU 0, so place DiT TP ranks on GPU 1..N to
+    # avoid AR/DiT memory contention on one device.
+    devices = ",".join(str(i + 1) for i in range(tp_size))
     stage_overrides = json.dumps(
         {
-            "0": {
-                "tensor_parallel_size": 1,
-                "gpu_memory_utilization": 0.75,
-            },
+            "0": {"tensor_parallel_size": 1},
             "1": {"devices": devices},
         }
     )
@@ -65,7 +63,7 @@ def _make_tp_cases(model: str, tp_size: int):
                 ],
             ),
             id=f"parallel_tp_{tp_size}",
-            marks=PARALLEL_FEATURE_MARKS,
+            marks=TP_FEATURE_MARKS,
         ),
     ]
 

From 53f45224b5d58bdd4631998f7ed0fba714916d21 Mon Sep 17 00:00:00 2001
From: natureofnature <wzliu@connect.hku.hk>
Date: Thu, 30 Apr 2026 03:21:44 +0000
Subject: [PATCH 12/12] treat None tensor_parallel_size as unset when forwarding

Signed-off-by: natureofnature <wzliu@connect.hku.hk>
---
 vllm_omni/diffusion/data.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 4dd957fb14e..676fcd9c79c 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -781,8 +781,11 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig":
         par = kwargs.get("parallel_config", {})
         if isinstance(par, Mapping):
             par = dict(par)
-            if "tensor_parallel_size" in kwargs and "tensor_parallel_size" not in par:
-                par["tensor_parallel_size"] = kwargs["tensor_parallel_size"]
+            if par.get("tensor_parallel_size") is None:
+                par.pop("tensor_parallel_size", None)
+            tensor_parallel_size = kwargs.get("tensor_parallel_size")
+            if tensor_parallel_size is not None and "tensor_parallel_size" not in par:
+                par["tensor_parallel_size"] = tensor_parallel_size
             kwargs["parallel_config"] = par
 
         # Filter kwargs to only include valid fields