vllm-project · hsliuustc0106 · Apr 30, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 27, 2026
@@ -297,7 +297,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -332,10 +332,48 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
+      # Dedicated step for tests/e2e/online_serving/test_bagel_expansion.py.
+      # The generic H100 and L4 diffusion steps deselect that module with
+      # -k "not test_bagel_expansion", so it only runs here.
+      - label: ":full_moon: Diffusion X2I(&A&T) · Bagel Function Test with H100"
+        timeout_in_minutes: 120
+        commands:
+          - pytest -sv tests/e2e/online_serving/test_bagel_expansion.py -m "full_model and diffusion and H100" --run-level "full_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        # 3 GPUs: the Bagel TP=2 case keeps stage 0 on GPU 0
+                        # and places the two DiT TP ranks on GPUs 1-2.
+                        nvidia.com/gpu: 3
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      # HF_HOME points at the hf-cache mount declared above.
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
+
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:

@@ -15,18 +15,57 @@
 512x512 resolution.
 """
 
+import json
+
 import pytest
 
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
+from tests.helpers.stage_config import get_deploy_config_path
 
 pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
+_BAGEL_DEFAULT_YAML = get_deploy_config_path("ci/bagel.yaml")
+
 PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution."
 NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark"
 
 SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"})
 PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2)
+TP_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=3)
+
+
+def _make_tp_cases(model: str, tp_size: int):
+    """Build the Bagel tensor-parallel test cases for ``tp_size`` DiT ranks.
+
+    Devices cannot be set through CLI args, so the stage 1 device list is
+    injected with ``--stage-overrides`` on top of the CI deploy YAML.
+
+    Args:
+        model: Model identifier forwarded to ``OmniServerParams``.
+        tp_size: Value passed as ``--tensor-parallel-size``; also the number
+            of GPUs listed for stage 1.
+
+    Returns:
+        A one-element list holding the ``pytest.param`` for this TP size,
+        ready to be splatted into a parametrize case list.
+    """
+    # Stage 0 uses GPU 0, so place DiT TP ranks on GPU 1..N to
+    # avoid AR/DiT memory contention on one device.
+    devices = ",".join(str(i + 1) for i in range(tp_size))
+    stage_overrides = json.dumps(
+        {
+            # Stage 0 is pinned to TP=1 — presumably so the top-level
+            # --tensor-parallel-size only affects stage 1 (confirm against
+            # the stage-config merge logic).
+            "0": {"tensor_parallel_size": 1},
+            "1": {"devices": devices},
+        }
+    )
+    # The case needs GPU 0 plus one GPU per TP rank. Derive the card count
+    # from tp_size rather than a fixed number so the hardware marks always
+    # agree with the device list built above.
+    marks = hardware_marks(res={"cuda": "H100"}, num_cards=tp_size + 1)
+    return [
+        pytest.param(
+            OmniServerParams(
+                model=model,
+                stage_config_path=_BAGEL_DEFAULT_YAML,
+                server_args=[
+                    "--stage-overrides",
+                    stage_overrides,
+                    "--cache-backend",
+                    "cache_dit",
+                    "--tensor-parallel-size",
+                    str(tp_size),
+                ],
+            ),
+            id=f"parallel_tp_{tp_size}",
+            marks=marks,
+        ),
+    ]
 
 
 def _get_diffusion_feature_cases(model: str):
@@ -75,19 +114,9 @@ def _get_diffusion_feature_cases(model: str):
             marks=PARALLEL_FEATURE_MARKS,
         ),
-        # Tensor-Parallel size 2 (2 GPUs, Cache-DiT backend)
+        # Tensor-Parallel size 2 (3 GPUs: stage 0 on GPU 0, DiT ranks on GPUs 1-2; Cache-DiT backend)
-        pytest.param(
-            OmniServerParams(
-                model=model,
-                server_args=[
-                    "--cache-backend",
-                    "cache_dit",
-                    "--tensor-parallel-size",
-                    "2",
-                ],
-            ),
-            id="parallel_tp_2",
-            marks=[*PARALLEL_FEATURE_MARKS, pytest.mark.skip(reason="issue: #2862")],
-        ),
+        # Stage 1 (DiT) needs visible GPUs matching TP size; the default YAML
+        # only exposes device "0", so we patch it here.
+        *_make_tp_cases(model, tp_size=2),
         # Ulysses-SP degree=2 (2 GPUs)
         pytest.param(
             OmniServerParams(

@@ -251,7 +251,7 @@ def test_voxcpm_model_arch_injects_model_type_override(mocker):
 def test_strip_single_engine_args():
     """_strip_single_engine_args should remove EngineArgs fields but keep omni fields."""
     kwargs = {
-        # Parent EngineArgs fields — should be stripped
+        # Parent EngineArgs fields — stripped unless explicitly allowlisted
         "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}',
         "tensor_parallel_size": 4,
         "gpu_memory_utilization": 0.9,
@@ -269,7 +269,7 @@ def test_strip_single_engine_args():
 
-    # Stripped — parent EngineArgs fields
+    # Parent EngineArgs fields — stripped, except allowlisted tensor_parallel_size
     assert "compilation_config" not in filtered
-    assert "tensor_parallel_size" not in filtered
+    assert filtered["tensor_parallel_size"] == 4
     assert "gpu_memory_utilization" not in filtered
     assert "model" not in filtered
 
@@ -299,15 +299,18 @@ def test_strip_single_engine_args_model_does_not_trigger_warning(mocker):
     mock_warn.assert_not_called()
 
     # When there *are* genuinely surprising overrides alongside model,
-    # the warning should mention them but not model.
+    # the warning should mention them but not model. Keep-listed fields such as
+    # tensor_parallel_size are intentionally passed through and should not warn.
     AsyncOmniEngine._strip_single_engine_args(
         {
             "model": "some/model",
+            "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}',
             "tensor_parallel_size": 4,
             "custom_pipeline_args": {"pipeline_class": "my.Pipeline"},
         }
     )
     mock_warn.assert_called_once()
     warned_args = mock_warn.call_args[0][-1]  # the formatted arg list
-    assert "tensor_parallel_size" in warned_args
+    assert "compilation_config" in warned_args
+    assert "tensor_parallel_size" not in warned_args
     assert "model" not in warned_args
@@ -776,6 +776,18 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig":
         if "diffusers_call_kwargs" in kwargs and kwargs["diffusers_call_kwargs"] is None:
             kwargs["diffusers_call_kwargs"] = {}
 
+        # Forward top-level parallel knobs (e.g. --tensor-parallel-size from CLI)
+        # into parallel_config so the diffusion engine sees them.
+        par = kwargs.get("parallel_config", {})
+        if isinstance(par, Mapping):
+            par = dict(par)
+            if par.get("tensor_parallel_size") is None:
+                par.pop("tensor_parallel_size", None)
+            tensor_parallel_size = kwargs.get("tensor_parallel_size")
+            if tensor_parallel_size is not None and "tensor_parallel_size" not in par:
+                par["tensor_parallel_size"] = tensor_parallel_size
+            kwargs["parallel_config"] = par
+
         # Filter kwargs to only include valid fields
         valid_fields = {f.name for f in fields(cls)}
         filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}

@@ -112,6 +112,13 @@
         "worker_extension_cls",
         "allowed_local_media_path",
         "allowed_media_domains",
+        # Legacy stage-config YAMLs may intentionally leave parallel or
+        # distributed knobs unspecified at the stage level and rely on
+        # top-level CLI values to fill them in during the per-stage merge.
+        # Keep these fields so stages that omit them can inherit CLI values,
+        # while stages with explicit YAML values still win because the legacy
+        # stage-config loader prefers stage-local engine args.
+        "tensor_parallel_size",
     }
 )