From 2daa5731e73f18b32f329ec47d1ae250d0c5a821 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Fri, 17 Apr 2026 03:09:19 +0000 Subject: [PATCH 01/12] pass tp size to diffusion config Signed-off-by: natureofnature --- vllm_omni/diffusion/data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index b9307657f5c..7347edd5127 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -776,6 +776,15 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": if "diffusers_call_kwargs" in kwargs and kwargs["diffusers_call_kwargs"] is None: kwargs["diffusers_call_kwargs"] = {} + # Forward top-level parallel knobs (e.g. --tensor-parallel-size from CLI) + # into parallel_config so the diffusion engine sees them. + par = kwargs.get("parallel_config", {}) + if isinstance(par, Mapping): + par = dict(par) + if "tensor_parallel_size" in kwargs and "tensor_parallel_size" not in par: + par["tensor_parallel_size"] = kwargs["tensor_parallel_size"] + kwargs["parallel_config"] = par + # Filter kwargs to only include valid fields valid_fields = {f.name for f in fields(cls)} filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields} From a41e8f078f56a62b70369d292b748ebc9061ecaa Mon Sep 17 00:00:00 2001 From: natureofnature Date: Fri, 17 Apr 2026 07:52:46 +0000 Subject: [PATCH 02/12] update bagel CI to use real tp devices Signed-off-by: natureofnature --- .../online_serving/test_bagel_expansion.py | 54 ++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 21142439bee..df0b8da8ba0 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -14,13 +14,21 @@ 512x512 resolution. """ +from pathlib import Path + import pytest from tests.helpers.mark import hardware_marks from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data +from tests.helpers.stage_config import modify_stage_config pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] +# This test uses the default Bagel YAML, and CLI does not control devices.We modify yaml file directly. +_BAGEL_DEFAULT_YAML = str( + Path(__file__).resolve().parents[3] / "vllm_omni" / "model_executor" / "stage_configs" / "bagel.yaml" +) + PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution." NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" @@ -28,6 +36,36 @@ PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) +def _make_tp_cases(model: str, tp_size: int): + """Build Bagel TP test cases with devices auto-derived from tp_size. + Devices can not be set through CLI args, so we set them in the YAML. + """ + # Dit devices start from 0, due to CI GPU usage constraint, + # for those GPUs that encountered OOM, adjust the offset accordingly. + devices = ",".join(str(i) for i in range(tp_size)) + return [ + pytest.param( + OmniServerParams( + model=model, + stage_config_path=modify_stage_config( + _BAGEL_DEFAULT_YAML, + updates={ + "stage_args": { + 1: { + "runtime.devices": devices, + "engine_args.parallel_config.tensor_parallel_size": tp_size, + }, + }, + }, + ), + server_args=["--cache-backend", "cache_dit"], + ), + id=f"parallel_tp_{tp_size}", + marks=PARALLEL_FEATURE_MARKS, + ), + ] + + def _get_diffusion_feature_cases(model: str): """Return L4 diffusion feature cases for Bagel. TeaCache, Cache-DiT, CFG-Parallel, Tensor-Parallel, @@ -74,19 +112,9 @@ def _get_diffusion_feature_cases(model: str): marks=PARALLEL_FEATURE_MARKS, ), # Tensor-Parallel size 2 (2 GPUs, Cache-DiT backend) - pytest.param( - OmniServerParams( - model=model, - server_args=[ - "--cache-backend", - "cache_dit", - "--tensor-parallel-size", - "2", - ], - ), - id="parallel_tp_2", - marks=[*PARALLEL_FEATURE_MARKS, pytest.mark.skip(reason="issue: #2862")], - ), + # Stage 1 (DiT) needs visible GPUs matching TP size; the default YAML + # only exposes device "0", so we patch it here. + *_make_tp_cases(model, tp_size=2), # Ulysses-SP degree=2 (2 GPUs) pytest.param( OmniServerParams( From 26352d1e819c490ee6117da819c96dcb7a96ccb6 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Mon, 27 Apr 2026 03:56:21 +0000 Subject: [PATCH 03/12] adjust codes for comments Signed-off-by: natureofnature --- tests/e2e/online_serving/test_bagel_expansion.py | 11 +++-------- vllm_omni/engine/async_omni_engine.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index df0b8da8ba0..ce60db18741 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -14,8 +14,6 @@ 512x512 resolution. """ -from pathlib import Path - import pytest from tests.helpers.mark import hardware_marks @@ -24,10 +22,8 @@ pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] -# This test uses the default Bagel YAML, and CLI does not control devices.We modify yaml file directly. -_BAGEL_DEFAULT_YAML = str( - Path(__file__).resolve().parents[3] / "vllm_omni" / "model_executor" / "stage_configs" / "bagel.yaml" -) +# This test uses the Bagel stage-config YAML under model_executor; CLI still carries TP. +_BAGEL_DEFAULT_YAML = "vllm_omni/model_executor/stage_configs/bagel.yaml" PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution." NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" @@ -53,12 +49,11 @@ def _make_tp_cases(model: str, tp_size: int): "stage_args": { 1: { "runtime.devices": devices, - "engine_args.parallel_config.tensor_parallel_size": tp_size, }, }, }, ), - server_args=["--cache-backend", "cache_dit"], + server_args=["--cache-backend", "cache_dit", "--tensor-parallel-size", str(tp_size)], ), id=f"parallel_tp_{tp_size}", marks=PARALLEL_FEATURE_MARKS, diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 61da4388be0..76870571164 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -110,6 +110,18 @@ "worker_extension_cls", "allowed_local_media_path", "allowed_media_domains", + # Legacy stage-config YAMLs may intentionally leave parallel or + # distributed knobs unspecified at the stage level and rely on + # top-level CLI values to fill them in during the per-stage merge. + # Keep these fields so stages that omit them can inherit CLI values, + # while stages with explicit YAML values still win because the legacy + # stage-config loader prefers stage-local engine args. + "tensor_parallel_size", + "pipeline_parallel_size", + "data_parallel_size", + "data_parallel_size_local", + "data_parallel_backend", + "distributed_executor_backend", } ) From 029109f290816451affa3da667299768185900f5 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Mon, 27 Apr 2026 06:19:37 +0000 Subject: [PATCH 04/12] update for simple test Signed-off-by: natureofnature --- tests/engine/test_arg_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 0d61f6a675b..26ce07b73b7 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -230,7 +230,7 @@ def test_voxcpm_model_arch_injects_model_type_override(mocker): def test_strip_single_engine_args(): """_strip_single_engine_args should remove EngineArgs fields but keep omni fields.""" kwargs = { - # Parent EngineArgs fields — should be stripped + # Parent EngineArgs fields — stripped unless explicitly allowlisted "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}', "tensor_parallel_size": 4, "gpu_memory_utilization": 0.9, @@ -248,7 +248,7 @@ def test_strip_single_engine_args(): # Stripped — parent EngineArgs fields assert "compilation_config" not in filtered - assert "tensor_parallel_size" not in filtered + assert filtered["tensor_parallel_size"] == 4 assert "gpu_memory_utilization" not in filtered assert "model" not in filtered @@ -278,15 +278,18 @@ def test_strip_single_engine_args_model_does_not_trigger_warning(mocker): mock_warn.assert_not_called() # When there *are* genuinely surprising overrides alongside model, - # the warning should mention them but not model. + # the warning should mention them but not model. Keep-listed fields such as + # tensor_parallel_size are intentionally passed through and should not warn. AsyncOmniEngine._strip_single_engine_args( { "model": "some/model", + "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}', "tensor_parallel_size": 4, "custom_pipeline_args": {"pipeline_class": "my.Pipeline"}, } ) mock_warn.assert_called_once() warned_args = mock_warn.call_args[0][-1] # the formatted arg list - assert "tensor_parallel_size" in warned_args + assert "compilation_config" in warned_args + assert "tensor_parallel_size" not in warned_args assert "model" not in warned_args From c7b0c69cef6842db5a427b7bdebb1ea2753fbfbb Mon Sep 17 00:00:00 2001 From: natureofnature Date: Mon, 27 Apr 2026 10:02:56 +0000 Subject: [PATCH 05/12] update Signed-off-by: natureofnature --- vllm_omni/engine/async_omni_engine.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 76870571164..dbd5db3d92f 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -117,11 +117,6 @@ # while stages with explicit YAML values still win because the legacy # stage-config loader prefers stage-local engine args. "tensor_parallel_size", - "pipeline_parallel_size", - "data_parallel_size", - "data_parallel_size_local", - "data_parallel_backend", - "distributed_executor_backend", } ) From 30882e70b87a0ea336c395673b9f941d76bbbbef Mon Sep 17 00:00:00 2001 From: natureofnature Date: Wed, 29 Apr 2026 03:24:02 +0000 Subject: [PATCH 06/12] use deployment yaml Signed-off-by: natureofnature --- tests/e2e/online_serving/test_bagel_expansion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 6b3d8caa3f6..7f141918278 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -19,12 +19,11 @@ from tests.helpers.mark import hardware_marks from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] -# This test uses the Bagel stage-config YAML under model_executor; CLI still carries TP. -_BAGEL_DEFAULT_YAML = "vllm_omni/model_executor/stage_configs/bagel.yaml" +_BAGEL_DEFAULT_YAML = get_deploy_config_path("ci/bagel.yaml") PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution." NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" From f99b0ed2000d96bae231c1669cb74ef610f56673 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Wed, 29 Apr 2026 06:15:42 +0000 Subject: [PATCH 07/12] fix device name Signed-off-by: natureofnature --- tests/e2e/online_serving/test_bagel_expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 7f141918278..8ac049b2b01 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -48,7 +48,7 @@ def _make_tp_cases(model: str, tp_size: int): updates={ "stage_args": { 1: { - "runtime.devices": devices, + "devices": devices, }, }, }, From f8766a01452ed5a40f40be59c8310200353228da Mon Sep 17 00:00:00 2001 From: natureofnature Date: Wed, 29 Apr 2026 08:09:50 +0000 Subject: [PATCH 08/12] set a default tp value to avoid all stages share the same tp size Signed-off-by: natureofnature --- tests/e2e/online_serving/test_bagel_expansion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 8ac049b2b01..544f82f9a5d 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -47,6 +47,9 @@ def _make_tp_cases(model: str, tp_size: int): _BAGEL_DEFAULT_YAML, updates={ "stage_args": { + 0: { + "tensor_parallel_size": 1, + }, 1: { "devices": devices, }, From 250a2fee215f52f3b50f4ae47c7c2f32827afb06 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Wed, 29 Apr 2026 13:55:18 +0000 Subject: [PATCH 09/12] update Signed-off-by: natureofnature --- .../online_serving/test_bagel_expansion.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 544f82f9a5d..f4b5d80e945 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -15,11 +15,13 @@ 512x512 resolution. """ +import json + import pytest from tests.helpers.mark import hardware_marks from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] @@ -39,24 +41,28 @@ def _make_tp_cases(model: str, tp_size: int): # Dit devices start from 0, due to CI GPU usage constraint, # for those GPUs that encountered OOM, adjust the offset accordingly. devices = ",".join(str(i) for i in range(tp_size)) + stage_overrides = json.dumps( + { + "0": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.95, + }, + "1": {"devices": devices}, + } + ) return [ pytest.param( OmniServerParams( model=model, - stage_config_path=modify_stage_config( - _BAGEL_DEFAULT_YAML, - updates={ - "stage_args": { - 0: { - "tensor_parallel_size": 1, - }, - 1: { - "devices": devices, - }, - }, - }, - ), - server_args=["--cache-backend", "cache_dit", "--tensor-parallel-size", str(tp_size)], + stage_config_path=_BAGEL_DEFAULT_YAML, + server_args=[ + "--stage-overrides", + stage_overrides, + "--cache-backend", + "cache_dit", + "--tensor-parallel-size", + str(tp_size), + ], ), id=f"parallel_tp_{tp_size}", marks=PARALLEL_FEATURE_MARKS, From 27e7f75e8b4a6774602eed49d9a014edfc3ec225 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Wed, 29 Apr 2026 16:27:21 +0000 Subject: [PATCH 10/12] update Signed-off-by: natureofnature --- tests/e2e/online_serving/test_bagel_expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index f4b5d80e945..cf93f9d7a0f 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -45,7 +45,7 @@ def _make_tp_cases(model: str, tp_size: int): { "0": { "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.95, + "gpu_memory_utilization": 0.75, }, "1": {"devices": devices}, } From 04b54c23ee0104b6ca93b5660296086e2cb9ffc5 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Thu, 30 Apr 2026 00:42:14 +0000 Subject: [PATCH 11/12] update Signed-off-by: natureofnature --- .buildkite/test-nightly.yml | 42 ++++++++++++++++++- .../online_serving/test_bagel_expansion.py | 14 +++---- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index c68215db519..6f206422a4b 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -297,7 +297,7 @@ steps: - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" timeout_in_minutes: 120 commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy + - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy agents: queue: "mithril-h100-pool" plugins: @@ -332,10 +332,48 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Bagel Function Test with H100" + timeout_in_minutes: 120 + commands: + - pytest -sv tests/e2e/online_serving/test_bagel_expansion.py -m "full_model and diffusion and H100" --run-level "full_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 3 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" timeout_in_minutes: 60 commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy + - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index cf93f9d7a0f..ecefc1c6beb 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -32,21 +32,19 @@ SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) +TP_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=3) def _make_tp_cases(model: str, tp_size: int): """Build Bagel TP test cases with devices auto-derived from tp_size. Devices can not be set through CLI args, so we set them in the YAML. """ - # Dit devices start from 0, due to CI GPU usage constraint, - # for those GPUs that encountered OOM, adjust the offset accordingly. - devices = ",".join(str(i) for i in range(tp_size)) + # Stage 0 uses GPU 0, so place DiT TP ranks on GPU 1..N to + # avoid AR/DiT memory contention on one device. + devices = ",".join(str(i + 1) for i in range(tp_size)) stage_overrides = json.dumps( { - "0": { - "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.75, - }, + "0": {"tensor_parallel_size": 1}, "1": {"devices": devices}, } ) @@ -65,7 +63,7 @@ def _make_tp_cases(model: str, tp_size: int): ], ), id=f"parallel_tp_{tp_size}", - marks=PARALLEL_FEATURE_MARKS, + marks=TP_FEATURE_MARKS, ), ] From 53f45224b5d58bdd4631998f7ed0fba714916d21 Mon Sep 17 00:00:00 2001 From: natureofnature Date: Thu, 30 Apr 2026 03:21:44 +0000 Subject: [PATCH 12/12] update Signed-off-by: natureofnature --- vllm_omni/diffusion/data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 4dd957fb14e..676fcd9c79c 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -781,8 +781,11 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": par = kwargs.get("parallel_config", {}) if isinstance(par, Mapping): par = dict(par) - if "tensor_parallel_size" in kwargs and "tensor_parallel_size" not in par: - par["tensor_parallel_size"] = kwargs["tensor_parallel_size"] + if par.get("tensor_parallel_size") is None: + par.pop("tensor_parallel_size", None) + tensor_parallel_size = kwargs.get("tensor_parallel_size") + if tensor_parallel_size is not None and "tensor_parallel_size" not in par: + par["tensor_parallel_size"] = tensor_parallel_size kwargs["parallel_config"] = par # Filter kwargs to only include valid fields