diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index c68215db519..6f206422a4b 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -297,7 +297,7 @@ steps: - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" timeout_in_minutes: 120 commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy + - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy agents: queue: "mithril-h100-pool" plugins: @@ -332,10 +332,48 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Bagel Function Test with H100" + timeout_in_minutes: 120 + commands: + - pytest -sv tests/e2e/online_serving/test_bagel_expansion.py -m "full_model and diffusion and H100" --run-level "full_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 3 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" timeout_in_minutes: 60 commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy + - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py index 21fdc314c96..ecefc1c6beb 100644 --- a/tests/e2e/online_serving/test_bagel_expansion.py +++ b/tests/e2e/online_serving/test_bagel_expansion.py @@ -15,18 +15,57 @@ 512x512 resolution. """ +import json + import pytest from tests.helpers.mark import hardware_marks from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data +from tests.helpers.stage_config import get_deploy_config_path pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] +_BAGEL_DEFAULT_YAML = get_deploy_config_path("ci/bagel.yaml") + PROMPT = "A futuristic city skyline at twilight, cyberpunk style, ultra-detailed, high resolution." NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) +TP_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=3) + + +def _make_tp_cases(model: str, tp_size: int): + """Build Bagel TP test cases with devices auto-derived from tp_size. + Devices can not be set through CLI args, so we set them in the YAML. + """ + # Stage 0 uses GPU 0, so place DiT TP ranks on GPU 1..N to + # avoid AR/DiT memory contention on one device. + devices = ",".join(str(i + 1) for i in range(tp_size)) + stage_overrides = json.dumps( + { + "0": {"tensor_parallel_size": 1}, + "1": {"devices": devices}, + } + ) + return [ + pytest.param( + OmniServerParams( + model=model, + stage_config_path=_BAGEL_DEFAULT_YAML, + server_args=[ + "--stage-overrides", + stage_overrides, + "--cache-backend", + "cache_dit", + "--tensor-parallel-size", + str(tp_size), + ], + ), + id=f"parallel_tp_{tp_size}", + marks=TP_FEATURE_MARKS, + ), + ] def _get_diffusion_feature_cases(model: str): @@ -75,19 +114,9 @@ def _get_diffusion_feature_cases(model: str): marks=PARALLEL_FEATURE_MARKS, ), # Tensor-Parallel size 2 (2 GPUs, Cache-DiT backend) - pytest.param( - OmniServerParams( - model=model, - server_args=[ - "--cache-backend", - "cache_dit", - "--tensor-parallel-size", - "2", - ], - ), - id="parallel_tp_2", - marks=[*PARALLEL_FEATURE_MARKS, pytest.mark.skip(reason="issue: #2862")], - ), + # Stage 1 (DiT) needs visible GPUs matching TP size; the default YAML + # only exposes device "0", so we patch it here. + *_make_tp_cases(model, tp_size=2), # Ulysses-SP degree=2 (2 GPUs) pytest.param( OmniServerParams( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 709802ce6fd..10077d24aa5 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -251,7 +251,7 @@ def test_voxcpm_model_arch_injects_model_type_override(mocker): def test_strip_single_engine_args(): """_strip_single_engine_args should remove EngineArgs fields but keep omni fields.""" kwargs = { - # Parent EngineArgs fields — should be stripped + # Parent EngineArgs fields — stripped unless explicitly allowlisted "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}', "tensor_parallel_size": 4, "gpu_memory_utilization": 0.9, @@ -269,7 +269,7 @@ def test_strip_single_engine_args(): # Stripped — parent EngineArgs fields assert "compilation_config" not in filtered - assert "tensor_parallel_size" not in filtered + assert filtered["tensor_parallel_size"] == 4 assert "gpu_memory_utilization" not in filtered assert "model" not in filtered @@ -299,15 +299,18 @@ def test_strip_single_engine_args_model_does_not_trigger_warning(mocker): mock_warn.assert_not_called() # When there *are* genuinely surprising overrides alongside model, - # the warning should mention them but not model. + # the warning should mention them but not model. Keep-listed fields such as + # tensor_parallel_size are intentionally passed through and should not warn. AsyncOmniEngine._strip_single_engine_args( { "model": "some/model", + "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}', "tensor_parallel_size": 4, "custom_pipeline_args": {"pipeline_class": "my.Pipeline"}, } ) mock_warn.assert_called_once() warned_args = mock_warn.call_args[0][-1] # the formatted arg list - assert "tensor_parallel_size" in warned_args + assert "compilation_config" in warned_args + assert "tensor_parallel_size" not in warned_args assert "model" not in warned_args diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index cf6841fd21d..676fcd9c79c 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -776,6 +776,18 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": if "diffusers_call_kwargs" in kwargs and kwargs["diffusers_call_kwargs"] is None: kwargs["diffusers_call_kwargs"] = {} + # Forward top-level parallel knobs (e.g. --tensor-parallel-size from CLI) + # into parallel_config so the diffusion engine sees them. + par = kwargs.get("parallel_config", {}) + if isinstance(par, Mapping): + par = dict(par) + if par.get("tensor_parallel_size") is None: + par.pop("tensor_parallel_size", None) + tensor_parallel_size = kwargs.get("tensor_parallel_size") + if tensor_parallel_size is not None and "tensor_parallel_size" not in par: + par["tensor_parallel_size"] = tensor_parallel_size + kwargs["parallel_config"] = par + # Filter kwargs to only include valid fields valid_fields = {f.name for f in fields(cls)} filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields} diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 9c8dc25ffea..831d41092a5 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -112,6 +112,13 @@ "worker_extension_cls", "allowed_local_media_path", "allowed_media_domains", + # Legacy stage-config YAMLs may intentionally leave parallel or + # distributed knobs unspecified at the stage level and rely on + # top-level CLI values to fill them in during the per-stage merge. + # Keep these fields so stages that omit them can inherit CLI values, + # while stages with explicit YAML values still win because the legacy + # stage-config loader prefers stage-local engine args. + "tensor_parallel_size", } )