From 41dbfe6bd0b3660d5c8396225a795f09c041cd1d Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 31 Mar 2026 11:15:12 +0000 Subject: [PATCH 01/12] add teacache test Signed-off-by: Zhang --- docs/user_guide/diffusion_features.md | 2 +- .../test_stable_audio_model.py | 38 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 7e325c1edc8..fda26b53113 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -128,7 +128,7 @@ The following tables show which models support each feature: | Model | ⚑TeaCache | ⚑Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:| -| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | +| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ## Feature Compatibility diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py index ff4d9b40172..d7d11f0d947 100644 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ b/tests/e2e/offline_inference/test_stable_audio_model.py @@ -21,13 +21,8 @@ models = ["linyueqian/stable_audio_random"] -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_model(model_name: str): - m = Omni(model=model_name) - +def _run_stable_audio_and_validate(m: Omni) -> None: + """Run a minimal Stable Audio generation and validate output shape.""" # Use minimal settings for testing # Generate a short 2-second audio clip with minimal inference steps audio_start_in_s = 0.0 @@ -70,3 +65,32 @@ 
def test_stable_audio_model(model_name: str): assert audio.shape[1] == 2 # stereo channels expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate) assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds + + +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +@pytest.mark.parametrize("model_name", models) +def test_stable_audio_model(model_name: str): + m = Omni(model=model_name) + try: + _run_stable_audio_and_validate(m) + finally: + m.close() + + +@pytest.mark.core_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +@pytest.mark.parametrize("model_name", models) +def test_stable_audio_teacache(model_name: str): + m = Omni( + model=model_name, + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, + ) + try: + _run_stable_audio_and_validate(m) + finally: + m.close() From 405f59143eb8d02ba5e0249327d5c6e80223f02b Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 31 Mar 2026 11:22:19 +0000 Subject: [PATCH 02/12] update to match new diffusion user guide Signed-off-by: Zhang --- .../offline_inference/text_to_audio.md | 2 ++ .../offline_inference/text_to_audio/README.md | 2 ++ .../text_to_audio/text_to_audio.py | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/docs/user_guide/examples/offline_inference/text_to_audio.md b/docs/user_guide/examples/offline_inference/text_to_audio.md index 62a70e5254d..9be09194bc5 100644 --- a/docs/user_guide/examples/offline_inference/text_to_audio.md +++ b/docs/user_guide/examples/offline_inference/text_to_audio.md @@ -26,6 +26,7 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ + --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -37,6 +38,7 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. 
- `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). +- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. ## Example materials diff --git a/examples/offline_inference/text_to_audio/README.md b/examples/offline_inference/text_to_audio/README.md index 7edc38092ad..50bab3e2f2d 100644 --- a/examples/offline_inference/text_to_audio/README.md +++ b/examples/offline_inference/text_to_audio/README.md @@ -23,6 +23,7 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ + --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -34,4 +35,5 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). +- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. 
diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index a6968c419f6..3adb3ad53a5 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -11,6 +11,7 @@ python text_to_audio.py --prompt "The sound of a dog barking" python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0 python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality" + python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache """ import argparse @@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) + parser.add_argument( + "--cache-backend", + type=str, + default=None, + choices=["tea_cache"], + help=( + "Cache backend to use for acceleration. " + "Stable Audio currently supports 'tea_cache'. " + "Default: None (no cache acceleration)." 
+ ), + ) + parser.add_argument( + "--tea-cache-rel-l1-thresh", + type=float, + default=0.2, + help="[tea_cache] Threshold for accumulated relative L1 distance.", + ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", @@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410 def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + cache_config = None + if args.cache_backend == "tea_cache": + cache_config = { + "rel_l1_thresh": args.tea_cache_rel_l1_thresh, + } print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") @@ -134,12 +157,15 @@ def main(): print(f" Audio length: {args.audio_length}s") print(f" Inference steps: {args.num_inference_steps}") print(f" Guidance scale: {args.guidance_scale}") + print(f" Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}") print(f" Seed: {args.seed}") print(f"{'=' * 60}\n") # Initialize Omni with Stable Audio model omni = Omni( model=args.model, + cache_backend=args.cache_backend, + cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, ) From 8038dcbb3f6ad2d2d5337eec6ea2f2d84c23ae13 Mon Sep 17 00:00:00 2001 From: Zhang Date: Wed, 1 Apr 2026 06:04:32 +0000 Subject: [PATCH 03/12] register B60 Signed-off-by: Zhang --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 43e9506fd07..d949318b7f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,7 @@ markers = [ "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "B60: Tests that require Intel Arc Pro B60 XPU", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", "distributed_cuda: Tests that require multi cards on CUDA platform", From 193cd5681f16d588ce3ab8833b8f04d2ede3afeb Mon Sep 17 
00:00:00 2001 From: Zhang Date: Wed, 1 Apr 2026 14:57:59 +0000 Subject: [PATCH 04/12] resolve review comments Signed-off-by: Zhang --- .buildkite/test-amd-merge.yml | 2 +- .buildkite/test-amd-ready.yaml | 2 +- .buildkite/test-merge.yml | 2 +- .buildkite/test-nightly.yml | 2 +- .buildkite/test-ready.yml | 2 +- docs/contributing/ci/CI_5levels.md | 3 +- docs/contributing/ci/tests_style.md | 3 +- tests/conftest.py | 28 ++++-- .../stable_audio_offline_utils.py | 54 +++++++++++ .../offline_inference/test_stable_audio.py | 43 +++++++++ .../test_stable_audio_expansion.py | 49 ++++++++++ .../test_stable_audio_model.py | 96 ------------------- 12 files changed, 177 insertions(+), 109 deletions(-) create mode 100644 tests/e2e/offline_inference/stable_audio_offline_utils.py create mode 100644 tests/e2e/offline_inference/test_stable_audio.py create mode 100644 tests/e2e/offline_inference/test_stable_audio_expansion.py delete mode 100644 tests/e2e/offline_inference/test_stable_audio_model.py diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index 60ba0d9d416..a1cd9fdc1f4 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -55,7 +55,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index 6e31163accb..f36e2b7438f 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -46,7 +46,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v 
tests/e2e/offline_inference/test_stable_audio.py - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 7bee193191e..0a5bc55378c 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -75,7 +75,7 @@ steps: timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: - - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + - pytest -s -v tests/e2e/offline_inference/test_stable_audio.py agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 5c6d6d35a65..e35da7f2a8b 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -161,7 +161,7 @@ steps: if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 89839a2d1ed..e9b11ce3165 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -103,7 +103,7 @@ steps: - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 81392b201da..e67d4dac1bd 100644 --- 
a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -243,7 +243,8 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 0b07c5ffe4a..2b226a2c990 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -148,7 +148,8 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/tests/conftest.py b/tests/conftest.py index fb888695428..967f9fbd6fb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -162,7 +162,6 @@ def assert_audio_diffusion_response( Validate audio diffusion response. 
""" raise NotImplementedError("Audio validation is not implemented yet") - # consider using assert_audio_valid defined above def _maybe_int(value: Any) -> int | None: @@ -272,15 +271,32 @@ def assert_video_valid( pass -def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None: - """Assert the WAV has the expected sample rate, channel count, and duration.""" +def assert_audio_valid( + audio_or_path: Path | np.ndarray, + *, + sample_rate: int, + channels: int, + duration_s: float, +) -> None: + """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" + expected_samples = int(duration_s * sample_rate) + if isinstance(audio_or_path, np.ndarray): + audio = audio_or_path + assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" + assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" + assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" + assert audio.shape[2] == expected_samples, ( + f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" + ) + return + + path = audio_or_path assert path.exists(), f"Audio not found: {path}" info = sf.info(str(path)) assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - expected_frames = int(duration_s * sample_rate) - assert info.frames == expected_frames, ( - f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" + assert info.frames == expected_samples, ( + f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" ) diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py new file mode 100644 index 00000000000..cf602348393 --- /dev/null +++ 
b/tests/e2e/offline_inference/stable_audio_offline_utils.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Shared generation helpers for Stable Audio offline e2e tests.""" + +from __future__ import annotations + +import numpy as np +import torch + +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + + +def generate_stable_audio_short_clip( + omni: Omni, + *, + audio_start_in_s: float = 0.0, + audio_end_in_s: float = 2.0, + num_inference_steps: int = 4, + seed: int = 42, +) -> np.ndarray: + """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" + outputs = omni.generate( + prompts={ + "prompt": "The sound of a dog barking", + "negative_prompt": "Low quality.", + }, + sampling_params_list=OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + guidance_scale=7.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + extra_args={ + "audio_start_in_s": audio_start_in_s, + "audio_end_in_s": audio_end_in_s, + }, + ), + ) + + assert outputs is not None + first_output = outputs[0] + assert first_output.final_output_type == "image" + assert hasattr(first_output, "request_output") and first_output.request_output + + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) + assert req_out.final_output_type == "audio" + assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output + audio = req_out.multimodal_output.get("audio") + assert isinstance(audio, np.ndarray) + return audio diff --git a/tests/e2e/offline_inference/test_stable_audio.py b/tests/e2e/offline_inference/test_stable_audio.py new file mode 100644 index 00000000000..cb8e0400ee0 --- /dev/null +++ 
b/tests/e2e/offline_inference/test_stable_audio.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""L2 offline inference: basic Stable Audio deployment and output shape.""" + +import sys +from pathlib import Path + +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +import pytest + +from tests.conftest import assert_audio_valid +from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip +from tests.utils import hardware_test +from vllm_omni import Omni + +# Use random weights model for CI testing (small, no authentication required) +models = ["linyueqian/stable_audio_random"] + +_SAMPLE_RATE = 44100 +_CLIP_DURATION_S = 2.0 + + +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +@pytest.mark.parametrize("model_name", models) +def test_stable_audio(model_name: str) -> None: + m = Omni(model=model_name) + try: + audio = generate_stable_audio_short_clip(m) + assert_audio_valid( + audio, + sample_rate=_SAMPLE_RATE, + channels=2, + duration_s=_CLIP_DURATION_S, + ) + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py new file mode 100644 index 00000000000..e0282a4b3b6 --- /dev/null +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""L4 offline inference: Stable Audio with combined FP8 quantization and TeaCache.""" + +import sys +from pathlib import Path + +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +import pytest + +from tests.conftest import assert_audio_valid +from 
tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip +from tests.utils import hardware_test +from vllm_omni import Omni + +models = ["linyueqian/stable_audio_random"] + +_SAMPLE_RATE = 44100 +_CLIP_DURATION_S = 2.0 + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +@pytest.mark.parametrize("model_name", models) +def test_stable_audio_quantization_and_teacache(model_name: str) -> None: + """TeaCache + FP8 quantization in one run (L4 coverage).""" + m = Omni( + model=model_name, + quantization="fp8", + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, + ) + try: + audio = generate_stable_audio_short_clip(m) + assert_audio_valid( + audio, + sample_rate=_SAMPLE_RATE, + channels=2, + duration_s=_CLIP_DURATION_S, + ) + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py deleted file mode 100644 index d7d11f0d947..00000000000 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ /dev/null @@ -1,96 +0,0 @@ -import sys -from pathlib import Path - -import numpy as np -import pytest -import torch - -from tests.utils import hardware_test -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - -# Use random weights model for CI testing (small, no authentication required) -models = ["linyueqian/stable_audio_random"] - - -def _run_stable_audio_and_validate(m: Omni) -> None: - """Run a minimal Stable Audio generation and validate output shape.""" - # Use minimal settings for testing - # Generate a short 2-second audio clip with minimal inference steps - 
audio_start_in_s = 0.0 - audio_end_in_s = 2.0 # Short duration for fast testing - sample_rate = 44100 # Stable Audio uses 44100 Hz - - outputs = m.generate( - prompts={ - "prompt": "The sound of a dog barking", - "negative_prompt": "Low quality.", - }, - sampling_params_list=OmniDiffusionSamplingParams( - num_inference_steps=4, # Minimal steps for speed - guidance_scale=7.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=1, - extra_args={ - "audio_start_in_s": audio_start_in_s, - "audio_end_in_s": audio_end_in_s, - }, - ), - ) - - # Extract audio from OmniRequestOutput - assert outputs is not None - first_output = outputs[0] - assert first_output.final_output_type == "image" - assert hasattr(first_output, "request_output") and first_output.request_output - - req_out = first_output.request_output - assert isinstance(req_out, OmniRequestOutput) - assert req_out.final_output_type == "audio" - assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output - audio = req_out.multimodal_output.get("audio") - assert isinstance(audio, np.ndarray) - # audio shape: (batch, channels, samples) - # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples - assert audio.ndim == 3 - assert audio.shape[0] == 1 # batch size - assert audio.shape[1] == 2 # stereo channels - expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate) - assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds - - -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_model(model_name: str): - m = Omni(model=model_name) - try: - _run_stable_audio_and_validate(m) - finally: - m.close() - - -@pytest.mark.core_model -@pytest.mark.diffusion -@pytest.mark.cache -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def 
test_stable_audio_teacache(model_name: str): - m = Omni( - model=model_name, - cache_backend="tea_cache", - cache_config={"rel_l1_thresh": 0.2}, - ) - try: - _run_stable_audio_and_validate(m) - finally: - m.close() From 466396bebad6b7f1907610eefe829454980069f7 Mon Sep 17 00:00:00 2001 From: Zhang Date: Wed, 1 Apr 2026 15:02:35 +0000 Subject: [PATCH 05/12] reset a file Signed-off-by: Zhang --- .buildkite/test-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index e35da7f2a8b..5c6d6d35a65 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -161,7 +161,7 @@ steps: if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: From e05b372ba2e710b6f246d4b369d2e2004b4c5334 Mon Sep 17 00:00:00 2001 From: Zhang Date: Wed, 1 Apr 2026 15:19:05 +0000 Subject: [PATCH 06/12] add nightly test Signed-off-by: Zhang --- .buildkite/test-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 5c6d6d35a65..0307286913b 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -161,7 +161,7 @@ steps: if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v 
tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: From 3c3a262fcb43c4d4164c5d861c7622e14eae0263 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Sat, 4 Apr 2026 03:36:31 +0000 Subject: [PATCH 07/12] add comment to explain confusing config Signed-off-by: Zhang Jian --- tests/e2e/offline_inference/stable_audio_offline_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py index cf602348393..906dea11a41 100644 --- a/tests/e2e/offline_inference/stable_audio_offline_utils.py +++ b/tests/e2e/offline_inference/stable_audio_offline_utils.py @@ -42,6 +42,12 @@ def generate_stable_audio_short_clip( assert outputs is not None first_output = outputs[0] + # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. + # The nested request_output is the worker OmniRequestOutput + # (e.g. final_output_type="audio") and holds the multimodal payload. + # Follow-up: add StableAudioPipeline stage YAML, and pass model into + # _create_default_diffusion_stage_cfg so default diffusion metadata can set + # final_output_type to "audio" for future audio pipelines without YAML. 
assert first_output.final_output_type == "image" assert hasattr(first_output, "request_output") and first_output.request_output From 4ce6d4ff7f490ff3482461f6dd02ea18b84b22e7 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Sat, 4 Apr 2026 05:55:37 +0000 Subject: [PATCH 08/12] remove duplicated test Signed-off-by: Zhang Jian --- .buildkite/test-merge.yml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index c594f128cc8..7a903d9df7c 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -71,24 +71,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Audio Generation Model Test" - timeout_in_minutes: 20 - depends_on: upload-merge-pipeline - commands: - - pytest -s -v tests/e2e/offline_inference/test_stable_audio.py - agents: - queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Cache Backend Test" timeout_in_minutes: 15 depends_on: upload-merge-pipeline From 2763982c4601619c53aff6712a4a18dead2ca6f3 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Sat, 4 Apr 2026 05:56:28 +0000 Subject: [PATCH 09/12] still use offline test for stable audio until PR 2452 is merged Signed-off-by: Zhang Jian --- .buildkite/test-nightly.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index ced77293a39..d935a6e245a 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -154,7 +154,8 @@ steps: if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py 
tests/e2e/offline_inference/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: From 0d9f44b3489aca00b2ef4cdd37cad56b1fab35d4 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Sat, 4 Apr 2026 12:55:37 +0000 Subject: [PATCH 10/12] merge 3 tests into 1 Signed-off-by: Zhang Jian --- .buildkite/test-amd-merge.yml | 2 +- .buildkite/test-amd-ready.yaml | 2 +- .buildkite/test-ready.yml | 2 +- docs/contributing/ci/CI_5levels.md | 1 - docs/contributing/ci/tests_style.md | 1 - .../stable_audio_offline_utils.py | 60 ---------------- .../offline_inference/test_stable_audio.py | 43 ------------ .../test_stable_audio_expansion.py | 68 ++++++++++++++++--- 8 files changed, 63 insertions(+), 116 deletions(-) delete mode 100644 tests/e2e/offline_inference/stable_audio_offline_utils.py delete mode 100644 tests/e2e/offline_inference/test_stable_audio.py diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index a1cd9fdc1f4..4c8f08ea87a 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -55,7 +55,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index f36e2b7438f..9f0c848aaae 100644 --- 
a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -46,7 +46,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index feff28dfc3f..c8d7041b857 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -103,7 +103,7 @@ steps: - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 492012bef11..dadc702755b 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -242,7 +242,6 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio.py ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 168aa2907d0..69d5b16d7a5 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -147,7 +147,6 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio.py ├── 
test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py deleted file mode 100644 index 906dea11a41..00000000000 --- a/tests/e2e/offline_inference/stable_audio_offline_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Shared generation helpers for Stable Audio offline e2e tests.""" - -from __future__ import annotations - -import numpy as np -import torch - -from vllm_omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - - -def generate_stable_audio_short_clip( - omni: Omni, - *, - audio_start_in_s: float = 0.0, - audio_end_in_s: float = 2.0, - num_inference_steps: int = 4, - seed: int = 42, -) -> np.ndarray: - """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" - outputs = omni.generate( - prompts={ - "prompt": "The sound of a dog barking", - "negative_prompt": "Low quality.", - }, - sampling_params_list=OmniDiffusionSamplingParams( - num_inference_steps=num_inference_steps, - guidance_scale=7.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - extra_args={ - "audio_start_in_s": audio_start_in_s, - "audio_end_in_s": audio_end_in_s, - }, - ), - ) - - assert outputs is not None - first_output = outputs[0] - # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. - # The nested request_output is the worker OmniRequestOutput - # (e.g. final_output_type="audio") and holds the multimodal payload.
- # Follow-up: add StableAudioPipeline stage YAML, and pass model into - # _create_default_diffusion_stage_cfg so default diffusion metadata can set - # final_output_type to "audio" for future audio pipelines without YAML. - assert first_output.final_output_type == "image" - assert hasattr(first_output, "request_output") and first_output.request_output - - req_out = first_output.request_output - assert isinstance(req_out, OmniRequestOutput) - assert req_out.final_output_type == "audio" - assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output - audio = req_out.multimodal_output.get("audio") - assert isinstance(audio, np.ndarray) - return audio diff --git a/tests/e2e/offline_inference/test_stable_audio.py b/tests/e2e/offline_inference/test_stable_audio.py deleted file mode 100644 index cb8e0400ee0..00000000000 --- a/tests/e2e/offline_inference/test_stable_audio.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""L2 offline inference: basic Stable Audio deployment and output shape.""" - -import sys -from pathlib import Path - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -import pytest - -from tests.conftest import assert_audio_valid -from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip -from tests.utils import hardware_test -from vllm_omni import Omni - -# Use random weights model for CI testing (small, no authentication required) -models = ["linyueqian/stable_audio_random"] - -_SAMPLE_RATE = 44100 -_CLIP_DURATION_S = 2.0 - - -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio(model_name: str) -> None: - m = Omni(model=model_name) - try: - audio = generate_stable_audio_short_clip(m) - 
assert_audio_valid( - audio, - sample_rate=_SAMPLE_RATE, - channels=2, - duration_s=_CLIP_DURATION_S, - ) - finally: - m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py index e0282a4b3b6..ab2727390ca 100644 --- a/tests/e2e/offline_inference/test_stable_audio_expansion.py +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""L4 offline inference: Stable Audio with combined FP8 quantization and TeaCache.""" +"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU).""" + +from __future__ import annotations import sys from pathlib import Path @@ -11,28 +13,78 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +import numpy as np import pytest +import torch from tests.conftest import assert_audio_valid -from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip from tests.utils import hardware_test from vllm_omni import Omni - -models = ["linyueqian/stable_audio_random"] +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform _SAMPLE_RATE = 44100 _CLIP_DURATION_S = 2.0 +def generate_stable_audio_short_clip( + omni: Omni, + *, + audio_start_in_s: float = 0.0, + audio_end_in_s: float = 2.0, + num_inference_steps: int = 4, + seed: int = 42, +) -> np.ndarray: + """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" + outputs = omni.generate( + prompts={ + "prompt": "The sound of a dog barking", + "negative_prompt": "Low quality.", + }, + sampling_params_list=OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + guidance_scale=7.0, + 
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + extra_args={ + "audio_start_in_s": audio_start_in_s, + "audio_end_in_s": audio_end_in_s, + }, + ), + ) + + assert outputs is not None + first_output = outputs[0] + # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. + # The nested request_output is the worker OmniRequestOutput + # (e.g. final_output_type="audio") and holds the multimodal payload. + # Follow-up: add StableAudioPipeline stage YAML, and pass model into + # _create_default_diffusion_stage_cfg so default diffusion metadata can set + # final_output_type to "audio" for future audio pipelines without YAML. + assert first_output.final_output_type == "image" + assert hasattr(first_output, "request_output") and first_output.request_output + + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) + assert req_out.final_output_type == "audio" + assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output + audio = req_out.multimodal_output.get("audio") + assert isinstance(audio, np.ndarray) + return audio + + @pytest.mark.advanced_model @pytest.mark.diffusion @pytest.mark.cache @hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_quantization_and_teacache(model_name: str) -> None: - """TeaCache + FP8 quantization in one run (L4 coverage).""" +def test_stable_audio_quantization_and_teacache() -> None: + """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features). + + CI should provide ``HF_TOKEN`` if the checkpoint is gated. 
+ """ m = Omni( - model=model_name, + model="stabilityai/stable-audio-open-1.0", quantization="fp8", cache_backend="tea_cache", cache_config={"rel_l1_thresh": 0.2}, From 7e6fe0028c11dd68f39114df1c7afd6085d025aa Mon Sep 17 00:00:00 2001 From: Zhang Date: Sun, 12 Apr 2026 08:20:29 +0800 Subject: [PATCH 11/12] chore(docs): drop manual edits to generated offline_inference text_to_audio docs/user_guide/examples/offline_inference/*.md is produced at build time; tea_cache usage remains documented in examples/offline_inference/text_to_audio/README.md. Signed-off-by: Zhang --- docs/user_guide/examples/offline_inference/text_to_audio.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/user_guide/examples/offline_inference/text_to_audio.md b/docs/user_guide/examples/offline_inference/text_to_audio.md index 9be09194bc5..62a70e5254d 100644 --- a/docs/user_guide/examples/offline_inference/text_to_audio.md +++ b/docs/user_guide/examples/offline_inference/text_to_audio.md @@ -26,7 +26,6 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ - --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -38,7 +37,6 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). -- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. 
## Example materials From 8d072d711639c1a9e459d15ebd96607552d75664 Mon Sep 17 00:00:00 2001 From: Zhang Date: Mon, 13 Apr 2026 07:32:52 +0000 Subject: [PATCH 12/12] adapt to pr 2711 Signed-off-by: Zhang --- .../e2e/offline_inference/test_stable_audio_expansion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py index e7b92fa98de..54c1799e145 100644 --- a/tests/e2e/offline_inference/test_stable_audio_expansion.py +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -1,7 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU).""" +"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU). + +NOTE: This test instantiates Omni directly instead of using the omni_runner +fixture (introduced in PR #2711) because the fixture's parametrize interface +only accepts (model, stage_config_path) and does not support extra kwargs like +quantization, cache_backend, or cache_config. +""" from __future__ import annotations