diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index b6f2037d18a..ac52f60b35b 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -54,7 +54,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index ced91635c25..30bbc769412 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -69,7 +69,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 24fc6dd3dc2..2a6cb6488a0 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -76,24 +76,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Audio Generation Model Test" - timeout_in_minutes: 20 - depends_on: upload-merge-pipeline - commands: - - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py - agents: - queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Cache Backend Test" timeout_in_minutes: 15 depends_on: upload-merge-pipeline diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 13a812a62f3..2f749f0ee9f 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -123,7 +123,7 @@ steps: - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 74ae1a38eb8..93060357385 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -242,7 +242,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 8b10cf4cc1c..69d5b16d7a5 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -147,7 +147,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 2f28131ee55..fae614cd901 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -147,7 +147,7 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | +| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ## Feature Compatibility diff --git a/examples/offline_inference/text_to_audio/README.md b/examples/offline_inference/text_to_audio/README.md index 7edc38092ad..50bab3e2f2d 100644 --- a/examples/offline_inference/text_to_audio/README.md +++ b/examples/offline_inference/text_to_audio/README.md @@ -23,6 +23,7 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ + --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -34,4 +35,5 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). +- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index a6968c419f6..3adb3ad53a5 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -11,6 +11,7 @@ python text_to_audio.py --prompt "The sound of a dog barking" python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0 python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality" + python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache """ import argparse @@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) + parser.add_argument( + "--cache-backend", + type=str, + default=None, + choices=["tea_cache"], + help=( + "Cache backend to use for acceleration. " + "Stable Audio currently supports 'tea_cache'. " + "Default: None (no cache acceleration)." + ), + ) + parser.add_argument( + "--tea-cache-rel-l1-thresh", + type=float, + default=0.2, + help="[tea_cache] Threshold for accumulated relative L1 distance.", + ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", @@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410 def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + cache_config = None + if args.cache_backend == "tea_cache": + cache_config = { + "rel_l1_thresh": args.tea_cache_rel_l1_thresh, + } print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") @@ -134,12 +157,15 @@ def main(): print(f" Audio length: {args.audio_length}s") print(f" Inference steps: {args.num_inference_steps}") print(f" Guidance scale: {args.guidance_scale}") + print(f" Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}") print(f" Seed: {args.seed}") print(f"{'=' * 60}\n") # Initialize Omni with Stable Audio model omni = Omni( model=args.model, + cache_backend=args.cache_backend, + cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, ) diff --git a/pyproject.toml b/pyproject.toml index e49aa6e3251..57a4b474fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -182,6 +182,7 @@ markers = [ "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "B60: Tests that require Intel Arc Pro B60 XPU", "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", diff --git a/tests/conftest.py b/tests/conftest.py index 9c739533b83..e41d15bdf56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -167,7 +167,6 @@ def assert_audio_diffusion_response( Validate audio diffusion response. """ raise NotImplementedError("Audio validation is not implemented yet") - # consider using assert_audio_valid defined above def _maybe_int(value: Any) -> int | None: @@ -277,15 +276,32 @@ def assert_video_valid( pass -def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None: - """Assert the WAV has the expected sample rate, channel count, and duration.""" +def assert_audio_valid( + audio_or_path: Path | np.ndarray, + *, + sample_rate: int, + channels: int, + duration_s: float, +) -> None: + """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" + expected_samples = int(duration_s * sample_rate) + if isinstance(audio_or_path, np.ndarray): + audio = audio_or_path + assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" + assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" + assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" + assert audio.shape[2] == expected_samples, ( + f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" + ) + return + + path = audio_or_path assert path.exists(), f"Audio not found: {path}" info = sf.info(str(path)) assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - expected_frames = int(duration_s * sample_rate) - assert info.frames == expected_frames, ( - f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" + assert info.frames == expected_samples, ( + f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" ) diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py new file mode 100644 index 00000000000..54c1799e145 --- /dev/null +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU). + +NOTE: This test instantiates Omni directly instead of using the omni_runner +fixture (introduced in PR #2711) because the fixture's parametrize interface +only accepts (model, stage_config_path) and does not support extra kwargs like +quantization, cache_backend, or cache_config. +""" + +from __future__ import annotations + +import numpy as np +import pytest +import torch + +from tests.conftest import assert_audio_valid +from tests.utils import hardware_test +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +_SAMPLE_RATE = 44100 +_CLIP_DURATION_S = 2.0 + + +def generate_stable_audio_short_clip( + omni: Omni, + *, + audio_start_in_s: float = 0.0, + audio_end_in_s: float = 2.0, + num_inference_steps: int = 4, + seed: int = 42, +) -> np.ndarray: + """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" + outputs = omni.generate( + prompts={ + "prompt": "The sound of a dog barking", + "negative_prompt": "Low quality.", + }, + sampling_params_list=OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + guidance_scale=7.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + extra_args={ + "audio_start_in_s": audio_start_in_s, + "audio_end_in_s": audio_end_in_s, + }, + ), + ) + + assert outputs is not None + first_output = outputs[0] + # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. + # The nested request_output is the worker OmniRequestOutput + # (e.g. final_output_type="audio") and holds the multimodal payload. + # Follow-up: add StableAudioPipeline stage YAML, and pass model into + # _create_default_diffusion_stage_cfg so default diffusion metadata can set + # final_output_type to "audio" for future audio pipelines without YAML. + assert first_output.final_output_type == "image" + assert hasattr(first_output, "request_output") and first_output.request_output + + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) + assert req_out.final_output_type == "audio" + assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output + audio = req_out.multimodal_output.get("audio") + assert isinstance(audio, np.ndarray) + return audio + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +def test_stable_audio_quantization_and_teacache() -> None: + """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features). + + CI should provide ``HF_TOKEN`` if the checkpoint is gated. + """ + m = Omni( + model="stabilityai/stable-audio-open-1.0", + quantization="fp8", + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, + ) + try: + audio = generate_stable_audio_short_clip(m) + assert_audio_valid( + audio, + sample_rate=_SAMPLE_RATE, + channels=2, + duration_s=_CLIP_DURATION_S, + ) + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py deleted file mode 100644 index 21d75aad52a..00000000000 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -import pytest -import torch - -from tests.utils import hardware_test -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -# Use random weights model for CI testing (small, no authentication required) -models = ["linyueqian/stable_audio_random"] - -# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. -test_params = [(m, None) for m in models] - - -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("omni_runner", test_params, indirect=True) -def test_stable_audio_model(omni_runner): - # Use minimal settings for testing - # Generate a short 2-second audio clip with minimal inference steps - audio_start_in_s = 0.0 - audio_end_in_s = 2.0 # Short duration for fast testing - sample_rate = 44100 # Stable Audio uses 44100 Hz - - outputs = omni_runner.omni.generate( - prompts={ - "prompt": "The sound of a dog barking", - "negative_prompt": "Low quality.", - }, - sampling_params_list=OmniDiffusionSamplingParams( - num_inference_steps=4, # Minimal steps for speed - guidance_scale=7.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=1, - extra_args={ - "audio_start_in_s": audio_start_in_s, - "audio_end_in_s": audio_end_in_s, - }, - ), - ) - - # Extract audio from OmniRequestOutput - assert outputs is not None - first_output = outputs[0] - assert first_output.final_output_type == "image" - assert hasattr(first_output, "request_output") and first_output.request_output - - req_out = first_output.request_output - assert isinstance(req_out, OmniRequestOutput) - assert req_out.final_output_type == "audio" - assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output - audio = req_out.multimodal_output.get("audio") - assert isinstance(audio, np.ndarray) - # audio shape: (batch, channels, samples) - # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples - assert audio.ndim == 3 - assert audio.shape[0] == 1 # batch size - assert audio.shape[1] == 2 # stereo channels - expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate) - assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds