vllm-project · hsliuustc0106 · Apr 14, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -54,7 +54,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1

@@ -69,7 +69,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1

@@ -76,24 +76,6 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Audio Generation Model Test"
-    timeout_in_minutes: 20
-    depends_on: upload-merge-pipeline
-    commands:
-      - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
-    agents:
-      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-            - "HF_TOKEN"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
   - label: "Diffusion Cache Backend Test"
     timeout_in_minutes: 15
     depends_on: upload-merge-pipeline

@@ -123,7 +123,7 @@ steps:
   - label: "Audio Generation Model Test"
     depends_on: upload-ready-pipeline
     commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:

@@ -242,7 +242,7 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio_model.py
+                                                   ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
                                                    ├── test_diffusion_lora.py

@@ -147,7 +147,7 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio_model.py
+                                                   ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
                                                    ├── test_diffusion_lora.py

@@ -147,7 +147,7 @@ The following tables show which models support each feature:
 
 | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution |
 |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:|
-| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
 
 
 ## Feature Compatibility

@@ -23,6 +23,7 @@ python text_to_audio.py \
   --guidance-scale 7.0 \
   --audio-length 10.0 \
   --num-inference-steps 100 \
+  --cache-backend tea_cache \
   --output stable_audio_output.wav
 ```
 
@@ -34,4 +35,5 @@ Key arguments:
 - `--guidance-scale`: classifier-free guidance scale.
 - `--audio-length`: audio duration in seconds.
 - `--num-inference-steps`: diffusion sampling steps (more steps = higher quality, slower).
+- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`.
 - `--output`: path to save the generated WAV file.
@@ -11,6 +11,7 @@
     python text_to_audio.py --prompt "The sound of a dog barking"
     python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0
     python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality"
+    python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache
 """
 
 import argparse
@@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace:
         default=44100,
         help="Sample rate for output audio (Stable Audio uses 44100 Hz).",
     )
+    parser.add_argument(
+        "--cache-backend",
+        type=str,
+        default=None,
+        choices=["tea_cache"],
+        help=(
+            "Cache backend to use for acceleration. "
+            "Stable Audio currently supports 'tea_cache'. "
+            "Default: None (no cache acceleration)."
+        ),
+    )
+    parser.add_argument(
+        "--tea-cache-rel-l1-thresh",
+        type=float,
+        default=0.2,
+        help="[tea_cache] Threshold for accumulated relative L1 distance.",
+    )
     parser.add_argument(
         "--enable-diffusion-pipeline-profiler",
         action="store_true",
@@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410
 def main():
     args = parse_args()
     generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
+    cache_config = None
+    if args.cache_backend == "tea_cache":
+        cache_config = {
+            "rel_l1_thresh": args.tea_cache_rel_l1_thresh,
+        }
 
     print(f"\n{'=' * 60}")
     print("Stable Audio Open - Text-to-Audio Generation")
@@ -134,12 +157,15 @@ def main():
     print(f"  Audio length: {args.audio_length}s")
     print(f"  Inference steps: {args.num_inference_steps}")
     print(f"  Guidance scale: {args.guidance_scale}")
+    print(f"  Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}")
     print(f"  Seed: {args.seed}")
     print(f"{'=' * 60}\n")
 
     # Initialize Omni with Stable Audio model
     omni = Omni(
         model=args.model,
+        cache_backend=args.cache_backend,
+        cache_config=cache_config,
         enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler,
     )
 

@@ -182,6 +182,7 @@ markers = [
     "H100: Tests that require H100 GPU",
     "L4: Tests that require L4 GPU",
     "MI325: Tests that require MI325 GPU (AMD/ROCm)",
+    "B60: Tests that require Intel Arc Pro B60 XPU",
     "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)",
     "A2: Tests that require A2 NPU",
     "A3: Tests that require A3 NPU",

@@ -167,7 +167,6 @@ def assert_audio_diffusion_response(
     Validate audio diffusion response.
     """
     raise NotImplementedError("Audio validation is not implemented yet")
-    # consider using assert_audio_valid defined above
 
 
 def _maybe_int(value: Any) -> int | None:
@@ -277,15 +276,32 @@ def assert_video_valid(
                 pass
 
 
-def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None:
-    """Assert the WAV has the expected sample rate, channel count, and duration."""
+def assert_audio_valid(
+    audio_or_path: Path | np.ndarray,
+    *,
+    sample_rate: int,
+    channels: int,
+    duration_s: float,
+) -> None:
+    """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.
+
+    Args:
+        audio_or_path: Path to an audio file inspected with ``sf.info``, or an
+            in-memory array laid out as (batch, channels, samples) with batch size 1.
+        sample_rate: Expected sample rate in Hz. For arrays it is only used to derive
+            the expected sample count; the array itself carries no rate to compare.
+        channels: Expected channel count.
+        duration_s: Expected duration in seconds.
+
+    Raises:
+        AssertionError: If any checked property differs from the expected value.
+    """
+    # int() truncates, so a fractional sample count is rounded down.
+    expected_samples = int(duration_s * sample_rate)
+    if isinstance(audio_or_path, np.ndarray):
+        # Array input: validate the shape only, then return without touching the filesystem.
+        audio = audio_or_path
+        assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}"
+        assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}"
+        assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}"
+        assert audio.shape[2] == expected_samples, (
+            f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}"
+        )
+        return
+
+    # Path input: compare the file's header metadata against the expectations.
+    path = audio_or_path
     assert path.exists(), f"Audio not found: {path}"
     info = sf.info(str(path))
     assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}"
     assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}"
-    expected_frames = int(duration_s * sample_rate)
-    assert info.frames == expected_frames, (
-        f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}"
+    assert info.frames == expected_samples, (
+        f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}"
     )
 
 

@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU).
+
+NOTE: This test instantiates Omni directly instead of using the omni_runner
+fixture (introduced in PR #2711) because the fixture's parametrize interface
+only accepts (model, stage_config_path) and does not support extra kwargs like
+quantization, cache_backend, or cache_config.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+import torch
+
+from tests.conftest import assert_audio_valid
+from tests.utils import hardware_test
+from vllm_omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+_SAMPLE_RATE = 44100
+_CLIP_DURATION_S = 2.0
+
+
+def generate_stable_audio_short_clip(
+    omni: Omni,
+    *,
+    audio_start_in_s: float = 0.0,
+    audio_end_in_s: float = 2.0,
+    num_inference_steps: int = 4,
+    seed: int = 42,
+) -> np.ndarray:
+    """Run a minimal Stable Audio generation and return audio as (batch, channels, samples)."""
+    outputs = omni.generate(
+        prompts={
+            "prompt": "The sound of a dog barking",
+            "negative_prompt": "Low quality.",
+        },
+        sampling_params_list=OmniDiffusionSamplingParams(
+            num_inference_steps=num_inference_steps,
+            guidance_scale=7.0,
+            generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
+            num_outputs_per_prompt=1,
+            extra_args={
+                "audio_start_in_s": audio_start_in_s,
+                "audio_end_in_s": audio_end_in_s,
+            },
+        ),
+    )
+
+    assert outputs is not None
+    first_output = outputs[0]
+    # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata.
+    # The nested request_output is the worker OmniRequestOutput
+    # (e.g. final_output_type="audio") and holds the multimodal payload.
+    # Follow-up: add StableAudioPipeline stage YAML, and pass model into
+    # _create_default_diffusion_stage_cfg so default diffusion metadata can set
+    # final_output_type to "audio" for future audio pipelines without YAML.
+    assert first_output.final_output_type == "image"
+    assert hasattr(first_output, "request_output") and first_output.request_output
+
+    req_out = first_output.request_output
+    assert isinstance(req_out, OmniRequestOutput)
+    assert req_out.final_output_type == "audio"
+    assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
+    audio = req_out.multimodal_output.get("audio")
+    assert isinstance(audio, np.ndarray)
+    return audio
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@pytest.mark.cache
+@hardware_test(res={"cuda": "L4", "xpu": "B60"})
+def test_stable_audio_quantization_and_teacache() -> None:
+    """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features).
+
+    CI should provide ``HF_TOKEN`` if the checkpoint is gated.
+    """
+    m = Omni(
+        model="stabilityai/stable-audio-open-1.0",
+        quantization="fp8",
+        cache_backend="tea_cache",
+        cache_config={"rel_l1_thresh": 0.2},
+    )
+    try:
+        audio = generate_stable_audio_short_clip(m)
+        assert_audio_valid(
+            audio,
+            sample_rate=_SAMPLE_RATE,
+            channels=2,
+            duration_s=_CLIP_DURATION_S,
+        )
+    finally:
+        m.close()