From 41dbfe6bd0b3660d5c8396225a795f09c041cd1d Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 31 Mar 2026 11:15:12 +0000
Subject: [PATCH 01/12] add teacache test

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 docs/user_guide/diffusion_features.md         |  2 +-
 .../test_stable_audio_model.py                | 38 +++++++++++++++----
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md
index 7e325c1edc8..fda26b53113 100644
--- a/docs/user_guide/diffusion_features.md
+++ b/docs/user_guide/diffusion_features.md
@@ -128,7 +128,7 @@ The following tables show which models support each feature:
 
 | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization |
 |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|
-| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ |
+| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ |
 
 
 ## Feature Compatibility
diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py
index ff4d9b40172..d7d11f0d947 100644
--- a/tests/e2e/offline_inference/test_stable_audio_model.py
+++ b/tests/e2e/offline_inference/test_stable_audio_model.py
@@ -21,13 +21,8 @@
 models = ["linyueqian/stable_audio_random"]
 
 
-@pytest.mark.core_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "L4", "xpu": "B60"})
-@pytest.mark.parametrize("model_name", models)
-def test_stable_audio_model(model_name: str):
-    m = Omni(model=model_name)
-
+def _run_stable_audio_and_validate(m: Omni) -> None:
+    """Run a minimal Stable Audio generation and validate output shape."""
     # Use minimal settings for testing
     # Generate a short 2-second audio clip with minimal inference steps
     audio_start_in_s = 0.0
@@ -70,3 +65,32 @@ def test_stable_audio_model(model_name: str):
     assert audio.shape[1] == 2  # stereo channels
     expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate)
     assert audio.shape[2] == expected_samples  # 88200 samples for 2 seconds
+
+
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4", "xpu": "B60"})
+@pytest.mark.parametrize("model_name", models)
+def test_stable_audio_model(model_name: str):
+    m = Omni(model=model_name)
+    try:
+        _run_stable_audio_and_validate(m)
+    finally:
+        m.close()
+
+
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cache
+@hardware_test(res={"cuda": "L4", "xpu": "B60"})
+@pytest.mark.parametrize("model_name", models)
+def test_stable_audio_teacache(model_name: str):
+    m = Omni(
+        model=model_name,
+        cache_backend="tea_cache",
+        cache_config={"rel_l1_thresh": 0.2},
+    )
+    try:
+        _run_stable_audio_and_validate(m)
+    finally:
+        m.close()

From 405f59143eb8d02ba5e0249327d5c6e80223f02b Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 31 Mar 2026 11:22:19 +0000
Subject: [PATCH 02/12] update to match new diffusion user guide

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .../offline_inference/text_to_audio.md        |  2 ++
 .../offline_inference/text_to_audio/README.md |  2 ++
 .../text_to_audio/text_to_audio.py            | 26 +++++++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/docs/user_guide/examples/offline_inference/text_to_audio.md b/docs/user_guide/examples/offline_inference/text_to_audio.md
index 62a70e5254d..9be09194bc5 100644
--- a/docs/user_guide/examples/offline_inference/text_to_audio.md
+++ b/docs/user_guide/examples/offline_inference/text_to_audio.md
@@ -26,6 +26,7 @@ python text_to_audio.py \
   --guidance-scale 7.0 \
   --audio-length 10.0 \
   --num-inference-steps 100 \
+  --cache-backend tea_cache \
   --output stable_audio_output.wav
 ```
 
@@ -37,6 +38,7 @@ Key arguments:
 - `--guidance-scale`: classifier-free guidance scale.
 - `--audio-length`: audio duration in seconds.
 - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower).
+- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`.
 - `--output`: path to save the generated WAV file.
 
 ## Example materials
diff --git a/examples/offline_inference/text_to_audio/README.md b/examples/offline_inference/text_to_audio/README.md
index 7edc38092ad..50bab3e2f2d 100644
--- a/examples/offline_inference/text_to_audio/README.md
+++ b/examples/offline_inference/text_to_audio/README.md
@@ -23,6 +23,7 @@ python text_to_audio.py \
   --guidance-scale 7.0 \
   --audio-length 10.0 \
   --num-inference-steps 100 \
+  --cache-backend tea_cache \
   --output stable_audio_output.wav
 ```
 
@@ -34,4 +35,5 @@ Key arguments:
 - `--guidance-scale`: classifier-free guidance scale.
 - `--audio-length`: audio duration in seconds.
 - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower).
+- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`.
 - `--output`: path to save the generated WAV file.
diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py
index a6968c419f6..3adb3ad53a5 100644
--- a/examples/offline_inference/text_to_audio/text_to_audio.py
+++ b/examples/offline_inference/text_to_audio/text_to_audio.py
@@ -11,6 +11,7 @@
     python text_to_audio.py --prompt "The sound of a dog barking"
     python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0
     python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality"
+    python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache
 """
 
 import argparse
@@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace:
         default=44100,
         help="Sample rate for output audio (Stable Audio uses 44100 Hz).",
     )
+    parser.add_argument(
+        "--cache-backend",
+        type=str,
+        default=None,
+        choices=["tea_cache"],
+        help=(
+            "Cache backend to use for acceleration. "
+            "Stable Audio currently supports 'tea_cache'. "
+            "Default: None (no cache acceleration)."
+        ),
+    )
+    parser.add_argument(
+        "--tea-cache-rel-l1-thresh",
+        type=float,
+        default=0.2,
+        help="[tea_cache] Threshold for accumulated relative L1 distance.",
+    )
     parser.add_argument(
         "--enable-diffusion-pipeline-profiler",
         action="store_true",
@@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410
 def main():
     args = parse_args()
     generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
+    cache_config = None
+    if args.cache_backend == "tea_cache":
+        cache_config = {
+            "rel_l1_thresh": args.tea_cache_rel_l1_thresh,
+        }
 
     print(f"\n{'=' * 60}")
     print("Stable Audio Open - Text-to-Audio Generation")
@@ -134,12 +157,15 @@ def main():
     print(f"  Audio length: {args.audio_length}s")
     print(f"  Inference steps: {args.num_inference_steps}")
     print(f"  Guidance scale: {args.guidance_scale}")
+    print(f"  Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}")
     print(f"  Seed: {args.seed}")
     print(f"{'=' * 60}\n")
 
     # Initialize Omni with Stable Audio model
     omni = Omni(
         model=args.model,
+        cache_backend=args.cache_backend,
+        cache_config=cache_config,
         enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler,
     )
 

From 8038dcbb3f6ad2d2d5337eec6ea2f2d84c23ae13 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Wed, 1 Apr 2026 06:04:32 +0000
Subject: [PATCH 03/12] register B60 pytest marker in pyproject.toml

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 43e9506fd07..d949318b7f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -181,6 +181,7 @@ markers = [
     "H100: Tests that require H100 GPU",
     "L4: Tests that require L4 GPU",
     "MI325: Tests that require MI325 GPU (AMD/ROCm)",
+    "B60: Tests that require Intel Arc Pro B60 XPU",
     "A2: Tests that require A2 NPU",
     "A3: Tests that require A3 NPU",
     "distributed_cuda: Tests that require multi cards on CUDA platform",

From 193cd5681f16d588ce3ab8833b8f04d2ede3afeb Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Wed, 1 Apr 2026 14:57:59 +0000
Subject: [PATCH 04/12] resolve review comments

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .buildkite/test-amd-merge.yml                 |  2 +-
 .buildkite/test-amd-ready.yaml                |  2 +-
 .buildkite/test-merge.yml                     |  2 +-
 .buildkite/test-nightly.yml                   |  2 +-
 .buildkite/test-ready.yml                     |  2 +-
 docs/contributing/ci/CI_5levels.md            |  3 +-
 docs/contributing/ci/tests_style.md           |  3 +-
 tests/conftest.py                             | 28 ++++--
 .../stable_audio_offline_utils.py             | 54 +++++++++++
 .../offline_inference/test_stable_audio.py    | 43 +++++++++
 .../test_stable_audio_expansion.py            | 49 ++++++++++
 .../test_stable_audio_model.py                | 96 -------------------
 12 files changed, 177 insertions(+), 109 deletions(-)
 create mode 100644 tests/e2e/offline_inference/stable_audio_offline_utils.py
 create mode 100644 tests/e2e/offline_inference/test_stable_audio.py
 create mode 100644 tests/e2e/offline_inference/test_stable_audio_expansion.py
 delete mode 100644 tests/e2e/offline_inference/test_stable_audio_model.py

diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml
index 60ba0d9d416..a1cd9fdc1f4 100644
--- a/.buildkite/test-amd-merge.yml
+++ b/.buildkite/test-amd-merge.yml
@@ -55,7 +55,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1
diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml
index 6e31163accb..f36e2b7438f 100644
--- a/.buildkite/test-amd-ready.yaml
+++ b/.buildkite/test-amd-ready.yaml
@@ -46,7 +46,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1
diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 7bee193191e..0a5bc55378c 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -75,7 +75,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+      - pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 5c6d6d35a65..e35da7f2a8b 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -161,7 +161,7 @@ steps:
     if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 89839a2d1ed..e9b11ce3165 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -103,7 +103,7 @@ steps:
   - label: "Audio Generation Model Test"
     depends_on: upload-ready-pipeline
     commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 81392b201da..e67d4dac1bd 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -243,7 +243,8 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio_model.py
+                                                   ├── test_stable_audio.py
+                                                   ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
                                                    ├── test_diffusion_lora.py
diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md
index 0b07c5ffe4a..2b226a2c990 100644
--- a/docs/contributing/ci/tests_style.md
+++ b/docs/contributing/ci/tests_style.md
@@ -148,7 +148,8 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio_model.py
+                                                   ├── test_stable_audio.py
+                                                   ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
                                                    ├── test_diffusion_lora.py
diff --git a/tests/conftest.py b/tests/conftest.py
index fb888695428..967f9fbd6fb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -162,7 +162,6 @@ def assert_audio_diffusion_response(
     Validate audio diffusion response.
     """
     raise NotImplementedError("Audio validation is not implemented yet")
-    # consider using assert_audio_valid defined above
 
 
 def _maybe_int(value: Any) -> int | None:
@@ -272,15 +271,32 @@ def assert_video_valid(
                 pass
 
 
-def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None:
-    """Assert the WAV has the expected sample rate, channel count, and duration."""
+def assert_audio_valid(
+    audio_or_path: Path | np.ndarray,
+    *,
+    sample_rate: int,
+    channels: int,
+    duration_s: float,
+) -> None:
+    """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format."""
+    expected_samples = int(duration_s * sample_rate)
+    if isinstance(audio_or_path, np.ndarray):
+        audio = audio_or_path
+        assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}"
+        assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}"
+        assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}"
+        assert audio.shape[2] == expected_samples, (
+            f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}"
+        )
+        return
+
+    path = audio_or_path
     assert path.exists(), f"Audio not found: {path}"
     info = sf.info(str(path))
     assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}"
     assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}"
-    expected_frames = int(duration_s * sample_rate)
-    assert info.frames == expected_frames, (
-        f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}"
+    assert info.frames == expected_samples, (
+        f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}"
     )
 
 
diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py
new file mode 100644
index 00000000000..cf602348393
--- /dev/null
+++ b/tests/e2e/offline_inference/stable_audio_offline_utils.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Shared generation helpers for Stable Audio offline e2e tests."""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+from vllm_omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+
+def generate_stable_audio_short_clip(
+    omni: Omni,
+    *,
+    audio_start_in_s: float = 0.0,
+    audio_end_in_s: float = 2.0,
+    num_inference_steps: int = 4,
+    seed: int = 42,
+) -> np.ndarray:
+    """Run a minimal Stable Audio generation and return audio as (batch, channels, samples)."""
+    outputs = omni.generate(
+        prompts={
+            "prompt": "The sound of a dog barking",
+            "negative_prompt": "Low quality.",
+        },
+        sampling_params_list=OmniDiffusionSamplingParams(
+            num_inference_steps=num_inference_steps,
+            guidance_scale=7.0,
+            generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
+            num_outputs_per_prompt=1,
+            extra_args={
+                "audio_start_in_s": audio_start_in_s,
+                "audio_end_in_s": audio_end_in_s,
+            },
+        ),
+    )
+
+    assert outputs is not None
+    first_output = outputs[0]
+    assert first_output.final_output_type == "image"
+    assert hasattr(first_output, "request_output") and first_output.request_output
+
+    req_out = first_output.request_output
+    assert isinstance(req_out, OmniRequestOutput)
+    assert req_out.final_output_type == "audio"
+    assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
+    audio = req_out.multimodal_output.get("audio")
+    assert isinstance(audio, np.ndarray)
+    return audio
diff --git a/tests/e2e/offline_inference/test_stable_audio.py b/tests/e2e/offline_inference/test_stable_audio.py
new file mode 100644
index 00000000000..cb8e0400ee0
--- /dev/null
+++ b/tests/e2e/offline_inference/test_stable_audio.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""L2 offline inference: basic Stable Audio deployment and output shape."""
+
+import sys
+from pathlib import Path
+
+# ruff: noqa: E402
+REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+import pytest
+
+from tests.conftest import assert_audio_valid
+from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip
+from tests.utils import hardware_test
+from vllm_omni import Omni
+
+# Use random weights model for CI testing (small, no authentication required)
+models = ["linyueqian/stable_audio_random"]
+
+_SAMPLE_RATE = 44100
+_CLIP_DURATION_S = 2.0
+
+
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4", "xpu": "B60"})
+@pytest.mark.parametrize("model_name", models)
+def test_stable_audio(model_name: str) -> None:
+    m = Omni(model=model_name)
+    try:
+        audio = generate_stable_audio_short_clip(m)
+        assert_audio_valid(
+            audio,
+            sample_rate=_SAMPLE_RATE,
+            channels=2,
+            duration_s=_CLIP_DURATION_S,
+        )
+    finally:
+        m.close()
diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py
new file mode 100644
index 00000000000..e0282a4b3b6
--- /dev/null
+++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""L4 offline inference: Stable Audio with combined FP8 quantization and TeaCache."""
+
+import sys
+from pathlib import Path
+
+# ruff: noqa: E402
+REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+import pytest
+
+from tests.conftest import assert_audio_valid
+from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip
+from tests.utils import hardware_test
+from vllm_omni import Omni
+
+models = ["linyueqian/stable_audio_random"]
+
+_SAMPLE_RATE = 44100
+_CLIP_DURATION_S = 2.0
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@pytest.mark.cache
+@hardware_test(res={"cuda": "L4", "xpu": "B60"})
+@pytest.mark.parametrize("model_name", models)
+def test_stable_audio_quantization_and_teacache(model_name: str) -> None:
+    """TeaCache + FP8 quantization in one run (L4 coverage)."""
+    m = Omni(
+        model=model_name,
+        quantization="fp8",
+        cache_backend="tea_cache",
+        cache_config={"rel_l1_thresh": 0.2},
+    )
+    try:
+        audio = generate_stable_audio_short_clip(m)
+        assert_audio_valid(
+            audio,
+            sample_rate=_SAMPLE_RATE,
+            channels=2,
+            duration_s=_CLIP_DURATION_S,
+        )
+    finally:
+        m.close()
diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py
deleted file mode 100644
index d7d11f0d947..00000000000
--- a/tests/e2e/offline_inference/test_stable_audio_model.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import sys
-from pathlib import Path
-
-import numpy as np
-import pytest
-import torch
-
-from tests.utils import hardware_test
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
-
-# ruff: noqa: E402
-REPO_ROOT = Path(__file__).resolve().parents[2]
-if str(REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(REPO_ROOT))
-
-from vllm_omni import Omni
-
-# Use random weights model for CI testing (small, no authentication required)
-models = ["linyueqian/stable_audio_random"]
-
-
-def _run_stable_audio_and_validate(m: Omni) -> None:
-    """Run a minimal Stable Audio generation and validate output shape."""
-    # Use minimal settings for testing
-    # Generate a short 2-second audio clip with minimal inference steps
-    audio_start_in_s = 0.0
-    audio_end_in_s = 2.0  # Short duration for fast testing
-    sample_rate = 44100  # Stable Audio uses 44100 Hz
-
-    outputs = m.generate(
-        prompts={
-            "prompt": "The sound of a dog barking",
-            "negative_prompt": "Low quality.",
-        },
-        sampling_params_list=OmniDiffusionSamplingParams(
-            num_inference_steps=4,  # Minimal steps for speed
-            guidance_scale=7.0,
-            generator=torch.Generator(current_omni_platform.device_type).manual_seed(42),
-            num_outputs_per_prompt=1,
-            extra_args={
-                "audio_start_in_s": audio_start_in_s,
-                "audio_end_in_s": audio_end_in_s,
-            },
-        ),
-    )
-
-    # Extract audio from OmniRequestOutput
-    assert outputs is not None
-    first_output = outputs[0]
-    assert first_output.final_output_type == "image"
-    assert hasattr(first_output, "request_output") and first_output.request_output
-
-    req_out = first_output.request_output
-    assert isinstance(req_out, OmniRequestOutput)
-    assert req_out.final_output_type == "audio"
-    assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
-    audio = req_out.multimodal_output.get("audio")
-    assert isinstance(audio, np.ndarray)
-    # audio shape: (batch, channels, samples)
-    # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples
-    assert audio.ndim == 3
-    assert audio.shape[0] == 1  # batch size
-    assert audio.shape[1] == 2  # stereo channels
-    expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate)
-    assert audio.shape[2] == expected_samples  # 88200 samples for 2 seconds
-
-
-@pytest.mark.core_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "L4", "xpu": "B60"})
-@pytest.mark.parametrize("model_name", models)
-def test_stable_audio_model(model_name: str):
-    m = Omni(model=model_name)
-    try:
-        _run_stable_audio_and_validate(m)
-    finally:
-        m.close()
-
-
-@pytest.mark.core_model
-@pytest.mark.diffusion
-@pytest.mark.cache
-@hardware_test(res={"cuda": "L4", "xpu": "B60"})
-@pytest.mark.parametrize("model_name", models)
-def test_stable_audio_teacache(model_name: str):
-    m = Omni(
-        model=model_name,
-        cache_backend="tea_cache",
-        cache_config={"rel_l1_thresh": 0.2},
-    )
-    try:
-        _run_stable_audio_and_validate(m)
-    finally:
-        m.close()

From 466396bebad6b7f1907610eefe829454980069f7 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Wed, 1 Apr 2026 15:02:35 +0000
Subject: [PATCH 05/12] revert .buildkite/test-nightly.yml to its previous state

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .buildkite/test-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index e35da7f2a8b..5c6d6d35a65 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -161,7 +161,7 @@ steps:
     if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:

From e05b372ba2e710b6f246d4b369d2e2004b4c5334 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Wed, 1 Apr 2026 15:19:05 +0000
Subject: [PATCH 06/12] run offline_inference expansion tests in nightly pipeline

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .buildkite/test-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 5c6d6d35a65..0307286913b 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -161,7 +161,7 @@ steps:
     if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:

From 3c3a262fcb43c4d4164c5d861c7622e14eae0263 Mon Sep 17 00:00:00 2001
From: Zhang Jian <jianmusings@gmail.com>
Date: Sat, 4 Apr 2026 03:36:31 +0000
Subject: [PATCH 07/12] add comment to explain confusing config

Signed-off-by: Zhang Jian <jianmusings@gmail.com>
---
 tests/e2e/offline_inference/stable_audio_offline_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py
index cf602348393..906dea11a41 100644
--- a/tests/e2e/offline_inference/stable_audio_offline_utils.py
+++ b/tests/e2e/offline_inference/stable_audio_offline_utils.py
@@ -42,6 +42,12 @@ def generate_stable_audio_short_clip(
 
     assert outputs is not None
     first_output = outputs[0]
+    # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. 
+    # The nested request_output is the worker OmniRequestOutput
+    # (e.g. final_output_type="audio") and holds the multimodal payload.
+    # Follow-up: add StableAudioPipeline stage YAML, and pass model into
+    # _create_default_diffusion_stage_cfg so default diffusion metadata can set
+    # final_output_type to "audio" for future audio pipelines without YAML.
     assert first_output.final_output_type == "image"
     assert hasattr(first_output, "request_output") and first_output.request_output
 

From 4ce6d4ff7f490ff3482461f6dd02ea18b84b22e7 Mon Sep 17 00:00:00 2001
From: Zhang Jian <jianmusings@gmail.com>
Date: Sat, 4 Apr 2026 05:55:37 +0000
Subject: [PATCH 08/12] remove duplicated Audio Generation Model Test step from test-merge.yml

Signed-off-by: Zhang Jian <jianmusings@gmail.com>
---
 .buildkite/test-merge.yml | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index c594f128cc8..7a903d9df7c 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -71,24 +71,6 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Audio Generation Model Test"
-    timeout_in_minutes: 20
-    depends_on: upload-merge-pipeline
-    commands:
-      - pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
-    agents:
-      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-            - "HF_TOKEN"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
   - label: "Diffusion Cache Backend Test"
     timeout_in_minutes: 15
     depends_on: upload-merge-pipeline

From 2763982c4601619c53aff6712a4a18dead2ca6f3 Mon Sep 17 00:00:00 2001
From: Zhang Jian <jianmusings@gmail.com>
Date: Sat, 4 Apr 2026 05:56:28 +0000
Subject: [PATCH 09/12] still use offline test for stable audio until PR 2452
 is merged

Signed-off-by: Zhang Jian <jianmusings@gmail.com>
---
 .buildkite/test-nightly.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index ced77293a39..d935a6e245a 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -154,7 +154,8 @@ steps:
     if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py tests/e2e/offline_inference/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:

From 0d9f44b3489aca00b2ef4cdd37cad56b1fab35d4 Mon Sep 17 00:00:00 2001
From: Zhang Jian <jianmusings@gmail.com>
Date: Sat, 4 Apr 2026 12:55:37 +0000
Subject: [PATCH 10/12] merge 3 tests into 1

Signed-off-by: Zhang Jian <jianmusings@gmail.com>
---
 .buildkite/test-amd-merge.yml                 |  2 +-
 .buildkite/test-amd-ready.yaml                |  2 +-
 .buildkite/test-ready.yml                     |  2 +-
 docs/contributing/ci/CI_5levels.md            |  1 -
 docs/contributing/ci/tests_style.md           |  1 -
 .../stable_audio_offline_utils.py             | 60 ----------------
 .../offline_inference/test_stable_audio.py    | 43 ------------
 .../test_stable_audio_expansion.py            | 68 ++++++++++++++++---
 8 files changed, 63 insertions(+), 116 deletions(-)
 delete mode 100644 tests/e2e/offline_inference/stable_audio_offline_utils.py
 delete mode 100644 tests/e2e/offline_inference/test_stable_audio.py

diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml
index a1cd9fdc1f4..4c8f08ea87a 100644
--- a/.buildkite/test-amd-merge.yml
+++ b/.buildkite/test-amd-merge.yml
@@ -55,7 +55,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1
diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml
index f36e2b7438f..9f0c848aaae 100644
--- a/.buildkite/test-amd-ready.yaml
+++ b/.buildkite/test-amd-ready.yaml
@@ -46,7 +46,7 @@ steps:
 #     - export GPU_ARCHS=gfx942
 #     - export VLLM_LOGGING_LEVEL=DEBUG
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
 
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index feff28dfc3f..c8d7041b857 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -103,7 +103,7 @@ steps:
   - label: "Audio Generation Model Test"
     depends_on: upload-ready-pipeline
     commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio.py
+      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 492012bef11..dadc702755b 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -242,7 +242,6 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio.py
                                                    ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md
index 168aa2907d0..69d5b16d7a5 100644
--- a/docs/contributing/ci/tests_style.md
+++ b/docs/contributing/ci/tests_style.md
@@ -147,7 +147,6 @@ vllm_omni/                                    tests/
                                                    ├── test_zimage_tensor_parallel.py
                                                    ├── test_cache_dit.py
                                                    ├── test_teacache.py
-                                                   ├── test_stable_audio.py
                                                    ├── test_stable_audio_expansion.py
                                                    ├── test_diffusion_cpu_offload.py
                                                    ├── test_diffusion_layerwise_offload.py
diff --git a/tests/e2e/offline_inference/stable_audio_offline_utils.py b/tests/e2e/offline_inference/stable_audio_offline_utils.py
deleted file mode 100644
index 906dea11a41..00000000000
--- a/tests/e2e/offline_inference/stable_audio_offline_utils.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-"""Shared generation helpers for Stable Audio offline e2e tests."""
-
-from __future__ import annotations
-
-import numpy as np
-import torch
-
-from vllm_omni import Omni
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
-
-
-def generate_stable_audio_short_clip(
-    omni: Omni,
-    *,
-    audio_start_in_s: float = 0.0,
-    audio_end_in_s: float = 2.0,
-    num_inference_steps: int = 4,
-    seed: int = 42,
-) -> np.ndarray:
-    """Run a minimal Stable Audio generation and return audio as (batch, channels, samples)."""
-    outputs = omni.generate(
-        prompts={
-            "prompt": "The sound of a dog barking",
-            "negative_prompt": "Low quality.",
-        },
-        sampling_params_list=OmniDiffusionSamplingParams(
-            num_inference_steps=num_inference_steps,
-            guidance_scale=7.0,
-            generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
-            num_outputs_per_prompt=1,
-            extra_args={
-                "audio_start_in_s": audio_start_in_s,
-                "audio_end_in_s": audio_end_in_s,
-            },
-        ),
-    )
-
-    assert outputs is not None
-    first_output = outputs[0]
-    # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. 
-    # The nested request_output is the worker OmniRequestOutput
-    # (e.g. final_output_type="audio") and holds the multimodal payload.
-    # Follow-up: add StableAudioPipeline stage YAML, and pass model into
-    # _create_default_diffusion_stage_cfg so default diffusion metadata can set
-    # final_output_type to "audio" for future audio pipelines without YAML.
-    assert first_output.final_output_type == "image"
-    assert hasattr(first_output, "request_output") and first_output.request_output
-
-    req_out = first_output.request_output
-    assert isinstance(req_out, OmniRequestOutput)
-    assert req_out.final_output_type == "audio"
-    assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
-    audio = req_out.multimodal_output.get("audio")
-    assert isinstance(audio, np.ndarray)
-    return audio
diff --git a/tests/e2e/offline_inference/test_stable_audio.py b/tests/e2e/offline_inference/test_stable_audio.py
deleted file mode 100644
index cb8e0400ee0..00000000000
--- a/tests/e2e/offline_inference/test_stable_audio.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-"""L2 offline inference: basic Stable Audio deployment and output shape."""
-
-import sys
-from pathlib import Path
-
-# ruff: noqa: E402
-REPO_ROOT = Path(__file__).resolve().parents[2]
-if str(REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(REPO_ROOT))
-
-import pytest
-
-from tests.conftest import assert_audio_valid
-from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip
-from tests.utils import hardware_test
-from vllm_omni import Omni
-
-# Use random weights model for CI testing (small, no authentication required)
-models = ["linyueqian/stable_audio_random"]
-
-_SAMPLE_RATE = 44100
-_CLIP_DURATION_S = 2.0
-
-
-@pytest.mark.core_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "L4", "xpu": "B60"})
-@pytest.mark.parametrize("model_name", models)
-def test_stable_audio(model_name: str) -> None:
-    m = Omni(model=model_name)
-    try:
-        audio = generate_stable_audio_short_clip(m)
-        assert_audio_valid(
-            audio,
-            sample_rate=_SAMPLE_RATE,
-            channels=2,
-            duration_s=_CLIP_DURATION_S,
-        )
-    finally:
-        m.close()
diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py
index e0282a4b3b6..ab2727390ca 100644
--- a/tests/e2e/offline_inference/test_stable_audio_expansion.py
+++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-"""L4 offline inference: Stable Audio with combined FP8 quantization and TeaCache."""
+"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU)."""
+
+from __future__ import annotations
 
 import sys
 from pathlib import Path
@@ -11,28 +13,78 @@
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 
+import numpy as np
 import pytest
+import torch
 
 from tests.conftest import assert_audio_valid
-from tests.e2e.offline_inference.stable_audio_offline_utils import generate_stable_audio_short_clip
 from tests.utils import hardware_test
 from vllm_omni import Omni
-
-models = ["linyueqian/stable_audio_random"]
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
 
 _SAMPLE_RATE = 44100
 _CLIP_DURATION_S = 2.0
 
 
+def generate_stable_audio_short_clip(
+    omni: Omni,
+    *,
+    audio_start_in_s: float = 0.0,
+    audio_end_in_s: float = _CLIP_DURATION_S,
+    num_inference_steps: int = 4,
+    seed: int = 42,
+) -> np.ndarray:
+    """Run a minimal Stable Audio generation and return the worker's ``audio`` array.
+
+    Clip bounds travel in ``extra_args``; ``seed`` seeds a ``torch.Generator`` on the
+    current platform device. The result was documented as (batch, channels, samples);
+    that shape is not checked here. Raises ``AssertionError`` on unexpected output.
+    """
+    outputs = omni.generate(
+        prompts={"prompt": "The sound of a dog barking", "negative_prompt": "Low quality."},
+        sampling_params_list=OmniDiffusionSamplingParams(
+            num_inference_steps=num_inference_steps,
+            guidance_scale=7.0,
+            generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
+            num_outputs_per_prompt=1,
+            extra_args={"audio_start_in_s": audio_start_in_s, "audio_end_in_s": audio_end_in_s},
+        ),
+    )
+
+    # Truthiness rejects both None and an empty result before indexing below.
+    assert outputs, "Omni.generate returned no outputs"
+    first_output = outputs[0]
+    # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata.
+    # The nested request_output is the worker OmniRequestOutput
+    # (e.g. final_output_type="audio") and holds the multimodal payload.
+    # Follow-up: add StableAudioPipeline stage YAML, and pass model into
+    # _create_default_diffusion_stage_cfg so default diffusion metadata can set
+    # final_output_type to "audio" for future audio pipelines without YAML.
+    assert first_output.final_output_type == "image"
+    assert getattr(first_output, "request_output", None), "outer output has no request_output"
+
+    req_out = first_output.request_output
+    assert isinstance(req_out, OmniRequestOutput)
+    assert req_out.final_output_type == "audio"
+    assert getattr(req_out, "multimodal_output", None), "worker output has no multimodal_output"
+    audio = req_out.multimodal_output.get("audio")
+    assert isinstance(audio, np.ndarray), f"expected np.ndarray audio, got {type(audio).__name__}"
+    return audio
+
+
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
 @pytest.mark.cache
 @hardware_test(res={"cuda": "L4", "xpu": "B60"})
-@pytest.mark.parametrize("model_name", models)
-def test_stable_audio_quantization_and_teacache(model_name: str) -> None:
-    """TeaCache + FP8 quantization in one run (L4 coverage)."""
+def test_stable_audio_quantization_and_teacache() -> None:
+    """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features).
+
+    CI should provide ``HF_TOKEN`` if the checkpoint is gated.
+    """
     m = Omni(
-        model=model_name,
+        model="stabilityai/stable-audio-open-1.0",
         quantization="fp8",
         cache_backend="tea_cache",
         cache_config={"rel_l1_thresh": 0.2},

From 7e6fe0028c11dd68f39114df1c7afd6085d025aa Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Sun, 12 Apr 2026 08:20:29 +0800
Subject: [PATCH 11/12] chore(docs): drop manual edits to generated
 offline_inference text_to_audio

docs/user_guide/examples/offline_inference/*.md is produced at build time;
tea_cache usage remains documented in examples/offline_inference/text_to_audio/README.md.

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 docs/user_guide/examples/offline_inference/text_to_audio.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/user_guide/examples/offline_inference/text_to_audio.md b/docs/user_guide/examples/offline_inference/text_to_audio.md
index 9be09194bc5..62a70e5254d 100644
--- a/docs/user_guide/examples/offline_inference/text_to_audio.md
+++ b/docs/user_guide/examples/offline_inference/text_to_audio.md
@@ -26,7 +26,6 @@ python text_to_audio.py \
   --guidance-scale 7.0 \
   --audio-length 10.0 \
   --num-inference-steps 100 \
-  --cache-backend tea_cache \
   --output stable_audio_output.wav
 ```
 
@@ -38,7 +37,6 @@ Key arguments:
 - `--guidance-scale`: classifier-free guidance scale.
 - `--audio-length`: audio duration in seconds.
 - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower).
-- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`.
 - `--output`: path to save the generated WAV file.
 
 ## Example materials

From 8d072d711639c1a9e459d15ebd96607552d75664 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Mon, 13 Apr 2026 07:32:52 +0000
Subject: [PATCH 12/12] document why the test bypasses the omni_runner fixture (PR #2711)

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .../e2e/offline_inference/test_stable_audio_expansion.py  | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py
index e7b92fa98de..54c1799e145 100644
--- a/tests/e2e/offline_inference/test_stable_audio_expansion.py
+++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py
@@ -1,7 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU)."""
+"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU).
+
+NOTE: This test instantiates Omni directly instead of using the omni_runner
+fixture (introduced in PR #2711) because the fixture's parametrize interface
+only accepts (model, stage_config_path) and does not support extra kwargs like
+quantization, cache_backend, or cache_config.
+"""
 
 from __future__ import annotations