zhangj1an · zhangj1an · Apr 4, 2026
diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py
@@ -56,13 +56,7 @@ def generate_stable_audio_short_clip(
 
     assert outputs is not None
     first_output = outputs[0]
-    # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata.
-    # The nested request_output is the worker OmniRequestOutput
-    # (e.g. final_output_type="audio") and holds the multimodal payload.
-    # Follow-up: add StableAudioPipeline stage YAML, and pass model into
-    # _create_default_diffusion_stage_cfg so default diffusion metadata can set
-    # final_output_type to "audio" for future audio pipelines without YAML.
-    assert first_output.final_output_type == "image"
+    assert first_output.final_output_type == "audio"
     assert hasattr(first_output, "request_output") and first_output.request_output
 
     req_out = first_output.request_output

diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
@@ -20,6 +20,7 @@
 
 _DIFFUSERS_CLASS_TO_CONFIG: dict[str, str] = {
     "GlmImagePipeline": "glm_image",
+    "StableAudioPipeline": "stable_audio",
 }
 
 

diff --git a/vllm_omni/model_executor/stage_configs/stable_audio.yaml b/vllm_omni/model_executor/stage_configs/stable_audio.yaml
@@ -0,0 +1,8 @@
+# Stage config for StableAudioPipeline (matched by the `_class_name` in model_index). Single-stage, so only the output-modality metadata is required here;
+# Omni/AsyncOmni **kwargs are merged into engine_args, and OmniDiffusionConfig supplies the remaining defaults (e.g. max_num_seqs).
+
+stage_args:
+  - stage_id: 0
+    stage_type: diffusion
+    final_output: true
+    final_output_type: audio
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,7 @@ @@
     _DIFFUSERS_CLASS_TO_CONFIG: dict[str, str] = {
         "GlmImagePipeline": "glm_image",
+        "StableAudioPipeline": "stable_audio",
     }
@@ Expand Down @@