diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py index ab2727390ca..8ad77c17d7f 100644 --- a/tests/e2e/offline_inference/test_stable_audio_expansion.py +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -56,13 +56,7 @@ def generate_stable_audio_short_clip( assert outputs is not None first_output = outputs[0] - # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. - # The nested request_output is the worker OmniRequestOutput - # (e.g. final_output_type="audio") and holds the multimodal payload. - # Follow-up: add StableAudioPipeline stage YAML, and pass model into - # _create_default_diffusion_stage_cfg so default diffusion metadata can set - # final_output_type to "audio" for future audio pipelines without YAML. - assert first_output.final_output_type == "image" + assert first_output.final_output_type == "audio" assert hasattr(first_output, "request_output") and first_output.request_output req_out = first_output.request_output diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index e29e9eea1c2..94bf85b663c 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -20,6 +20,7 @@ _DIFFUSERS_CLASS_TO_CONFIG: dict[str, str] = { "GlmImagePipeline": "glm_image", + "StableAudioPipeline": "stable_audio", } diff --git a/vllm_omni/model_executor/stage_configs/stable_audio.yaml b/vllm_omni/model_executor/stage_configs/stable_audio.yaml new file mode 100644 index 00000000000..acaff236a83 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/stable_audio.yaml @@ -0,0 +1,8 @@ +# Stage config for StableAudioPipeline (the `_class_name` in the model's model_index). Single-stage pipeline: only the output-modality metadata needs to be declared here. +# Engine settings are intentionally omitted: Omni/AsyncOmni **kwargs are merged into engine_args, and OmniDiffusionConfig supplies the remaining defaults (e.g. max_num_seqs). + +stage_args: + - stage_id: 0 + stage_type: diffusion + final_output: true + final_output_type: audio