From 4289bc26e56f0ebfe3730fd0e4a3faa598039947 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Wed, 4 Feb 2026 16:05:33 -0500 Subject: [PATCH 1/2] Fix multimodal_output property to check completion outputs where audio data is attached Signed-off-by: linyueqian --- vllm_omni/outputs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 6cba9d96a15..64569846a00 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -124,10 +124,16 @@ def from_diffusion( def multimodal_output(self) -> dict[str, Any]: """Return multimodal output from the underlying request output or local field. - For pipeline outputs, this proxies to request_output.multimodal_output. + For pipeline outputs, this checks completion outputs first, then request_output. For diffusion outputs, this returns the local _multimodal_output field. """ if self.request_output is not None: + # Check completion outputs first (where multimodal_output is attached) + if self.request_output.outputs: + for output in self.request_output.outputs: + mm = getattr(output, "multimodal_output", None) + if mm: + return mm return getattr(self.request_output, "multimodal_output", {}) return self._multimodal_output From 5f2849a97a1380abd1d9331d341c35fa76a53519 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Wed, 4 Feb 2026 16:27:57 -0500 Subject: [PATCH 2/2] Fix client to handle binary audio response instead of assuming UTF-8 Signed-off-by: linyueqian --- .../online_serving/qwen3_tts/openai_speech_client.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/online_serving/qwen3_tts/openai_speech_client.py b/examples/online_serving/qwen3_tts/openai_speech_client.py index 4d70460940c..ea45a74827a 100644 --- a/examples/online_serving/qwen3_tts/openai_speech_client.py +++ b/examples/online_serving/qwen3_tts/openai_speech_client.py @@ -111,9 +111,14 @@ def run_tts_generation(args) -> None: print(response.text) return - if response.content.decode("utf-8").startswith('{"error"'): - print(f"Error: {response.content.decode('utf-8')}") - return + # Check for JSON error response (only if content is valid UTF-8 text) + try: + text = response.content.decode("utf-8") + if text.startswith('{"error"'): + print(f"Error: {text}") + return + except UnicodeDecodeError: + pass # Binary audio data, not an error # Save audio response output_path = args.output or "tts_output.wav"