From 4289bc26e56f0ebfe3730fd0e4a3faa598039947 Mon Sep 17 00:00:00 2001
From: linyueqian <linyueqian@outlook.com>
Date: Wed, 4 Feb 2026 16:05:33 -0500
Subject: [PATCH 1/2] Fix multimodal_output property to check completion
 outputs where audio data is attached

Signed-off-by: linyueqian <linyueqian@outlook.com>
---
 vllm_omni/outputs.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py
index 6cba9d96a15..64569846a00 100644
--- a/vllm_omni/outputs.py
+++ b/vllm_omni/outputs.py
@@ -124,10 +124,16 @@ def from_diffusion(
     def multimodal_output(self) -> dict[str, Any]:
         """Return multimodal output from the underlying request output or local field.
 
-        For pipeline outputs, this proxies to request_output.multimodal_output.
+        For pipeline outputs, this checks completion outputs first, then request_output.
         For diffusion outputs, this returns the local _multimodal_output field.
         """
         if self.request_output is not None:
+            # Multimodal data (e.g. audio) is attached to the completion outputs,
+            # so return the first non-empty one before the request-level field.
+            for output in getattr(self.request_output, "outputs", None) or ():
+                mm = getattr(output, "multimodal_output", None)
+                if mm:
+                    return mm
             return getattr(self.request_output, "multimodal_output", {})
         return self._multimodal_output
 

From 5f2849a97a1380abd1d9331d341c35fa76a53519 Mon Sep 17 00:00:00 2001
From: linyueqian <linyueqian@outlook.com>
Date: Wed, 4 Feb 2026 16:27:57 -0500
Subject: [PATCH 2/2] Fix client to handle binary audio response instead of
 assuming UTF-8

Signed-off-by: linyueqian <linyueqian@outlook.com>
---
 .../online_serving/qwen3_tts/openai_speech_client.py  | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/online_serving/qwen3_tts/openai_speech_client.py b/examples/online_serving/qwen3_tts/openai_speech_client.py
index 4d70460940c..ea45a74827a 100644
--- a/examples/online_serving/qwen3_tts/openai_speech_client.py
+++ b/examples/online_serving/qwen3_tts/openai_speech_client.py
@@ -111,9 +111,14 @@ def run_tts_generation(args) -> None:
         print(response.text)
         return
 
-    if response.content.decode("utf-8").startswith('{"error"'):
-        print(f"Error: {response.content.decode('utf-8')}")
-        return
+    # Check for JSON error response (only if content is valid UTF-8 text)
+    try:
+        text = response.content.decode("utf-8")
+        if text.startswith('{"error"'):
+            print(f"Error: {text}")
+            return
+    except UnicodeDecodeError:
+        pass  # Binary audio data, not an error
 
     # Save audio response
     output_path = args.output or "tts_output.wav"