vllm-project · hsliuustc0106 · Feb 7, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 5, 2026
@@ -111,9 +111,14 @@ def run_tts_generation(args) -> None:
         print(response.text)
         return
 
-    if response.content.decode("utf-8").startswith('{"error"'):
-        print(f"Error: {response.content.decode('utf-8')}")
-        return
+    # Check for JSON error response (only if content is valid UTF-8 text)
+    try:
+        text = response.content.decode("utf-8")
+        if text.startswith('{"error"'):
+            print(f"Error: {text}")
+            return
+    except UnicodeDecodeError:
+        pass  # Binary audio data, not an error
 
     # Save audio response
     output_path = args.output or "tts_output.wav"

@@ -124,10 +124,16 @@ def from_diffusion(
     def multimodal_output(self) -> dict[str, Any]:
         """Return multimodal output from the underlying request output or local field.
 
-        For pipeline outputs, this proxies to request_output.multimodal_output.
+        For pipeline outputs, this checks completion outputs first, then falls back to request_output.
         For diffusion outputs, this returns the local _multimodal_output field.
         """
         if self.request_output is not None:
+            # Prefer the first completion output that carries a truthy multimodal_output.
+            if self.request_output.outputs:
+                for output in self.request_output.outputs:
+                    mm = getattr(output, "multimodal_output", None)
+                    if mm:  # None and empty values fall through to the next output
+                        return mm
             return getattr(self.request_output, "multimodal_output", {})
         return self._multimodal_output