Skip to content

Commit 73b34da

Browse files
committed
Clean up impl
1 parent c810967 commit 73b34da

File tree

1 file changed

+16
-45
lines changed
  • plugins/inworld/vision_agents/plugins/inworld

1 file changed

+16
-45
lines changed

plugins/inworld/vision_agents/plugins/inworld/tts.py

Lines changed: 16 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77

88
import av
99
import httpx
10-
import numpy as np
1110
from vision_agents.core import tts
1211
from getstream.video.rtc.track_util import PcmData, AudioFormat
1312

@@ -97,58 +96,33 @@ async def _stream_audio() -> AsyncIterator[PcmData]:
9796
continue
9897

9998
try:
100-
# Parse JSON response
10199
data = json.loads(line)
102-
103-
# Check for errors
104100
if "error" in data:
105101
error_msg = data["error"].get("message", "Unknown error")
106102
logger.error(f"Inworld AI API error: {error_msg}")
107103
continue
108104

109-
# Extract audio content
110105
if "result" in data and "audioContent" in data["result"]:
111-
audio_content_b64 = data["result"]["audioContent"]
112-
113-
# Decode base64 to get WAV bytes
114-
wav_bytes = base64.b64decode(audio_content_b64)
106+
wav_bytes = base64.b64decode(data["result"]["audioContent"])
115107

116108
# Decode WAV to PCM using PyAV
117-
# Inworld returns WAV format, so we need to extract PCM
118-
# Each chunk contains a complete WAV file with header
119-
container = av.open(io.BytesIO(wav_bytes))
120-
audio_stream = container.streams.audio[0]
121-
sample_rate = audio_stream.sample_rate
122-
123-
# Read all frames and convert to PcmData
124-
# Each WAV chunk may contain multiple frames
125-
pcm_chunks = []
126-
for frame in container.decode(audio_stream):
127-
# Use PcmData.from_av_frame for automatic format handling
128-
pcm_chunk = PcmData.from_av_frame(frame)
129-
pcm_chunks.append(pcm_chunk)
130-
131-
container.close()
132-
133-
if pcm_chunks:
134-
# Concatenate frames from this WAV chunk
135-
# Start with first chunk and append others
136-
combined_pcm = pcm_chunks[0].copy()
137-
for chunk in pcm_chunks[1:]:
138-
combined_pcm.append(chunk)
109+
with av.open(io.BytesIO(wav_bytes)) as container:
110+
audio_stream = container.streams.audio[0]
111+
pcm = None
139112

140-
# Ensure mono and int16 format
141-
if combined_pcm.stereo:
142-
combined_pcm = combined_pcm.resample(
143-
target_sample_rate=sample_rate,
144-
target_channels=1
145-
)
146-
147-
# Ensure int16 format
148-
if combined_pcm.format != AudioFormat.S16:
149-
combined_pcm = combined_pcm.to_int16()
113+
for frame in container.decode(audio_stream):
114+
frame_pcm = PcmData.from_av_frame(frame)
115+
if pcm is None:
116+
pcm = frame_pcm
117+
else:
118+
pcm.append(frame_pcm)
150119

151-
yield combined_pcm
120+
if pcm:
121+
pcm = pcm.resample(
122+
target_sample_rate=pcm.sample_rate,
123+
target_channels=1
124+
).to_int16()
125+
yield pcm
152126
except json.JSONDecodeError as e:
153127
logger.warning(f"Failed to parse JSON line: {e}")
154128
continue
@@ -176,9 +150,6 @@ async def stop_audio(self) -> None:
176150
"""
177151
logger.info("🎤 Inworld AI TTS stop requested (no-op)")
178152

179-
async def __aenter__(self):
180-
"""Async context manager entry."""
181-
return self
182153

183154
async def __aexit__(self, exc_type, exc_val, exc_tb):
184155
"""Async context manager exit - close HTTP client if we created it."""

0 commit comments

Comments
 (0)