|
7 | 7 |
|
8 | 8 | import av |
9 | 9 | import httpx |
10 | | -import numpy as np |
11 | 10 | from vision_agents.core import tts |
12 | 11 | from getstream.video.rtc.track_util import PcmData, AudioFormat |
13 | 12 |
|
@@ -97,58 +96,33 @@ async def _stream_audio() -> AsyncIterator[PcmData]: |
97 | 96 | continue |
98 | 97 |
|
99 | 98 | try: |
100 | | - # Parse JSON response |
101 | 99 | data = json.loads(line) |
102 | | - |
103 | | - # Check for errors |
104 | 100 | if "error" in data: |
105 | 101 | error_msg = data["error"].get("message", "Unknown error") |
106 | 102 | logger.error(f"Inworld AI API error: {error_msg}") |
107 | 103 | continue |
108 | 104 |
|
109 | | - # Extract audio content |
110 | 105 | if "result" in data and "audioContent" in data["result"]: |
111 | | - audio_content_b64 = data["result"]["audioContent"] |
112 | | - |
113 | | - # Decode base64 to get WAV bytes |
114 | | - wav_bytes = base64.b64decode(audio_content_b64) |
| 106 | + wav_bytes = base64.b64decode(data["result"]["audioContent"]) |
115 | 107 |
|
116 | 108 | # Decode WAV to PCM using PyAV |
117 | | - # Inworld returns WAV format, so we need to extract PCM |
118 | | - # Each chunk contains a complete WAV file with header |
119 | | - container = av.open(io.BytesIO(wav_bytes)) |
120 | | - audio_stream = container.streams.audio[0] |
121 | | - sample_rate = audio_stream.sample_rate |
122 | | - |
123 | | - # Read all frames and convert to PcmData |
124 | | - # Each WAV chunk may contain multiple frames |
125 | | - pcm_chunks = [] |
126 | | - for frame in container.decode(audio_stream): |
127 | | - # Use PcmData.from_av_frame for automatic format handling |
128 | | - pcm_chunk = PcmData.from_av_frame(frame) |
129 | | - pcm_chunks.append(pcm_chunk) |
130 | | - |
131 | | - container.close() |
132 | | - |
133 | | - if pcm_chunks: |
134 | | - # Concatenate frames from this WAV chunk |
135 | | - # Start with first chunk and append others |
136 | | - combined_pcm = pcm_chunks[0].copy() |
137 | | - for chunk in pcm_chunks[1:]: |
138 | | - combined_pcm.append(chunk) |
| 109 | + with av.open(io.BytesIO(wav_bytes)) as container: |
| 110 | + audio_stream = container.streams.audio[0] |
| 111 | + pcm = None |
139 | 112 |
|
140 | | - # Ensure mono and int16 format |
141 | | - if combined_pcm.stereo: |
142 | | - combined_pcm = combined_pcm.resample( |
143 | | - target_sample_rate=sample_rate, |
144 | | - target_channels=1 |
145 | | - ) |
146 | | - |
147 | | - # Ensure int16 format |
148 | | - if combined_pcm.format != AudioFormat.S16: |
149 | | - combined_pcm = combined_pcm.to_int16() |
| 113 | + for frame in container.decode(audio_stream): |
| 114 | + frame_pcm = PcmData.from_av_frame(frame) |
| 115 | + if pcm is None: |
| 116 | + pcm = frame_pcm |
| 117 | + else: |
| 118 | + pcm.append(frame_pcm) |
150 | 119 |
|
151 | | - yield combined_pcm |
| 120 | + if pcm: |
| 121 | + pcm = pcm.resample( |
| 122 | + target_sample_rate=pcm.sample_rate, |
| 123 | + target_channels=1 |
| 124 | + ).to_int16() |
| 125 | + yield pcm |
152 | 126 | except json.JSONDecodeError as e: |
153 | 127 | logger.warning(f"Failed to parse JSON line: {e}") |
154 | 128 | continue |
@@ -176,9 +150,6 @@ async def stop_audio(self) -> None: |
176 | 150 | """ |
177 | 151 | logger.info("🎤 Inworld AI TTS stop requested (no-op)") |
178 | 152 |
|
179 | | - async def __aenter__(self): |
180 | | - """Async context manager entry.""" |
181 | | - return self |
182 | 153 |
|
183 | 154 | async def __aexit__(self, exc_type, exc_val, exc_tb): |
184 | 155 | """Async context manager exit - close HTTP client if we created it.""" |
|
0 commit comments