67 changes: 67 additions & 0 deletions DEVELOPMENT.md
@@ -109,6 +109,73 @@ To see how the agent works, open up agents.py
* The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
* The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent

## Audio management

Some important things about audio inside the library:

1. WebRTC uses Opus at 48 kHz stereo, but inside the library audio is always handled as PCM
2. Plugins / AI models work with different PCM formats, usually 16 kHz mono
3. PCM data is always passed around as a `PcmData` object, which carries the sample rate, channel count and sample format
4. Text-to-speech plugins automatically return PCM in the format needed by WebRTC. This is exposed via the `set_output_format` method
5. Audio resampling can be done using the `PcmData.resample` method
6. When resampling audio in chunks, it is important to reuse the same `av.AudioResampler` instance (see `PcmData.resample` and `core.tts.TTS`, plus the sketch after this list)
7. Converting between stereo and mono (in either direction) is also done via `PcmData.resample`
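
Point 6 deserves a short illustration. Below is a minimal sketch of chunked resampling with a single reused `av.AudioResampler` (PyAV); `resample_chunk` and its defaults are illustrative helpers, not library API, and `PcmData.resample` may wire this up differently internally:

```python
import av
import numpy as np

# One resampler for the whole stream: it keeps internal filter state
# across calls, so chunk boundaries do not click or drop samples.
resampler = av.AudioResampler(format="s16", layout="stereo", rate=48_000)

def resample_chunk(chunk: bytes, input_rate: int = 16_000) -> bytes:
    """Resample one chunk of mono s16 PCM to 48 kHz stereo s16 PCM."""
    samples = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
    frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono")
    frame.sample_rate = input_rate
    # PyAV >= 9 returns a list of frames; join them back into raw bytes.
    return b"".join(f.to_ndarray().tobytes() for f in resampler.resample(frame))
```

Creating a fresh `AudioResampler` per chunk would reset that filter state every time, which is exactly the artifact rule 6 warns about.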

Some ground rules:

1. Do not write new code to resample or adjust audio when `PcmData` already covers it
2. Do not pass PCM around as plain bytes or write code that assumes a specific sample rate or format. Use `PcmData` instead

## Example

```python
import asyncio

from openai import AsyncOpenAI

from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay


async def example():
    client = AsyncOpenAI(api_key="sk-42")

    resp = await client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input="pcm is cool, give me some of that please",
        response_format="pcm",
    )

    # Load the response into PcmData; note that you need to specify
    # sample_rate, channels and format yourself.
    pcm_data = PcmData.from_bytes(
        resp.content, sample_rate=24_000, channels=1, format="s16"
    )

    # Check whether pcm_data is stereo (it's not in this case, of course).
    print(pcm_data.stereo)

    # Write the PCM to a wav file.
    with open("test.wav", "wb") as f:
        f.write(pcm_data.to_wav_bytes())

    # Resample the PCM to 48 kHz stereo.
    resampled_pcm = pcm_data.resample(48_000, 2)

    # Play the PCM out loud using ffplay.
    await play_pcm_with_ffplay(resampled_pcm)


if __name__ == "__main__":
    asyncio.run(example())
```


### Testing audio manually

Sometimes you need to test audio manually; here are some tips:

1. Do not wear earbuds or headphones when testing PCM playback ;) misinterpreted PCM (wrong rate or format) can come out as very loud noise
2. You can use the `PcmData.to_wav_bytes` method to convert PCM into wav bytes (see `manual_tts_to_wav` for an example)
3. If you have `ffplay` installed, you can play back PCM directly to check that the audio is correct (see the sketch below)
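
The library ships `play_pcm_with_ffplay` for this (used in the example above); the sketch below shows what piping raw PCM into `ffplay` over stdin can look like. `play_raw_pcm` and its defaults are made up for illustration, and the flags must match your actual `PcmData`:

```python
import subprocess

def play_raw_pcm(pcm: bytes, rate: int = 48_000, channels: int = 2) -> None:
    """Pipe interleaved s16le PCM into ffplay (must be on your PATH)."""
    subprocess.run(
        [
            "ffplay",
            "-f", "s16le",         # raw signed 16-bit little-endian samples
            "-ar", str(rate),      # sample rate must match the data
            "-ac", str(channels),  # channel count must match too
            "-autoexit",           # exit when the input ends
            "-nodisp",             # audio only, no video window
            "-i", "pipe:0",        # read PCM from stdin
        ],
        input=pcm,
        check=True,
    )
```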

## Dev / Contributor Guidelines

### Light wrapping
Expand Down
29 changes: 17 additions & 12 deletions agents-core/vision_agents/core/agents/agents.py
@@ -5,7 +5,6 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from uuid import uuid4

-import aiortc
import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.rtc import Call
@@ -15,7 +14,7 @@
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
from ..edge import sfu_events
from ..edge.events import AudioReceivedEvent, TrackAddedEvent, CallEndedEvent
-from ..edge.types import Connection, Participant, PcmData, User
+from ..edge.types import Connection, Participant, PcmData, User, OutputAudioTrack
from ..events.manager import EventManager
from ..llm import events as llm_events
from ..llm.events import (
@@ -32,6 +31,7 @@
from ..stt.events import STTTranscriptEvent, STTErrorEvent
from ..stt.stt import STT
from ..tts.tts import TTS
+from ..tts.events import TTSAudioEvent
from ..turn_detection import TurnDetector, TurnStartedEvent, TurnEndedEvent
from ..vad import VAD
from ..vad.events import VADAudioEvent
@@ -160,7 +160,7 @@ def __init__(
        self._callback_executed = False
        self._track_tasks: Dict[str, asyncio.Task] = {}
        self._connection: Optional[Connection] = None
-       self._audio_track: Optional[aiortc.AudioStreamTrack] = None
+       self._audio_track: Optional[OutputAudioTrack] = None
        self._video_track: Optional[VideoStreamTrack] = None
        self._realtime_connection = None
        self._pc_track_handler_attached: bool = False
@@ -307,6 +307,11 @@ async def on_realtime_agent_speech_transcription(
                original=event,
            )

+       @self.events.subscribe
+       async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
+           if self._audio_track and event and event.audio_data is not None:
+               await self._audio_track.write(event.audio_data)
+
        @self.events.subscribe
        async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
            if self.realtime_mode or not self.llm:
@@ -1021,19 +1026,19 @@ def _prepare_rtc(self):
            self._audio_track = self.llm.output_track
            self.logger.info("🎵 Using Realtime provider output track for audio")
        else:
-           # TODO: what if we want to transform audio...
-           # Get the required framerate and stereo setting from TTS plugin, default to 48000 for WebRTC
-           if self.tts:
-               framerate = self.tts.get_required_framerate()
-               stereo = self.tts.get_required_stereo()
-           else:
-               framerate = 48000
-               stereo = True  # Default to stereo for WebRTC
+           # Default to WebRTC-friendly format unless configured differently
+           framerate = 48000
+           stereo = True
            self._audio_track = self.edge.create_audio_track(
                framerate=framerate, stereo=stereo
            )
+           # Inform TTS of desired output format so it can resample accordingly
            if self.tts:
-               self.tts.set_output_track(self._audio_track)
+               channels = 2 if stereo else 1
+               self.tts.set_output_format(
+                   sample_rate=framerate,
+                   channels=channels,
+               )

        # Set up video track if video publishers are available
        if self.publish_video:
11 changes: 6 additions & 5 deletions agents-core/vision_agents/core/edge/edge_transport.py
@@ -1,17 +1,17 @@
"""
Abstraction for stream vs other services here
"""

import abc

from typing import TYPE_CHECKING, Any, Optional

import aiortc
from pyee.asyncio import AsyncIOEventEmitter

-from vision_agents.core.edge.types import User
+from vision_agents.core.edge.types import User, OutputAudioTrack

if TYPE_CHECKING:

    pass


@@ -31,7 +31,7 @@ async def create_user(self, user: User):
        pass

    @abc.abstractmethod
-   def create_audio_track(self):
+   def create_audio_track(self) -> OutputAudioTrack:
        pass

    @abc.abstractmethod
@@ -55,6 +55,7 @@ async def create_conversation(self, call: Any, user: User, instructions):
        pass

    @abc.abstractmethod
-   def add_track_subscriber(self, track_id: str) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
+   def add_track_subscriber(
+       self, track_id: str
+   ) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
        pass
