Skip to content

Commit f3aee97

Browse files
committed
Merge branch 'vogent-tommaso' of github.com:GetStream/Vision-Agents into vogent-tommaso
2 parents 23eee89 + d0e2234 commit f3aee97

File tree

23 files changed

+311
-241
lines changed

23 files changed

+311
-241
lines changed

DEVELOPMENT.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ Some ground rules:
130130

131131
```python
132132
import asyncio
133-
from vision_agents.core.edge.types import PcmData
133+
from getstream.video.rtc.track_util import PcmData
134134
from openai import AsyncOpenAI
135135

136136
async def example():
@@ -167,6 +167,12 @@ if __name__ == "__main__":
167167
asyncio.run(example())
168168
```
169169

170+
Other things that you get from the audio utilities:
171+
172+
1. Changing the PCM format
173+
2. Iterating over audio chunks (`PcmData.chunks`)
174+
3. Processing audio with pre/post buffers (`AudioSegmentCollector`)
175+
4. Accumulating audio (`PcmData.append`)
170176

171177
### Testing audio manually
172178

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
from dataclasses import dataclass, field
22

3-
from vision_agents.core.edge.types import PcmData
3+
from getstream.video.rtc.track_util import PcmData
44
from vision_agents.core.events import PluginBaseEvent
55
from typing import Optional, Any
66

77

88
@dataclass
99
class AudioReceivedEvent(PluginBaseEvent):
1010
"""Event emitted when audio is received from a participant."""
11-
type: str = field(default='plugin.edge.audio_received', init=False)
11+
12+
type: str = field(default="plugin.edge.audio_received", init=False)
1213
pcm_data: Optional[PcmData] = None
1314
participant: Optional[Any] = None
1415

1516

1617
@dataclass
1718
class TrackAddedEvent(PluginBaseEvent):
1819
"""Event emitted when a track is added to the call."""
19-
type: str = field(default='plugin.edge.track_added', init=False)
20+
21+
type: str = field(default="plugin.edge.track_added", init=False)
2022
track_id: Optional[str] = None
2123
track_type: Optional[int] = None
2224
user: Optional[Any] = None
@@ -25,7 +27,8 @@ class TrackAddedEvent(PluginBaseEvent):
2527
@dataclass
2628
class TrackRemovedEvent(PluginBaseEvent):
2729
"""Event emitted when a track is removed from the call."""
28-
type: str = field(default='plugin.edge.track_removed', init=False)
30+
31+
type: str = field(default="plugin.edge.track_removed", init=False)
2932
track_id: Optional[str] = None
3033
track_type: Optional[int] = None
3134
user: Optional[Any] = None
@@ -34,6 +37,7 @@ class TrackRemovedEvent(PluginBaseEvent):
3437
@dataclass
3538
class CallEndedEvent(PluginBaseEvent):
3639
"""Event emitted when a call ends."""
37-
type: str = field(default='plugin.edge.call_ended', init=False)
40+
41+
type: str = field(default="plugin.edge.call_ended", init=False)
3842
args: Optional[tuple] = None
3943
kwargs: Optional[dict] = None

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
)
77

88
from getstream.video.rtc.audio_track import AudioStreamTrack
9-
from vision_agents.core.edge.types import PcmData, Participant
9+
from getstream.video.rtc.track_util import PcmData
10+
from vision_agents.core.edge.types import Participant
1011

1112

1213
import abc
@@ -37,8 +38,9 @@ class Realtime(LLM, abc.ABC):
3738
- Transcript outgoing audio
3839
3940
"""
40-
fps : int = 1
41-
session_id : str # UUID to identify this session
41+
42+
fps: int = 1
43+
session_id: str # UUID to identify this session
4244

4345
def __init__(
4446
self,

agents-core/vision_agents/core/observability/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
stt_bytes_streamed,
1313
stt_errors,
1414
tts_latency_ms,
15-
tts_first_byte_ms,
16-
tts_bytes_streamed,
1715
tts_errors,
1816
tts_events_emitted,
1917
inflight_ops,
@@ -27,8 +25,6 @@
2725
"stt_bytes_streamed",
2826
"stt_errors",
2927
"tts_latency_ms",
30-
"tts_first_byte_ms",
31-
"tts_bytes_streamed",
3228
"tts_errors",
3329
"tts_events_emitted",
3430
"inflight_ops",

agents-core/vision_agents/core/observability/metrics.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,6 @@
6767
tts_latency_ms = meter.create_histogram(
6868
"tts.latency.ms", unit="ms", description="Total TTS latency"
6969
)
70-
tts_first_byte_ms = meter.create_histogram(
71-
"tts.first_byte.ms", unit="ms", description="TTS time to first audio byte"
72-
)
73-
tts_bytes_streamed = meter.create_counter(
74-
"tts.bytes.streamed", unit="By", description="Bytes sent/received for TTS"
75-
)
7670
tts_errors = meter.create_counter("tts.errors", description="TTS errors")
7771
tts_events_emitted = meter.create_counter(
7872
"tts.events.emitted", description="Number of TTS events emitted"

agents-core/vision_agents/core/tts/manual_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
from vision_agents.core.tts import TTS
88
from vision_agents.core.tts.testing import TTSSession
9-
from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay
9+
from getstream.video.rtc.track_util import PcmData, AudioFormat
10+
from vision_agents.core.edge.types import play_pcm_with_ffplay
1011

1112

1213
async def manual_tts_to_wav(
@@ -48,7 +49,7 @@ async def manual_tts_to_wav(
4849
# Convert captured audio to PcmData
4950
pcm_bytes = b"".join(result.speeches)
5051
pcm = PcmData.from_bytes(
51-
pcm_bytes, sample_rate=sample_rate, channels=channels, format="s16"
52+
pcm_bytes, sample_rate=sample_rate, channels=channels, format=AudioFormat.S16
5253
)
5354

5455
# Generate a descriptive filename if not provided

agents-core/vision_agents/core/tts/tts.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@
2020
)
2121
from ..observability import (
2222
tts_latency_ms,
23-
tts_bytes_streamed,
2423
tts_errors,
25-
tts_events_emitted,
2624
)
2725
from ..edge.types import PcmData
2826

@@ -180,10 +178,6 @@ def _emit_chunk(
180178
)
181179

182180
payload = pcm_out.to_bytes()
183-
# Metrics: counters per chunk
184-
attrs = {"tts_class": self.__class__.__name__}
185-
tts_bytes_streamed.add(len(payload), attributes=attrs)
186-
tts_events_emitted.add(1, attributes=attrs)
187181
self.events.send(
188182
TTSAudioEvent(
189183
session_id=self.session_id,
@@ -343,7 +337,6 @@ async def send(
343337
)
344338
raise
345339
finally:
346-
# Metrics: latency histogram for the entire send call
347340
elapsed_ms = (time.time() - start_time) * 1000.0
348341
tts_latency_ms.record(
349342
elapsed_ms, attributes={"tts_class": self.__class__.__name__}

conftest.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from dotenv import load_dotenv
1414
from torchvision.io.video import av
1515

16-
from vision_agents.core.edge.types import PcmData
16+
from getstream.video.rtc.track_util import PcmData, AudioFormat
1717
from vision_agents.core.stt.events import STTTranscriptEvent, STTErrorEvent
1818

1919
load_dotenv()
@@ -124,7 +124,7 @@ def mia_audio_16khz():
124124
container.close()
125125

126126
# Create PCM data
127-
pcm = PcmData(samples=samples, sample_rate=target_rate, format="s16")
127+
pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
128128

129129
return pcm
130130

@@ -167,7 +167,7 @@ def mia_audio_48khz():
167167
container.close()
168168

169169
# Create PCM data
170-
pcm = PcmData(samples=samples, sample_rate=target_rate, format="s16")
170+
pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
171171

172172
return pcm
173173

@@ -176,7 +176,7 @@ def mia_audio_48khz():
176176
def mia_audio_48khz_chunked():
177177
"""Load mia.mp3 and yield 48kHz PCM data in 20ms chunks."""
178178
audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
179-
179+
180180
# Load audio file using PyAV
181181
container = av.open(audio_file_path)
182182
audio_stream = container.streams.audio[0]
@@ -186,11 +186,7 @@ def mia_audio_48khz_chunked():
186186
# Create resampler if needed
187187
resampler = None
188188
if original_sample_rate != target_rate:
189-
resampler = av.AudioResampler(
190-
format='s16',
191-
layout='mono',
192-
rate=target_rate
193-
)
189+
resampler = av.AudioResampler(format="s16", layout="mono", rate=target_rate)
194190

195191
# Read all audio frames
196192
samples = []
@@ -219,16 +215,14 @@ def mia_audio_48khz_chunked():
219215
# Yield chunks of audio
220216
chunks = []
221217
for i in range(0, len(samples), chunk_size):
222-
chunk_samples = samples[i:i + chunk_size]
223-
218+
chunk_samples = samples[i : i + chunk_size]
219+
224220
# Create PCM data for this chunk
225221
pcm_chunk = PcmData(
226-
samples=chunk_samples,
227-
sample_rate=target_rate,
228-
format="s16"
222+
samples=chunk_samples, sample_rate=target_rate, format=AudioFormat.S16
229223
)
230224
chunks.append(pcm_chunk)
231-
225+
232226
return chunks
233227

234228

docs/ai/instructions/ai-plugin.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ llm = anthropic.LLM()
4848
When building the plugin read these guides:
4949

5050
- **TTS**: [ai-tts.md](ai-tts.md)
51-
- **STT**: [ai-stt.md](ai-stt.md)
51+
- **STT**: [ai-stt.md](ai-stt.md)
5252
- **STS/realtime/LLM**: [ai-llm.md](ai-llm.md) or [ai-realtime-llm.md](ai-realtime-llm.md)
5353

5454
## Update pyproject.toml
@@ -65,4 +65,8 @@ members = [
6565
"plugins/myplugin",
6666
# ... other plugins
6767
]
68-
```
68+
```
69+
70+
## PCM / Audio management
71+
72+
Use `PcmData` and the other utilities available from the `getstream.video.rtc.track_util` module. Do not write code that directly manipulates PCM; use the audio utilities instead.

docs/ai/instructions/ai-realtime-llm.md

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ class MyRealtime(realtime.Realtime):
1515
super().__init__()
1616
self.model = model
1717
self.client = client
18-
18+
1919
async def connect(self):
2020
# create the websocket or webrtc connection to the realtime LLM
2121
pass
22-
22+
2323
async def _handle_events(self):
2424
# handle the events from the connect method
25-
25+
2626
# when receiving audio do this
2727
audio_event = RealtimeAudioOutputEvent(
2828
plugin_name="gemini",
@@ -32,26 +32,26 @@ class MyRealtime(realtime.Realtime):
3232
self.events.send(audio_event)
3333

3434
await self.output_track.write(audio_content)
35-
35+
3636
# for transcriptions...
3737
# TODO document this
3838
pass
39-
39+
4040
async def close(self):
4141
pass
42-
42+
4343
# native method wrapped. wrap the native method, every llm has its own name for this
4444
# openai calls it create response, anthropic create message. so the name depends on your llm
4545
async def mynativemethod(self, *args, **kwargs):
46-
46+
4747
# some details to get right here...
4848
# ensure conversation history is maintained. typically by passing it ie:
4949
enhanced_instructions = self._build_enhanced_instructions()
5050
if enhanced_instructions:
5151
kwargs["system"] = [{"text": enhanced_instructions}]
52-
52+
5353
response_iterator = await self.client.mynativemethod(self, *args, **kwargs)
54-
54+
5555
# while receiving streaming do this
5656
total_text = ""
5757
for chunk in response_iterator:
@@ -64,7 +64,7 @@ class MyRealtime(realtime.Realtime):
6464
delta=chunk.text,
6565
))
6666
total_text += chunk.text
67-
67+
6868
llm_response = LLMResponseEvent(response_iterator, total_text)
6969
# and when completed
7070
self.events.send(LLMResponseCompletedEvent(
@@ -82,7 +82,7 @@ class MyRealtime(realtime.Realtime):
8282
# call the LLM with the given text
8383
# be sure to use the streaming version
8484
self.mynativemethod(...)
85-
85+
8686
async def simple_audio_response(self, pcm: PcmData):
8787
# respond to this audio
8888
pass
@@ -101,4 +101,8 @@ class MyRealtime(realtime.Realtime):
101101
If you need more examples look in
102102

103103
- gemini_llm.py
104-
- aws_llm.py (AWS Bedrock implementation)
104+
- aws_llm.py (AWS Bedrock implementation)
105+
106+
## PCM / Audio management
107+
108+
Use `PcmData` and the other utilities available from the `getstream.video.rtc.track_util` module. Do not write code that directly manipulates PCM; use the audio utilities instead.

0 commit comments

Comments
 (0)