Skip to content

Commit f3aee97

Browse files
committed
Merge branch 'vogent-tommaso' of github.com:GetStream/Vision-Agents into vogent-tommaso
2 parents 23eee89 + d0e2234 commit f3aee97

File tree

23 files changed

+311
-241
lines changed

23 files changed

+311
-241
lines changed

DEVELOPMENT.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ Some ground rules:
130130

131131
```python
132132
import asyncio
133-
from vision_agents.core.edge.types import PcmData
133+
from getstream.video.rtc.track_util import PcmData
134134
from openai import AsyncOpenAI
135135

136136
async def example():
@@ -167,6 +167,12 @@ if __name__ == "__main__":
167167
asyncio.run(example())
168168
```
169169

170+
Other things that you get from the audio utilities:
171+
172+
1. Changing the PCM format
173+
2. Iterating over audio chunks (`PcmData.chunks`)
174+
3. Processing audio with pre/post buffers (`AudioSegmentCollector`)
175+
4. Accumulating audio (`PcmData.append`)
170176

171177
### Testing audio manually
172178

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
from dataclasses import dataclass, field
22

3-
from vision_agents.core.edge.types import PcmData
3+
from getstream.video.rtc.track_util import PcmData
44
from vision_agents.core.events import PluginBaseEvent
55
from typing import Optional, Any
66

77

88
@dataclass
99
class AudioReceivedEvent(PluginBaseEvent):
1010
"""Event emitted when audio is received from a participant."""
11-
type: str = field(default='plugin.edge.audio_received', init=False)
11+
12+
type: str = field(default="plugin.edge.audio_received", init=False)
1213
pcm_data: Optional[PcmData] = None
1314
participant: Optional[Any] = None
1415

1516

1617
@dataclass
1718
class TrackAddedEvent(PluginBaseEvent):
1819
"""Event emitted when a track is added to the call."""
19-
type: str = field(default='plugin.edge.track_added', init=False)
20+
21+
type: str = field(default="plugin.edge.track_added", init=False)
2022
track_id: Optional[str] = None
2123
track_type: Optional[int] = None
2224
user: Optional[Any] = None
@@ -25,7 +27,8 @@ class TrackAddedEvent(PluginBaseEvent):
2527
@dataclass
2628
class TrackRemovedEvent(PluginBaseEvent):
2729
"""Event emitted when a track is removed from the call."""
28-
type: str = field(default='plugin.edge.track_removed', init=False)
30+
31+
type: str = field(default="plugin.edge.track_removed", init=False)
2932
track_id: Optional[str] = None
3033
track_type: Optional[int] = None
3134
user: Optional[Any] = None
@@ -34,6 +37,7 @@ class TrackRemovedEvent(PluginBaseEvent):
3437
@dataclass
3538
class CallEndedEvent(PluginBaseEvent):
3639
"""Event emitted when a call ends."""
37-
type: str = field(default='plugin.edge.call_ended', init=False)
40+
41+
type: str = field(default="plugin.edge.call_ended", init=False)
3842
args: Optional[tuple] = None
3943
kwargs: Optional[dict] = None

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
)
77

88
from getstream.video.rtc.audio_track import AudioStreamTrack
9-
from vision_agents.core.edge.types import PcmData, Participant
9+
from getstream.video.rtc.track_util import PcmData
10+
from vision_agents.core.edge.types import Participant
1011

1112

1213
import abc
@@ -37,8 +38,9 @@ class Realtime(LLM, abc.ABC):
3738
- Transcript outgoing audio
3839
3940
"""
40-
fps : int = 1
41-
session_id : str # UUID to identify this session
41+
42+
fps: int = 1
43+
session_id: str # UUID to identify this session
4244

4345
def __init__(
4446
self,

agents-core/vision_agents/core/observability/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
stt_bytes_streamed,
1313
stt_errors,
1414
tts_latency_ms,
15-
tts_first_byte_ms,
16-
tts_bytes_streamed,
1715
tts_errors,
1816
tts_events_emitted,
1917
inflight_ops,
@@ -27,8 +25,6 @@
2725
"stt_bytes_streamed",
2826
"stt_errors",
2927
"tts_latency_ms",
30-
"tts_first_byte_ms",
31-
"tts_bytes_streamed",
3228
"tts_errors",
3329
"tts_events_emitted",
3430
"inflight_ops",

agents-core/vision_agents/core/observability/metrics.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,6 @@
6767
tts_latency_ms = meter.create_histogram(
6868
"tts.latency.ms", unit="ms", description="Total TTS latency"
6969
)
70-
tts_first_byte_ms = meter.create_histogram(
71-
"tts.first_byte.ms", unit="ms", description="TTS time to first audio byte"
72-
)
73-
tts_bytes_streamed = meter.create_counter(
74-
"tts.bytes.streamed", unit="By", description="Bytes sent/received for TTS"
75-
)
7670
tts_errors = meter.create_counter("tts.errors", description="TTS errors")
7771
tts_events_emitted = meter.create_counter(
7872
"tts.events.emitted", description="Number of TTS events emitted"

agents-core/vision_agents/core/tts/manual_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
from vision_agents.core.tts import TTS
88
from vision_agents.core.tts.testing import TTSSession
9-
from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay
9+
from getstream.video.rtc.track_util import PcmData, AudioFormat
10+
from vision_agents.core.edge.types import play_pcm_with_ffplay
1011

1112

1213
async def manual_tts_to_wav(
@@ -48,7 +49,7 @@ async def manual_tts_to_wav(
4849
# Convert captured audio to PcmData
4950
pcm_bytes = b"".join(result.speeches)
5051
pcm = PcmData.from_bytes(
51-
pcm_bytes, sample_rate=sample_rate, channels=channels, format="s16"
52+
pcm_bytes, sample_rate=sample_rate, channels=channels, format=AudioFormat.S16
5253
)
5354

5455
# Generate a descriptive filename if not provided

agents-core/vision_agents/core/tts/tts.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@
2020
)
2121
from ..observability import (
2222
tts_latency_ms,
23-
tts_bytes_streamed,
2423
tts_errors,
25-
tts_events_emitted,
2624
)
2725
from ..edge.types import PcmData
2826

@@ -180,10 +178,6 @@ def _emit_chunk(
180178
)
181179

182180
payload = pcm_out.to_bytes()
183-
# Metrics: counters per chunk
184-
attrs = {"tts_class": self.__class__.__name__}
185-
tts_bytes_streamed.add(len(payload), attributes=attrs)
186-
tts_events_emitted.add(1, attributes=attrs)
187181
self.events.send(
188182
TTSAudioEvent(
189183
session_id=self.session_id,
@@ -343,7 +337,6 @@ async def send(
343337
)
344338
raise
345339
finally:
346-
# Metrics: latency histogram for the entire send call
347340
elapsed_ms = (time.time() - start_time) * 1000.0
348341
tts_latency_ms.record(
349342
elapsed_ms, attributes={"tts_class": self.__class__.__name__}

conftest.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from dotenv import load_dotenv
1414
from torchvision.io.video import av
1515

16-
from vision_agents.core.edge.types import PcmData
16+
from getstream.video.rtc.track_util import PcmData, AudioFormat
1717
from vision_agents.core.stt.events import STTTranscriptEvent, STTErrorEvent
1818

1919
load_dotenv()
@@ -124,7 +124,7 @@ def mia_audio_16khz():
124124
container.close()
125125

126126
# Create PCM data
127-
pcm = PcmData(samples=samples, sample_rate=target_rate, format="s16")
127+
pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
128128

129129
return pcm
130130

@@ -167,7 +167,7 @@ def mia_audio_48khz():
167167
container.close()
168168

169169
# Create PCM data
170-
pcm = PcmData(samples=samples, sample_rate=target_rate, format="s16")
170+
pcm = PcmData(samples=samples, sample_rate=target_rate, format=AudioFormat.S16)
171171

172172
return pcm
173173

@@ -176,7 +176,7 @@ def mia_audio_48khz():
176176
def mia_audio_48khz_chunked():
177177
"""Load mia.mp3 and yield 48kHz PCM data in 20ms chunks."""
178178
audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
179-
179+
180180
# Load audio file using PyAV
181181
container = av.open(audio_file_path)
182182
audio_stream = container.streams.audio[0]
@@ -186,11 +186,7 @@ def mia_audio_48khz_chunked():
186186
# Create resampler if needed
187187
resampler = None
188188
if original_sample_rate != target_rate:
189-
resampler = av.AudioResampler(
190-
format='s16',
191-
layout='mono',
192-
rate=target_rate
193-
)
189+
resampler = av.AudioResampler(format="s16", layout="mono", rate=target_rate)
194190

195191
# Read all audio frames
196192
samples = []
@@ -219,16 +215,14 @@ def mia_audio_48khz_chunked():
219215
# Yield chunks of audio
220216
chunks = []
221217
for i in range(0, len(samples), chunk_size):
222-
chunk_samples = samples[i:i + chunk_size]
223-
218+
chunk_samples = samples[i : i + chunk_size]
219+
224220
# Create PCM data for this chunk
225221
pcm_chunk = PcmData(
226-
samples=chunk_samples,
227-
sample_rate=target_rate,
228-
format="s16"
222+
samples=chunk_samples, sample_rate=target_rate, format=AudioFormat.S16
229223
)
230224
chunks.append(pcm_chunk)
231-
225+
232226
return chunks
233227

234228

docs/ai/instructions/ai-plugin.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ llm = anthropic.LLM()
4848
When building the plugin read these guides:
4949

5050
- **TTS**: [ai-tts.md](ai-tts.md)
51-
- **STT**: [ai-stt.md](ai-stt.md)
51+
- **STT**: [ai-stt.md](ai-stt.md)
5252
- **STS/realtime/LLM**: [ai-llm.md](ai-llm.md) or [ai-realtime-llm.md](ai-realtime-llm.md)
5353

5454
## Update pyproject.toml
@@ -65,4 +65,8 @@ members = [
6565
"plugins/myplugin",
6666
# ... other plugins
6767
]
68-
```
68+
```
69+
70+
## PCM / Audio management
71+
72+
Use `PcmData` and the other utilities available from the `getstream.video.rtc.track_util` module. Do not write code that directly manipulates PCM; use the audio utilities instead.

docs/ai/instructions/ai-realtime-llm.md

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ class MyRealtime(realtime.Realtime):
1515
super().__init__()
1616
self.model = model
1717
self.client = client
18-
18+
1919
async def connect(self):
2020
# create the websocket or webrtc connection to the realtime LLM
2121
pass
22-
22+
2323
async def _handle_events(self):
2424
# handle the events from the connect method
25-
25+
2626
# when receiving audio do this
2727
audio_event = RealtimeAudioOutputEvent(
2828
plugin_name="gemini",
@@ -32,26 +32,26 @@ class MyRealtime(realtime.Realtime):
3232
self.events.send(audio_event)
3333

3434
await self.output_track.write(audio_content)
35-
35+
3636
# for transcriptions...
3737
# TODO document this
3838
pass
39-
39+
4040
async def close(self):
4141
pass
42-
42+
4343
# native method wrapped. wrap the native method, every llm has its own name for this
4444
# openai calls it create response, anthropic create message. so the name depends on your llm
4545
async def mynativemethod(self, *args, **kwargs):
46-
46+
4747
# some details to get right here...
4848
# ensure conversation history is maintained. typically by passing it ie:
4949
enhanced_instructions = self._build_enhanced_instructions()
5050
if enhanced_instructions:
5151
kwargs["system"] = [{"text": enhanced_instructions}]
52-
52+
5353
response_iterator = await self.client.mynativemethod(self, *args, **kwargs)
54-
54+
5555
# while receiving streaming do this
5656
total_text = ""
5757
for chunk in response_iterator:
@@ -64,7 +64,7 @@ class MyRealtime(realtime.Realtime):
6464
delta=chunk.text,
6565
))
6666
total_text += chunk.text
67-
67+
6868
llm_response = LLMResponseEvent(response_iterator, total_text)
6969
# and when completed
7070
self.events.send(LLMResponseCompletedEvent(
@@ -82,7 +82,7 @@ class MyRealtime(realtime.Realtime):
8282
# call the LLM with the given text
8383
# be sure to use the streaming version
8484
self.mynativemethod(...)
85-
85+
8686
async def simple_audio_response(self, pcm: PcmData):
8787
# respond to this audio
8888
pass
@@ -101,4 +101,8 @@ class MyRealtime(realtime.Realtime):
101101
If you need more examples look in
102102

103103
- gemini_llm.py
104-
- aws_llm.py (AWS Bedrock implementation)
104+
- aws_llm.py (AWS Bedrock implementation)
105+
106+
## PCM / Audio management
107+
108+
Use `PcmData` and the other utilities available from the `getstream.video.rtc.track_util` module. Do not write code that directly manipulates PCM; use the audio utilities instead.

0 commit comments

Comments
 (0)