67 changes: 67 additions & 0 deletions DEVELOPMENT.md
@@ -109,6 +109,73 @@ To see how the agent works, open up agents.py
* The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
* The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent

## Audio management

Some important things about audio inside the library:

1. WebRTC uses Opus at 48 kHz stereo, but inside the library audio is always handled as PCM
2. Plugins / AI models work with different PCM formats, usually 16 kHz mono
3. PCM data is always passed around as a `PcmData` object, which carries the sample rate, channel count and sample format
4. Text-to-speech plugins automatically return PCM in the format needed by WebRTC. This is exposed via the `set_output_format` method
5. Audio resampling can be done using the `PcmData.resample` method
6. When resampling audio in chunks, it is important to reuse the same `av.AudioResampler` instance (see `PcmData.resample` and `core.tts.TTS`, plus the sketch after this list)
7. Converting between stereo and mono (in either direction) is also done via `PcmData.resample`
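
Point 6 deserves a short illustration. Below is a minimal sketch of chunked resampling with a single reused `av.AudioResampler` (PyAV); `resample_chunk` and its defaults are illustrative helpers, not library API, and `PcmData.resample` may wire this up differently internally:

```python
import av
import numpy as np

# One resampler for the whole stream: it keeps internal filter state
# across calls, so chunk boundaries do not click or drop samples.
resampler = av.AudioResampler(format="s16", layout="stereo", rate=48_000)

def resample_chunk(chunk: bytes, input_rate: int = 16_000) -> bytes:
    """Resample one chunk of mono s16 PCM to 48 kHz stereo s16 PCM."""
    samples = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
    frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono")
    frame.sample_rate = input_rate
    # PyAV >= 9 returns a list of frames; join them back into raw bytes.
    return b"".join(f.to_ndarray().tobytes() for f in resampler.resample(frame))
```

Creating a fresh `AudioResampler` per chunk would reset that filter state every time, which is exactly the artifact rule 6 warns about.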

Some ground rules:

1. Do not write new code to resample or adjust audio when `PcmData` already covers it
2. Do not pass PCM around as plain bytes or write code that assumes a specific sample rate or format. Use `PcmData` instead

## Example

```python
import asyncio

from openai import AsyncOpenAI

from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay


async def example():
    client = AsyncOpenAI(api_key="sk-42")

    resp = await client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input="pcm is cool, give me some of that please",
        response_format="pcm",
    )

    # Load the response into PcmData; note that you need to specify
    # sample_rate, channels and format yourself.
    pcm_data = PcmData.from_bytes(
        resp.content, sample_rate=24_000, channels=1, format="s16"
    )

    # Check whether pcm_data is stereo (it's not in this case, of course).
    print(pcm_data.stereo)

    # Write the PCM to a wav file.
    with open("test.wav", "wb") as f:
        f.write(pcm_data.to_wav_bytes())

    # Resample the PCM to 48 kHz stereo.
    resampled_pcm = pcm_data.resample(48_000, 2)

    # Play the PCM out loud using ffplay.
    await play_pcm_with_ffplay(resampled_pcm)


if __name__ == "__main__":
    asyncio.run(example())
```


### Testing audio manually

Sometimes you need to test audio manually; here are some tips:

1. Do not wear earbuds or headphones when testing PCM playback ;) misinterpreted PCM (wrong rate or format) can come out as very loud noise
2. You can use the `PcmData.to_wav_bytes` method to convert PCM into wav bytes (see `manual_tts_to_wav` for an example)
3. If you have `ffplay` installed, you can play back PCM directly to check that the audio is correct (see the sketch below)
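
The library ships `play_pcm_with_ffplay` for this (used in the example above); the sketch below shows what piping raw PCM into `ffplay` over stdin can look like. `play_raw_pcm` and its defaults are made up for illustration, and the flags must match your actual `PcmData`:

```python
import subprocess

def play_raw_pcm(pcm: bytes, rate: int = 48_000, channels: int = 2) -> None:
    """Pipe interleaved s16le PCM into ffplay (must be on your PATH)."""
    subprocess.run(
        [
            "ffplay",
            "-f", "s16le",         # raw signed 16-bit little-endian samples
            "-ar", str(rate),      # sample rate must match the data
            "-ac", str(channels),  # channel count must match too
            "-autoexit",           # exit when the input ends
            "-nodisp",             # audio only, no video window
            "-i", "pipe:0",        # read PCM from stdin
        ],
        input=pcm,
        check=True,
    )
```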

## Dev / Contributor Guidelines

### Light wrapping
Expand Down
29 changes: 17 additions & 12 deletions agents-core/vision_agents/core/agents/agents.py
@@ -5,7 +5,6 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from uuid import uuid4

-import aiortc
import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.rtc import Call
@@ -15,7 +14,7 @@
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
from ..edge import sfu_events
from ..edge.events import AudioReceivedEvent, TrackAddedEvent, CallEndedEvent
-from ..edge.types import Connection, Participant, PcmData, User
+from ..edge.types import Connection, Participant, PcmData, User, OutputAudioTrack
from ..events.manager import EventManager
from ..llm import events as llm_events
from ..llm.events import (
@@ -32,6 +31,7 @@
from ..stt.events import STTTranscriptEvent, STTErrorEvent
from ..stt.stt import STT
from ..tts.tts import TTS
+from ..tts.events import TTSAudioEvent
from ..turn_detection import TurnDetector, TurnStartedEvent, TurnEndedEvent
from ..vad import VAD
from ..vad.events import VADAudioEvent
@@ -160,7 +160,7 @@ def __init__(
        self._callback_executed = False
        self._track_tasks: Dict[str, asyncio.Task] = {}
        self._connection: Optional[Connection] = None
-       self._audio_track: Optional[aiortc.AudioStreamTrack] = None
+       self._audio_track: Optional[OutputAudioTrack] = None
        self._video_track: Optional[VideoStreamTrack] = None
        self._realtime_connection = None
        self._pc_track_handler_attached: bool = False
@@ -307,6 +307,11 @@ async def on_realtime_agent_speech_transcription(
                original=event,
            )

+       @self.events.subscribe
+       async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
+           if self._audio_track and event and event.audio_data is not None:
+               await self._audio_track.write(event.audio_data)
+
        @self.events.subscribe
        async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
            if self.realtime_mode or not self.llm:
@@ -1021,19 +1026,19 @@ def _prepare_rtc(self):
            self._audio_track = self.llm.output_track
            self.logger.info("🎵 Using Realtime provider output track for audio")
        else:
-           # TODO: what if we want to transform audio...
-           # Get the required framerate and stereo setting from TTS plugin, default to 48000 for WebRTC
-           if self.tts:
-               framerate = self.tts.get_required_framerate()
-               stereo = self.tts.get_required_stereo()
-           else:
-               framerate = 48000
-               stereo = True  # Default to stereo for WebRTC
+           # Default to WebRTC-friendly format unless configured differently
+           framerate = 48000
+           stereo = True
            self._audio_track = self.edge.create_audio_track(
                framerate=framerate, stereo=stereo
            )
+           # Inform TTS of desired output format so it can resample accordingly
            if self.tts:
-               self.tts.set_output_track(self._audio_track)
+               channels = 2 if stereo else 1
+               self.tts.set_output_format(
+                   sample_rate=framerate,
+                   channels=channels,
+               )

        # Set up video track if video publishers are available
        if self.publish_video:
11 changes: 6 additions & 5 deletions agents-core/vision_agents/core/edge/edge_transport.py
@@ -1,17 +1,17 @@
"""
Abstraction for stream vs other services here
"""

import abc

from typing import TYPE_CHECKING, Any, Optional

import aiortc
from pyee.asyncio import AsyncIOEventEmitter

-from vision_agents.core.edge.types import User
+from vision_agents.core.edge.types import User, OutputAudioTrack

if TYPE_CHECKING:

    pass


@@ -31,7 +31,7 @@ async def create_user(self, user: User):
        pass

    @abc.abstractmethod
-   def create_audio_track(self):
+   def create_audio_track(self) -> OutputAudioTrack:
        pass

    @abc.abstractmethod
@@ -55,6 +55,7 @@ async def create_conversation(self, call: Any, user: User, instructions):
        pass

    @abc.abstractmethod
-   def add_track_subscriber(self, track_id: str) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
+   def add_track_subscriber(
+       self, track_id: str
+   ) -> Optional[aiortc.mediastreams.MediaStreamTrack]:
        pass
