Skip to content

Commit 353041b

Browse files
committed
working resampling mechanism
1 parent bba5ea7 commit 353041b

File tree

4 files changed

+135
-64
lines changed

4 files changed

+135
-64
lines changed

DEVELOPMENT.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,48 @@ Some ground rules:
126126
1. Do not build code to resample / adjust audio unless it is not covered already by `PcmData`
127127
2. Do not pass PCM as plain bytes around and write code that assumes specific sample rate or format. Use `PcmData` instead
128128

129+
## Example
130+
131+
```python
132+
import asyncio
133+
from vision_agents.core.edge.types import PcmData
134+
from openai import AsyncOpenAI
135+
136+
async def example():
137+
client = AsyncOpenAI(api_key="sk-42")
138+
139+
resp = await client.audio.speech.create(
140+
model="gpt-4o-mini-tts",
141+
voice="alloy",
142+
input="pcm is cool, give me some of that please",
143+
response_format="pcm",
144+
)
145+
146+
# load response into PcmData, note that you need to specify sample_rate, channels and format
147+
pcm_data = PcmData.from_bytes(
148+
resp.content, sample_rate=24_000, channels=1, format="s16"
149+
)
150+
151+
# check if pcm_data is stereo (it's not in this case ofc)
152+
print(pcm_data.stereo)
153+
154+
# write the pcm to file
155+
with open("test.wav", "wb") as f:
156+
f.write(pcm_data.to_wav_bytes())
157+
158+
# resample pcm to be 48khz stereo
159+
resampled_pcm = pcm_data.resample(48_000, 2)
160+
161+
# play-out pcm using ffplay
162+
from vision_agents.core.edge.types import play_pcm_with_ffplay
163+
164+
await play_pcm_with_ffplay(resampled_pcm)
165+
166+
if __name__ == "__main__":
167+
asyncio.run(example())
168+
```
169+
170+
129171
### Testing audio manually
130172

131173
Sometimes you need to test audio manually, here's some tips:

agents-core/vision_agents/core/edge/types.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
from numpy._typing import NDArray
1616
from pyee.asyncio import AsyncIOEventEmitter
1717
import av
18+
import asyncio
19+
import os
20+
import shutil
21+
import tempfile
22+
import time
1823

1924
logger = logging.getLogger(__name__)
2025

@@ -76,6 +81,10 @@ class PcmData(NamedTuple):
7681
time_base: Optional[float] = None # Time base for converting timestamps to seconds
7782
channels: int = 1 # Number of channels (1=mono, 2=stereo)
7883

84+
@property
85+
def stereo(self) -> bool:
86+
return self.channels == 2
87+
7988
@property
8089
def duration(self) -> float:
8190
"""
@@ -636,3 +645,60 @@ def _gen():
636645
raise TypeError(
637646
f"Unsupported response type for PcmData.from_response: {type(response)}"
638647
)
648+
649+
650+
async def play_pcm_with_ffplay(
651+
pcm: PcmData,
652+
outfile_path: Optional[str] = None,
653+
timeout_s: float = 30.0,
654+
) -> str:
655+
"""Write PcmData to a WAV file and optionally play it with ffplay.
656+
657+
This is a utility function for testing and debugging audio output.
658+
659+
Args:
660+
pcm: PcmData object to play
661+
outfile_path: Optional path for the WAV file. If None, creates a temp file.
662+
timeout_s: Timeout in seconds for ffplay playback (default: 30.0)
663+
664+
Returns:
665+
Path to the written WAV file
666+
667+
Example:
668+
pcm = PcmData.from_bytes(audio_bytes, sample_rate=48000, channels=2)
669+
wav_path = await play_pcm_with_ffplay(pcm)
670+
"""
671+
672+
# Generate output path if not provided
673+
if outfile_path is None:
674+
tmpdir = tempfile.gettempdir()
675+
timestamp = int(time.time())
676+
outfile_path = os.path.join(tmpdir, f"pcm_playback_{timestamp}.wav")
677+
678+
# Write WAV file
679+
with open(outfile_path, "wb") as f:
680+
f.write(pcm.to_wav_bytes())
681+
682+
logger.info(f"Wrote WAV file: {outfile_path}")
683+
684+
# Optional playback with ffplay
685+
if shutil.which("ffplay"):
686+
logger.info("Playing audio with ffplay...")
687+
proc = await asyncio.create_subprocess_exec(
688+
"ffplay",
689+
"-autoexit",
690+
"-nodisp",
691+
"-hide_banner",
692+
"-loglevel",
693+
"error",
694+
outfile_path,
695+
)
696+
try:
697+
await asyncio.wait_for(proc.wait(), timeout=timeout_s)
698+
except asyncio.TimeoutError:
699+
logger.warning(f"ffplay timed out after {timeout_s}s, killing process")
700+
proc.kill()
701+
else:
702+
logger.warning("ffplay not found in PATH, skipping playback")
703+
704+
return outfile_path
Lines changed: 11 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
import asyncio
21
import os
3-
import shutil
42
import tempfile
53
import time
4+
65
from typing import Optional
76

87
from vision_agents.core.tts import TTS
98
from vision_agents.core.tts.testing import TTSSession
10-
from vision_agents.core.edge.types import PcmData
9+
from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay
1110

1211

1312
async def manual_tts_to_wav(
@@ -18,7 +17,6 @@ async def manual_tts_to_wav(
1817
text: str = "This is a manual TTS playback test.",
1918
outfile_path: Optional[str] = None,
2019
timeout_s: float = 20.0,
21-
play_env: str = "FFPLAY",
2220
) -> str:
2321
"""Generate TTS audio to a WAV file and optionally play with ffplay.
2422
@@ -48,35 +46,19 @@ async def manual_tts_to_wav(
4846
if result.errors:
4947
raise RuntimeError(f"TTS errors: {result.errors}")
5048

51-
# Write WAV file (16kHz mono, s16)
52-
if outfile_path is None:
53-
tmpdir = tempfile.gettempdir()
54-
timestamp = int(time.time())
55-
outfile_path = os.path.join(
56-
tmpdir, f"tts_manual_test_{tts.__class__.__name__}_{timestamp}.wav"
57-
)
58-
49+
# Convert captured audio to PcmData
5950
pcm_bytes = b"".join(result.speeches)
6051
pcm = PcmData.from_bytes(
6152
pcm_bytes, sample_rate=sample_rate, channels=channels, format="s16"
6253
)
63-
with open(outfile_path, "wb") as f:
64-
f.write(pcm.to_wav_bytes())
6554

66-
# Optional playback
67-
if os.environ.get(play_env) == "1" and shutil.which("ffplay"):
68-
proc = await asyncio.create_subprocess_exec(
69-
"ffplay",
70-
"-autoexit",
71-
"-nodisp",
72-
"-hide_banner",
73-
"-loglevel",
74-
"error",
75-
outfile_path,
55+
# Generate a descriptive filename if not provided
56+
if outfile_path is None:
57+
tmpdir = tempfile.gettempdir()
58+
timestamp = int(time.time())
59+
outfile_path = os.path.join(
60+
tmpdir, f"tts_manual_test_{tts.__class__.__name__}_{timestamp}.wav"
7661
)
77-
try:
78-
await asyncio.wait_for(proc.wait(), timeout=30.0)
79-
except asyncio.TimeoutError:
80-
proc.kill()
8162

82-
return outfile_path
63+
# Use utility function to write WAV and optionally play
64+
return await play_pcm_with_ffplay(pcm, outfile_path=outfile_path, timeout_s=30.0)

agents-core/vision_agents/core/tts/tts.py

Lines changed: 16 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
tts_events_emitted,
2626
)
2727
from ..edge.types import PcmData
28-
import numpy as np
2928

3029
logger = logging.getLogger(__name__)
3130

@@ -61,17 +60,14 @@ def __init__(self, provider_name: Optional[str] = None):
6160
self.provider_name = provider_name or self.__class__.__name__
6261
self.events = EventManager()
6362
self.events.register_events_from_module(events, ignore_not_compatible=True)
63+
6464
# Desired output audio format (what downstream audio track expects)
65-
# Agent can override via set_output_format
6665
self._desired_sample_rate: int = 16000
6766
self._desired_channels: int = 1
6867
self._desired_format: AudioFormat = AudioFormat.PCM_S16
69-
# Native/provider audio format default (used only if plugin returns raw bytes)
70-
self._native_sample_rate: int = 16000
71-
self._native_channels: int = 1
72-
self._native_format: AudioFormat = AudioFormat.PCM_S16
68+
7369
# Persistent resampler to avoid discontinuities between chunks
74-
self._resampler = None
70+
self._resampler: Optional[av.AudioResampler] = None
7571
self._resampler_input_rate: Optional[int] = None
7672
self._resampler_input_channels: Optional[int] = None
7773

@@ -113,7 +109,11 @@ def _get_resampler(self, input_rate: int, input_channels: int):
113109
PyAV AudioResampler instance
114110
"""
115111

116-
if self._resampler is not None and self._resampler_input_rate == input_rate and self._resampler_input_channels == input_channels:
112+
if (
113+
self._resampler is not None
114+
and self._resampler_input_rate == input_rate
115+
and self._resampler_input_channels == input_channels
116+
):
117117
return self._resampler
118118

119119
in_layout = "mono" if input_channels == 1 else "stereo"
@@ -135,40 +135,21 @@ def _get_resampler(self, input_rate: int, input_channels: int):
135135

136136
return self._resampler
137137

138-
def _normalize_to_pcm(self, item: Union[bytes, bytearray, PcmData, Any]) -> PcmData:
139-
"""Normalize a chunk to PcmData using the native provider format."""
140-
if isinstance(item, PcmData):
141-
return item
142-
data = getattr(item, "data", item)
143-
if not isinstance(data, (bytes, bytearray, memoryview)):
144-
raise TypeError("Chunk is not bytes or PcmData")
145-
fmt = (
146-
self._native_format.value
147-
if hasattr(self._native_format, "value")
148-
else "s16"
149-
)
150-
return PcmData.from_bytes(
151-
bytes(data),
152-
sample_rate=self._native_sample_rate,
153-
channels=self._native_channels,
154-
format=fmt,
155-
)
156-
157138
async def _iter_pcm(self, resp: Any) -> AsyncGenerator[PcmData, None]:
158139
"""Yield PcmData chunks from a provider response of various shapes."""
159140
# Single buffer or PcmData
160-
if isinstance(resp, (bytes, bytearray, PcmData)):
161-
yield self._normalize_to_pcm(resp)
141+
if isinstance(resp, (PcmData,)):
142+
yield resp
162143
return
163144
# Async iterable
164145
if hasattr(resp, "__aiter__"):
165146
async for item in resp:
166-
yield self._normalize_to_pcm(item)
147+
yield item
167148
return
168-
# Sync iterable (avoid treating bytes-like as iterable of ints)
169-
if hasattr(resp, "__iter__") and not isinstance(resp, (str, bytes, bytearray)):
149+
# Sync iterable
150+
if hasattr(resp, "__iter__"):
170151
for item in resp:
171-
yield self._normalize_to_pcm(item)
152+
yield item
172153
return
173154
raise TypeError(f"Unsupported return type from stream_audio: {type(resp)}")
174155

@@ -297,9 +278,9 @@ async def send(
297278
chunk_index = 0
298279

299280
# Fast-path: single buffer -> mark final
300-
if isinstance(response, (bytes, bytearray, PcmData)):
281+
if isinstance(response, (PcmData,)):
301282
bytes_len, dur_ms = self._emit_chunk(
302-
self._normalize_to_pcm(response), 0, True, synthesis_id, text, user
283+
response, 0, True, synthesis_id, text, user
303284
)
304285
total_audio_bytes += bytes_len
305286
total_audio_ms += dur_ms

0 commit comments

Comments
 (0)