working resampling mechanism

tbarbugli · tbarbugli · commit 353041ba1ebd · 2025-10-24T22:32:48.000+02:00
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -126,6 +126,48 @@ Some ground rules:
 1. Do not build code to resample / adjust audio unless it is not covered already by `PcmData`
 2. Do not pass PCM as plain bytes around and write code that assumes specific sample rate or format. Use `PcmData` instead
 
+## Example
+
+```python
+import asyncio
+from vision_agents.core.edge.types import PcmData
+from openai import AsyncOpenAI
+
+async def example():
+    client = AsyncOpenAI(api_key="sk-42")
+
+    resp = await client.audio.speech.create(
+        model="gpt-4o-mini-tts",
+        voice="alloy",
+        input="pcm is cool, give me some of that please",
+        response_format="pcm",
+    )
+
+    # load the response into PcmData; note that you need to specify sample_rate, channels and format
+    pcm_data = PcmData.from_bytes(
+        resp.content, sample_rate=24_000, channels=1, format="s16"
+    )
+
+    # check whether pcm_data is stereo (it is not here: the audio was loaded with channels=1)
+    print(pcm_data.stereo)
+
+    # write the pcm to file
+    with open("test.wav", "wb") as f:
+        f.write(pcm_data.to_wav_bytes())
+
+    # resample the pcm to 48 kHz stereo
+    resampled_pcm = pcm_data.resample(48_000, 2)
+
+    # play-out pcm using ffplay
+    from vision_agents.core.edge.types import play_pcm_with_ffplay
+
+    await play_pcm_with_ffplay(resampled_pcm)
+
+if __name__ == "__main__":
+    asyncio.run(example())
+```
+
+
 ### Testing audio manually
 
 Sometimes you need to test audio manually, here's some tips:
diff --git a/agents-core/vision_agents/core/edge/types.py b/agents-core/vision_agents/core/edge/types.py
@@ -15,6 +15,11 @@
 from numpy._typing import NDArray
 from pyee.asyncio import AsyncIOEventEmitter
 import av
+import asyncio
+import os
+import shutil
+import tempfile
+import time
 
 logger = logging.getLogger(__name__)
 
@@ -76,6 +81,10 @@ class PcmData(NamedTuple):
     time_base: Optional[float] = None  # Time base for converting timestamps to seconds
     channels: int = 1  # Number of channels (1=mono, 2=stereo)
 
+    @property
+    def stereo(self) -> bool:
+        """Return True when this PcmData has exactly two channels."""
+        return self.channels == 2
+
     @property
     def duration(self) -> float:
         """
@@ -636,3 +645,60 @@ def _gen():
         raise TypeError(
             f"Unsupported response type for PcmData.from_response: {type(response)}"
         )
+
+
+async def play_pcm_with_ffplay(
+    pcm: PcmData,
+    outfile_path: Optional[str] = None,
+    timeout_s: float = 30.0,
+) -> str:
+    """Write PcmData to a WAV file and play it with ffplay when available.
+
+    This is a utility function for testing and debugging audio output.
+    The WAV file is always written. Playback is attempted only if an
+    ``ffplay`` executable is found on PATH; otherwise a warning is logged.
+
+    Args:
+        pcm: PcmData object to play
+        outfile_path: Optional path for the WAV file. If None, a timestamped
+            file is created in the system temp directory.
+        timeout_s: Timeout in seconds for ffplay playback (default: 30.0).
+            When it elapses, ffplay is killed and the path is still returned.
+
+    Returns:
+        Path to the written WAV file
+
+    Example:
+        pcm = PcmData.from_bytes(audio_bytes, sample_rate=48000, channels=2)
+        wav_path = await play_pcm_with_ffplay(pcm)
+    """
+
+    # Generate output path if not provided
+    if outfile_path is None:
+        tmpdir = tempfile.gettempdir()
+        timestamp = int(time.time())
+        outfile_path = os.path.join(tmpdir, f"pcm_playback_{timestamp}.wav")
+
+    # Write WAV file
+    with open(outfile_path, "wb") as f:
+        f.write(pcm.to_wav_bytes())
+
+    logger.info(f"Wrote WAV file: {outfile_path}")
+
+    # Optional playback with ffplay
+    if shutil.which("ffplay"):
+        logger.info("Playing audio with ffplay...")
+        proc = await asyncio.create_subprocess_exec(
+            "ffplay",
+            "-autoexit",
+            "-nodisp",
+            "-hide_banner",
+            "-loglevel",
+            "error",
+            outfile_path,
+        )
+        try:
+            await asyncio.wait_for(proc.wait(), timeout=timeout_s)
+        except asyncio.TimeoutError:
+            logger.warning(f"ffplay timed out after {timeout_s}s, killing process")
+            try:
+                proc.kill()
+            except ProcessLookupError:
+                # ffplay exited on its own between the timeout and the kill
+                pass
+            # Reap the child so it is not left behind as a zombie process
+            await proc.wait()
+    else:
+        logger.warning("ffplay not found in PATH, skipping playback")
+
+    return outfile_path
diff --git a/agents-core/vision_agents/core/tts/manual_test.py b/agents-core/vision_agents/core/tts/manual_test.py
@@ -1,13 +1,12 @@
-import asyncio
 import os
-import shutil
 import tempfile
 import time
+
 from typing import Optional
 
 from vision_agents.core.tts import TTS
 from vision_agents.core.tts.testing import TTSSession
-from vision_agents.core.edge.types import PcmData
+from vision_agents.core.edge.types import PcmData, play_pcm_with_ffplay
 
 
 async def manual_tts_to_wav(
@@ -18,7 +17,6 @@ async def manual_tts_to_wav(
     text: str = "This is a manual TTS playback test.",
     outfile_path: Optional[str] = None,
     timeout_s: float = 20.0,
-    play_env: str = "FFPLAY",
 ) -> str:
     """Generate TTS audio to a WAV file and optionally play with ffplay.
 
@@ -48,35 +46,19 @@ async def manual_tts_to_wav(
     if result.errors:
         raise RuntimeError(f"TTS errors: {result.errors}")
 
-    # Write WAV file (16kHz mono, s16)
-    if outfile_path is None:
-        tmpdir = tempfile.gettempdir()
-        timestamp = int(time.time())
-        outfile_path = os.path.join(
-            tmpdir, f"tts_manual_test_{tts.__class__.__name__}_{timestamp}.wav"
-        )
-
+    # Convert captured audio to PcmData
     pcm_bytes = b"".join(result.speeches)
     pcm = PcmData.from_bytes(
         pcm_bytes, sample_rate=sample_rate, channels=channels, format="s16"
     )
-    with open(outfile_path, "wb") as f:
-        f.write(pcm.to_wav_bytes())
 
-    # Optional playback
-    if os.environ.get(play_env) == "1" and shutil.which("ffplay"):
-        proc = await asyncio.create_subprocess_exec(
-            "ffplay",
-            "-autoexit",
-            "-nodisp",
-            "-hide_banner",
-            "-loglevel",
-            "error",
-            outfile_path,
+    # Generate a descriptive filename if not provided
+    if outfile_path is None:
+        tmpdir = tempfile.gettempdir()
+        timestamp = int(time.time())
+        outfile_path = os.path.join(
+            tmpdir, f"tts_manual_test_{tts.__class__.__name__}_{timestamp}.wav"
         )
-        try:
-            await asyncio.wait_for(proc.wait(), timeout=30.0)
-        except asyncio.TimeoutError:
-            proc.kill()
 
-    return outfile_path
+    # Use utility function to write WAV and optionally play
+    return await play_pcm_with_ffplay(pcm, outfile_path=outfile_path, timeout_s=30.0)
diff --git a/agents-core/vision_agents/core/tts/tts.py b/agents-core/vision_agents/core/tts/tts.py
@@ -25,7 +25,6 @@
     tts_events_emitted,
 )
 from ..edge.types import PcmData
-import numpy as np
 
 logger = logging.getLogger(__name__)
 
@@ -61,17 +60,14 @@ def __init__(self, provider_name: Optional[str] = None):
         self.provider_name = provider_name or self.__class__.__name__
         self.events = EventManager()
         self.events.register_events_from_module(events, ignore_not_compatible=True)
+
         # Desired output audio format (what downstream audio track expects)
-        # Agent can override via set_output_format
         self._desired_sample_rate: int = 16000
         self._desired_channels: int = 1
         self._desired_format: AudioFormat = AudioFormat.PCM_S16
-        # Native/provider audio format default (used only if plugin returns raw bytes)
-        self._native_sample_rate: int = 16000
-        self._native_channels: int = 1
-        self._native_format: AudioFormat = AudioFormat.PCM_S16
+
         # Persistent resampler to avoid discontinuities between chunks
-        self._resampler = None
+        self._resampler: Optional[av.AudioResampler] = None
         self._resampler_input_rate: Optional[int] = None
         self._resampler_input_channels: Optional[int] = None
 
@@ -113,7 +109,11 @@ def _get_resampler(self, input_rate: int, input_channels: int):
             PyAV AudioResampler instance
         """
 
-        if self._resampler is not None and self._resampler_input_rate == input_rate and self._resampler_input_channels == input_channels:
+        if (
+            self._resampler is not None
+            and self._resampler_input_rate == input_rate
+            and self._resampler_input_channels == input_channels
+        ):
             return self._resampler
 
         in_layout = "mono" if input_channels == 1 else "stereo"
@@ -135,40 +135,21 @@ def _get_resampler(self, input_rate: int, input_channels: int):
 
         return self._resampler
 
-    def _normalize_to_pcm(self, item: Union[bytes, bytearray, PcmData, Any]) -> PcmData:
-        """Normalize a chunk to PcmData using the native provider format."""
-        if isinstance(item, PcmData):
-            return item
-        data = getattr(item, "data", item)
-        if not isinstance(data, (bytes, bytearray, memoryview)):
-            raise TypeError("Chunk is not bytes or PcmData")
-        fmt = (
-            self._native_format.value
-            if hasattr(self._native_format, "value")
-            else "s16"
-        )
-        return PcmData.from_bytes(
-            bytes(data),
-            sample_rate=self._native_sample_rate,
-            channels=self._native_channels,
-            format=fmt,
-        )
-
     async def _iter_pcm(self, resp: Any) -> AsyncGenerator[PcmData, None]:
         """Yield PcmData chunks from a provider response of various shapes."""
         # Single buffer or PcmData
-        if isinstance(resp, (bytes, bytearray, PcmData)):
-            yield self._normalize_to_pcm(resp)
+        if isinstance(resp, (PcmData,)):
+            yield resp
             return
         # Async iterable
         if hasattr(resp, "__aiter__"):
             async for item in resp:
-                yield self._normalize_to_pcm(item)
+                yield item
             return
-        # Sync iterable (avoid treating bytes-like as iterable of ints)
-        if hasattr(resp, "__iter__") and not isinstance(resp, (str, bytes, bytearray)):
+        # Sync iterable
+        if hasattr(resp, "__iter__"):
             for item in resp:
-                yield self._normalize_to_pcm(item)
+                yield item
             return
         raise TypeError(f"Unsupported return type from stream_audio: {type(resp)}")
 
@@ -297,9 +278,9 @@ async def send(
             chunk_index = 0
 
             # Fast-path: single buffer -> mark final
-            if isinstance(response, (bytes, bytearray, PcmData)):
+            if isinstance(response, (PcmData,)):
                 bytes_len, dur_ms = self._emit_chunk(
-                    self._normalize_to_pcm(response), 0, True, synthesis_id, text, user
+                    response, 0, True, synthesis_id, text, user
                 )
                 total_audio_bytes += bytes_len
                 total_audio_ms += dur_ms