Skip to content

Commit 61a26cf

Browse files
committed
attempt at fish
1 parent c954409 commit 61a26cf

File tree

15 files changed

+591
-0
lines changed

15 files changed

+591
-0
lines changed

plugins/fish/README.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Fish Audio Text-to-Speech Plugin
2+
3+
A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Fish Audio API.
4+
5+
## Installation
6+
7+
```bash
8+
pip install vision-agents-plugins-fish
9+
```
10+
11+
## Usage
12+
13+
```python
14+
from vision_agents.plugins.fish import TTS
15+
from getstream.video.rtc.audio_track import AudioStreamTrack
16+
17+
# Initialize with API key from environment variable
18+
tts = TTS()
19+
20+
# Or specify API key directly
21+
tts = TTS(api_key="your_fish_audio_api_key")
22+
23+
# Create an audio track to output speech
24+
track = AudioStreamTrack(framerate=16000)
25+
tts.set_output_track(track)
26+
27+
# Register event handlers
28+
@tts.on("audio")
29+
def on_audio(audio_data, user):
30+
print(f"Received audio chunk: {len(audio_data)} bytes")
31+
32+
# Send text to be converted to speech (`await` must run inside an async function)
33+
await tts.send("Hello, this is a test of the Fish Audio text-to-speech plugin.")
34+
```
35+
36+
## Configuration Options
37+
38+
- `api_key`: Fish Audio API key (default: reads from FISH_AUDIO_API_KEY environment variable)
39+
- `reference_id`: Optional reference voice ID to use for synthesis
40+
- `base_url`: Optional custom API endpoint (default: uses Fish Audio's default endpoint)
41+
42+
## Reference Audio
43+
44+
Fish Audio supports using reference audio for voice cloning:
45+
46+
```python
47+
from vision_agents.plugins.fish import TTS
48+
49+
# Using a reference voice ID
50+
tts = TTS(reference_id="your_reference_voice_id")
51+
52+
# Or pass reference audio dynamically when sending text
53+
# (See Fish Audio SDK documentation for advanced usage)
54+
```
55+
56+
## Requirements
57+
58+
- Python 3.10+
59+
- fish-audio-sdk>=2025.4.2
60+

plugins/fish/example/README.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Fish Audio TTS Examples
2+
3+
This directory contains examples demonstrating how to use the Fish Audio TTS plugin with Vision Agents.
4+
5+
## Examples
6+
7+
### 1. Simple TTS Example (`simple_tts_example.py`)
8+
Basic usage of Fish Audio TTS without a full agent setup. Perfect for testing or simple integrations.
9+
10+
### 2. Full Agent Example (`fish_tts_example.py`)
11+
Complete agent setup with Fish Audio TTS, Deepgram STT, and real-time communication.
12+
13+
## Setup
14+
15+
1. Install dependencies:
16+
```bash
17+
cd plugins/fish/example
18+
uv sync
19+
```
20+
21+
2. Create a `.env` file with your API keys:
22+
```bash
23+
# Required for Fish Audio TTS
24+
FISH_AUDIO_API_KEY=your_fish_audio_api_key
25+
26+
# Required for full agent example only (the Gemini LLM it uses also needs its own API key; see the gemini plugin docs):
27+
DEEPGRAM_API_KEY=your_deepgram_api_key
28+
STREAM_API_KEY=your_stream_api_key
29+
STREAM_API_SECRET=your_stream_api_secret
30+
```
31+
32+
## Running the Examples
33+
34+
### Simple TTS Example
35+
```bash
36+
uv run simple_tts_example.py
37+
```
38+
39+
### Full Agent Example
40+
```bash
41+
uv run fish_tts_example.py
42+
```
43+
44+
## What it does
45+
46+
The full agent example (`fish_tts_example.py`) creates an AI agent that:
47+
- Uses **Fish Audio** for high-quality text-to-speech synthesis
48+
- Uses **Deepgram** for speech-to-text transcription
49+
- Uses **GetStream** for real-time communication
50+
- Uses **Smart Turn** detection for natural conversation flow
- Uses **Gemini** (`gemini-2.0-flash`) as the LLM
51+
52+
The agent will greet you using Fish Audio's TTS and be ready to have a conversation.
53+
54+
## Customization
55+
56+
You can customize the Fish Audio TTS settings:
57+
58+
```python
59+
# Use a specific reference voice
60+
tts = fish.TTS(reference_id="your_reference_voice_id")
61+
62+
# Or use a custom endpoint
63+
tts = fish.TTS(base_url="https://your-custom-endpoint.com")
64+
```
65+
66+
## Additional Resources
67+
68+
- [Fish Audio Documentation](https://docs.fish.audio)
69+
- [Vision Agents Documentation](https://visionagents.ai)
70+

plugins/fish/example/__init__.py

Whitespace-only changes.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import asyncio
2+
import logging
3+
from uuid import uuid4
4+
5+
from dotenv import load_dotenv
6+
7+
from vision_agents.core import User
8+
from vision_agents.core.agents import Agent
9+
from vision_agents.plugins import fish, getstream, deepgram, smart_turn, gemini
10+
11+
load_dotenv()
12+
13+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s [call_id=%(call_id)s] %(name)s: %(message)s")
14+
logger = logging.getLogger(__name__)
15+
16+
17+
async def start_agent() -> None:
    """
    Run a demo voice agent that speaks through Fish Audio TTS.

    Components wired into the agent:
    - Fish Audio: text-to-speech (TTS)
    - Deepgram: speech-to-text (STT)
    - Gemini ("gemini-2.0-flash"): the LLM
    - GetStream: edge/real-time communication
    - Smart Turn: turn detection

    Requirements:
    - FISH_AUDIO_API_KEY environment variable
    - DEEPGRAM_API_KEY environment variable
    - STREAM_API_KEY and STREAM_API_SECRET environment variables
    - NOTE(review): the Gemini LLM presumably needs its own API key as
      well -- confirm which variable the gemini plugin reads.
    """
    # Build each collaborator up front so the Agent call reads as plain wiring.
    edge = getstream.Edge()
    agent_user = User(name="Friendly AI")
    tts = fish.TTS()  # Uses Fish Audio for text-to-speech
    stt = deepgram.STT()
    llm = gemini.LLM("gemini-2.0-flash")
    turn_detection = smart_turn.TurnDetection(buffer_duration=2.0, confidence_threshold=0.5)

    agent = Agent(
        edge=edge,
        agent_user=agent_user,
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        tts=tts,
        stt=stt,
        llm=llm,
        turn_detection=turn_detection,
    )
    await agent.create_user()

    # A fresh random call id for every run of the demo.
    call_id = str(uuid4())
    call = agent.edge.client.video.call("default", call_id)
    await agent.edge.open_demo(call)

    greeting = "Hello! I'm using Fish Audio for text-to-speech. How can I help you today?"
    with await agent.join(call):
        await asyncio.sleep(5)
        # The agent will greet the user using Fish Audio TTS
        await agent.llm.simple_response(text=greeting)
        await agent.finish()
51+
52+
53+
if __name__ == "__main__":
54+
asyncio.run(start_agent())
55+
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[project]
name = "fish-tts-example"
version = "0.0.0"
requires-python = ">=3.10"

# Every plugin imported by the example scripts must be listed here so that
# `uv sync` installs it; the full agent example imports the gemini plugin too.
dependencies = [
    "python-dotenv>=1.0",
    "vision-agents-plugins-fish",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-deepgram",
    "vision-agents-plugins-smart-turn",
    "vision-agents-plugins-gemini",
    "vision-agents",
]

[tool.uv.sources]
"vision-agents-plugins-fish" = {path = "..", editable=true}
"vision-agents-plugins-getstream" = {path = "../../getstream", editable=true}
"vision-agents-plugins-deepgram" = {path = "../../deepgram", editable=true}
"vision-agents-plugins-smart-turn" = {path = "../../smart_turn", editable=true}
# NOTE(review): path assumed to follow the sibling plugin layout -- confirm.
"vision-agents-plugins-gemini" = {path = "../../gemini", editable=true}
"vision-agents" = {path = "../../../agents-core", editable=true}
21+

plugins/fish/py.typed

Whitespace-only changes.

plugins/fish/pyproject.toml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[build-system]
2+
requires = ["hatchling", "hatch-vcs"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "vision-agents-plugins-fish"
7+
dynamic = ["version"]
8+
description = "Fish Audio TTS integration for Vision Agents"
9+
readme = "README.md"
10+
keywords = ["fish-audio", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
11+
requires-python = ">=3.10"
12+
license = "MIT"
13+
dependencies = [
14+
"vision-agents",
15+
"fish-audio-sdk>=2025.4.2",
16+
]
17+
18+
[project.urls]
19+
Documentation = "https://visionagents.ai/"
20+
Website = "https://visionagents.ai/"
21+
Source = "https://github.com/GetStream/Vision-Agents"
22+
23+
[tool.hatch.version]
24+
source = "vcs"
25+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26+
27+
[tool.hatch.build.targets.wheel]
28+
packages = [".", "vision_agents"]
29+
30+
[tool.hatch.build.targets.sdist]
31+
include = ["/vision_agents"]
32+
33+
[tool.uv.sources]
34+
vision-agents = { workspace = true }
35+
36+
[dependency-groups]
37+
dev = [
38+
"pytest>=8.4.1",
39+
"pytest-asyncio>=1.0.0",
40+
]
41+

plugins/fish/tests/__init__.py

Whitespace-only changes.

plugins/fish/tests/test_tts.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import os
2+
import asyncio
3+
4+
import pytest
5+
from dotenv import load_dotenv
6+
7+
from vision_agents.plugins import fish
8+
from vision_agents.core.tts.events import TTSAudioEvent, TTSErrorEvent
9+
from getstream.video.rtc.audio_track import AudioStreamTrack
10+
11+
# Load environment variables
12+
load_dotenv()
13+
14+
# Audio track for capturing test output
15+
class MockAudioTrack(AudioStreamTrack):
    """Audio track stand-in that records every chunk written to it."""

    # NOTE(review): super().__init__() is not called here; presumably this is
    # intentional so the test avoids real track setup -- confirm.
    def __init__(self, framerate: int = 16000) -> None:
        self.framerate = framerate
        # Chunks passed to write(), in call order.
        self.written_data = []

    async def write(self, data: bytes) -> bool:
        """Store ``data`` in ``written_data`` and return True."""
        self.written_data.append(data)
        return True
23+
24+
25+
@pytest.mark.integration
async def test_fish_tts_convert_text_to_audio():
    """
    Integration test with the real Fish Audio API.

    Sends a short text through ``fish.TTS`` and verifies that audio arrives
    both as ``TTSAudioEvent`` events and as data written to the output track.

    Outcomes:
    - skipped when the FISH_AUDIO_API_KEY environment variable is not set;
    - skipped when the API reports an error or ``tts.send`` raises;
    - failed when neither audio nor an error arrives within the timeout;
    - failed when synthesis completes without producing audio.

    To set up the FISH_AUDIO_API_KEY:
    1. Sign up for a Fish Audio account at https://fish.audio
    2. Create an API key in your Fish Audio dashboard
    3. Add to your .env file: FISH_AUDIO_API_KEY=your_api_key_here
    """
    # Honour the documented contract: no key, no test.
    if not os.environ.get("FISH_AUDIO_API_KEY"):
        pytest.skip("FISH_AUDIO_API_KEY is not set")

    # Create a real Fish Audio TTS instance
    tts = fish.TTS()

    # Create an audio track to capture the output
    track = MockAudioTrack()
    tts.set_output_track(track)

    # Track audio events
    audio_received = asyncio.Event()
    received_chunks = []

    @tts.events.subscribe
    async def on_audio(event: TTSAudioEvent):
        received_chunks.append(event.audio_data)
        audio_received.set()

    # Track API errors
    api_errors = []

    @tts.events.subscribe
    async def on_error(event: TTSErrorEvent):
        api_errors.append(event.error)
        audio_received.set()  # Unblock the waiting

    # Allow event subscriptions to be processed
    await asyncio.sleep(0.01)

    # Use a short text to minimize API usage
    text = "Hello from Fish Audio."

    # Send the text to generate speech
    send_task = asyncio.create_task(tts.send(text))

    # Wait for either audio or an error
    try:
        await asyncio.wait_for(audio_received.wait(), timeout=15.0)
    except asyncio.TimeoutError:
        # Cancel the task if it's taking too long
        send_task.cancel()
        pytest.fail("No audio or error received within timeout")

    # An error reported by the API is an environment problem, not a test failure
    if api_errors:
        pytest.skip(f"API error received: {api_errors[0]}")

    # Try to ensure the send task completes
    try:
        await send_task
    except Exception as e:
        pytest.skip(f"Exception during TTS generation: {e}")

    # Verify that we received audio data. These assertions deliberately sit
    # outside any ``except Exception`` handler: AssertionError is an Exception,
    # so wrapping them would turn real failures into skips.
    assert received_chunks, "No audio chunks were received"
    assert track.written_data, "No audio data was written to track"
98+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Namespace package
2+
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
3+

0 commit comments

Comments
 (0)