Skip to content

Commit 8c44a7f

Browse files
committed
Bootstrapping inworld tts
1 parent 8eb5f3d commit 8c44a7f

File tree

13 files changed

+732
-267
lines changed

13 files changed

+732
-267
lines changed

agents-core/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ elevenlabs = ["vision-agents-plugins-elevenlabs"]
4343
gemini = ["vision-agents-plugins-gemini"]
4444
getstream = ["vision-agents-plugins-getstream"]
4545
heygen = ["vision-agents-plugins-heygen"]
46+
inworld = ["vision-agents-plugins-inworld"]
4647
kokoro = ["vision-agents-plugins-kokoro"]
4748
krisp = ["vision-agents-plugins-krisp"]
4849
moonshine = ["vision-agents-plugins-moonshine"]
@@ -59,6 +60,7 @@ all-plugins = [
5960
"vision-agents-plugins-gemini",
6061
"vision-agents-plugins-getstream",
6162
"vision-agents-plugins-heygen",
63+
"vision-agents-plugins-inworld",
6264
"vision-agents-plugins-kokoro",
6365
"vision-agents-plugins-krisp",
6466
"vision-agents-plugins-moonshine",

plugins/inworld/README.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Inworld AI Text-to-Speech Plugin
2+
3+
A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
4+
5+
## Installation
6+
7+
```bash
8+
uv add "vision-agents[inworld]"
9+
```
10+
11+
## Usage
12+
13+
```python
14+
from vision_agents.plugins import inworld
15+
16+
# Initialize with API key from environment variable
17+
tts = inworld.TTS()
18+
19+
# Or specify API key and other options directly
20+
tts = inworld.TTS(
21+
api_key="your_inworld_api_key",
22+
voice_id="Dennis",
23+
model_id="inworld-tts-1",
24+
temperature=1.1
25+
)
26+
27+
# Use with an Agent
28+
from vision_agents.core import Agent
29+
from vision_agents.plugins import getstream, gemini, smart_turn
30+
31+
agent = Agent(
32+
edge=getstream.Edge(),
33+
tts=inworld.TTS(),
34+
llm=gemini.LLM("gemini-2.0-flash"),
35+
turn_detection=smart_turn.TurnDetection(),
36+
)
37+
```
38+
39+
## Configuration Options
40+
41+
- `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
42+
- `voice_id`: The voice ID to use for synthesis (default: "Dennis")
43+
- `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
44+
- `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
45+
- `base_url`: Optional custom API endpoint (default: "https://api.inworld.ai")
46+
- `client`: Optionally pass in your own instance of `httpx.AsyncClient`
47+
48+
## Requirements
49+
50+
- Python 3.10+
51+
- httpx>=0.27.0
52+
53+
## Getting Started
54+
55+
1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
56+
2. Set the `INWORLD_API_KEY` environment variable:
57+
```bash
58+
export INWORLD_API_KEY="your_api_key_here"
59+
```
60+
3. Use the plugin in your Vision Agents application
61+
62+
## API Reference
63+
64+
The plugin implements the standard Vision Agents TTS interface:
65+
66+
- `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
67+
- `stop_audio()`: Stop audio playback (no-op for this plugin)
68+
- `send(text: str)`: Send text to be converted to speech (inherited from base class)
69+
70+
## Streaming
71+
72+
This plugin only supports streaming mode. Audio chunks are returned as they are processed by the Inworld AI API, providing low-latency audio synthesis.
73+

plugins/inworld/example/__init__.py

Whitespace-only changes.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""
2+
Inworld AI TTS Example
3+
4+
This example demonstrates Inworld AI TTS integration with Vision Agents.
5+
6+
This example creates an agent that uses:
7+
- Inworld AI for text-to-speech (TTS)
8+
- GetStream for edge/real-time communication
9+
- Smart Turn for turn detection
10+
11+
Requirements:
12+
- INWORLD_API_KEY environment variable
13+
- STREAM_API_KEY and STREAM_API_SECRET environment variables
14+
"""
15+
16+
import asyncio
17+
import logging
18+
19+
from dotenv import load_dotenv
20+
21+
from vision_agents.core import User, Agent, cli
22+
from vision_agents.core.agents import AgentLauncher
23+
from vision_agents.plugins import inworld, getstream, smart_turn, gemini
24+
25+
26+
logger = logging.getLogger(__name__)
27+
28+
load_dotenv()
29+
30+
31+
async def create_agent(**kwargs) -> Agent:
32+
"""Create the agent with Inworld AI TTS."""
33+
agent = Agent(
34+
edge=getstream.Edge(),
35+
agent_user=User(name="Friendly AI", id="agent"),
36+
instructions="You're a helpful voice AI assistant. Keep your responses concise and friendly.",
37+
tts=inworld.TTS(), # Uses Inworld AI for text-to-speech
38+
llm=gemini.LLM("gemini-2.0-flash"),
39+
turn_detection=smart_turn.TurnDetection(buffer_in_seconds=2.0, confidence_threshold=0.5),
40+
)
41+
return agent
42+
43+
44+
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
45+
"""Join the call and start the agent."""
46+
# Ensure the agent user is created
47+
await agent.create_user()
48+
# Create a call
49+
call = await agent.create_call(call_type, call_id)
50+
51+
logger.info("🤖 Starting Inworld AI Agent...")
52+
53+
# Have the agent join the call/room
54+
with await agent.join(call):
55+
logger.info("Joining call")
56+
logger.info("LLM ready")
57+
58+
await asyncio.sleep(5)
59+
await agent.llm.simple_response(text="Hello! How can I help you today?")
60+
61+
await agent.finish() # Run till the call ends
62+
63+
64+
if __name__ == "__main__":
65+
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
66+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[project]
2+
name = "inworld-tts-example"
3+
version = "0.1.0"
4+
description = "Example using Inworld AI TTS with Vision Agents"
5+
requires-python = ">=3.10"
6+
dependencies = [
7+
"vision-agents-plugins-inworld",
8+
"vision-agents-plugins-getstream",
9+
"vision-agents-plugins-smart-turn",
10+
"vision-agents-plugins-gemini",
11+
"python-dotenv",
12+
]
13+
14+
[tool.uv.sources]
15+
vision-agents-plugins-inworld = { workspace = true }
16+
vision-agents-plugins-getstream = { workspace = true }
17+
vision-agents-plugins-smart-turn = { workspace = true }
18+
vision-agents-plugins-gemini = { workspace = true }
19+

plugins/inworld/py.typed

Whitespace-only changes.

plugins/inworld/pyproject.toml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[build-system]
2+
requires = ["hatchling", "hatch-vcs"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "vision-agents-plugins-inworld"
7+
dynamic = ["version"]
8+
description = "Inworld AI TTS integration for Vision Agents"
9+
readme = "README.md"
10+
keywords = ["inworld", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
11+
requires-python = ">=3.10"
12+
license = "MIT"
13+
dependencies = [
14+
"vision-agents",
15+
"httpx>=0.27.0",
16+
]
17+
18+
[project.urls]
19+
Documentation = "https://visionagents.ai/"
20+
Website = "https://visionagents.ai/"
21+
Source = "https://github.com/GetStream/Vision-Agents"
22+
23+
[tool.hatch.version]
24+
source = "vcs"
25+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26+
27+
[tool.hatch.build.targets.wheel]
28+
packages = [".", "vision_agents"]
29+
30+
[tool.hatch.build.targets.sdist]
31+
include = ["/vision_agents"]
32+
33+
[tool.uv.sources]
34+
vision-agents = { workspace = true }
35+
36+
[dependency-groups]
37+
dev = [
38+
"pytest>=8.4.1",
39+
"pytest-asyncio>=1.0.0",
40+
]
41+

plugins/inworld/tests/__init__.py

Whitespace-only changes.

plugins/inworld/tests/test_tts.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pytest
2+
from dotenv import load_dotenv
3+
4+
from vision_agents.plugins import inworld
5+
from vision_agents.core.tts.manual_test import manual_tts_to_wav
6+
from vision_agents.core.tts.testing import TTSSession
7+
8+
# Load environment variables
9+
load_dotenv()
10+
11+
12+
class TestInworldTTS:
13+
@pytest.fixture
14+
async def tts(self) -> inworld.TTS:
15+
return inworld.TTS()
16+
17+
@pytest.mark.integration
18+
async def test_inworld_tts_convert_text_to_audio_manual_test(self, tts: inworld.TTS):
19+
await manual_tts_to_wav(tts, sample_rate=48000, channels=2)
20+
21+
@pytest.mark.integration
22+
async def test_inworld_tts_convert_text_to_audio(self, tts: inworld.TTS):
23+
tts.set_output_format(sample_rate=16000, channels=1)
24+
session = TTSSession(tts)
25+
text = "Hello from Inworld AI."
26+
27+
await tts.send(text)
28+
await session.wait_for_result(timeout=15.0)
29+
30+
assert not session.errors
31+
assert len(session.speeches) > 0
32+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from vision_agents.plugins.inworld.tts import TTS
2+
3+
__all__ = ["TTS"]
4+

0 commit comments

Comments
 (0)