Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions agents-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ elevenlabs = ["vision-agents-plugins-elevenlabs"]
gemini = ["vision-agents-plugins-gemini"]
getstream = ["vision-agents-plugins-getstream"]
heygen = ["vision-agents-plugins-heygen"]
inworld = ["vision-agents-plugins-inworld"]
kokoro = ["vision-agents-plugins-kokoro"]
krisp = ["vision-agents-plugins-krisp"]
moonshine = ["vision-agents-plugins-moonshine"]
Expand All @@ -59,6 +60,7 @@ all-plugins = [
"vision-agents-plugins-gemini",
"vision-agents-plugins-getstream",
"vision-agents-plugins-heygen",
"vision-agents-plugins-inworld",
"vision-agents-plugins-kokoro",
"vision-agents-plugins-krisp",
"vision-agents-plugins-moonshine",
Expand Down
73 changes: 73 additions & 0 deletions plugins/inworld/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Inworld AI Text-to-Speech Plugin

A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.

## Installation

```bash
uv add "vision-agents[inworld]"
```

## Usage

```python
from vision_agents.plugins import inworld

# Initialize with API key from environment variable
tts = inworld.TTS()

# Or specify API key and other options directly
tts = inworld.TTS(
api_key="your_inworld_api_key",
voice_id="Dennis",
model_id="inworld-tts-1",
temperature=1.1
)

# Use with an Agent
from vision_agents.core import Agent
from vision_agents.plugins import getstream, gemini, smart_turn

agent = Agent(
edge=getstream.Edge(),
tts=inworld.TTS(),
llm=gemini.LLM("gemini-2.0-flash"),
turn_detection=smart_turn.TurnDetection(),
)
```

## Configuration Options

- `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
- `voice_id`: The voice ID to use for synthesis (default: "Dennis")
- `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
- `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
- `base_url`: Optional custom API endpoint (default: "https://api.inworld.ai")
- `client`: Optionally pass in your own instance of `httpx.AsyncClient`

## Requirements

- Python 3.10+
- httpx>=0.27.0

## Getting Started

1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
2. Set the `INWORLD_API_KEY` environment variable:
   ```bash
   export INWORLD_API_KEY="your_api_key_here"
   ```
3. Use the plugin in your Vision Agents application

## API Reference

The plugin implements the standard Vision Agents TTS interface:

- `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
- `stop_audio()`: Stop audio playback (no-op for this plugin)
- `send(text: str)`: Send text to be converted to speech (inherited from base class)

## Streaming

This plugin only supports streaming mode. Audio chunks are returned as they are processed by the Inworld AI API, providing low-latency audio synthesis.

Empty file.
69 changes: 69 additions & 0 deletions plugins/inworld/example/inworld_tts_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Inworld AI TTS Example

This example demonstrates Inworld AI TTS integration with Vision Agents.

This example creates an agent that uses:
- Inworld AI for text-to-speech (TTS)
- Stream for edge/real-time communication
- Deepgram for speech-to-text (STT)
- Gemini for the LLM
- Smart Turn for turn detection

Requirements:
- INWORLD_API_KEY environment variable
- STREAM_API_KEY and STREAM_API_SECRET environment variables
- DEEPGRAM_API_KEY environment variable
- Google Gemini API credentials (e.g. a GOOGLE_API_KEY environment variable)
"""

import asyncio
import logging

from dotenv import load_dotenv

from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import inworld, getstream, smart_turn, gemini, deepgram


# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Load environment variables from a .env file (if one is found) at import time,
# i.e. before any agent is created.
load_dotenv()


async def create_agent(**kwargs) -> Agent:
    """Build the voice agent whose speech is synthesized by Inworld AI.

    Args:
        **kwargs: Accepted but not used by this example.

    Returns:
        An ``Agent`` wired with the Stream edge, Inworld TTS, Deepgram STT,
        a Gemini LLM, and Smart Turn turn detection.
    """
    # Components are constructed in the same order as they are passed below.
    edge = getstream.Edge()
    persona = User(name="Friendly AI", id="agent")
    speech_out = inworld.TTS()
    speech_in = deepgram.STT()
    language_model = gemini.LLM("gemini-2.0-flash")
    turn_detector = smart_turn.TurnDetection()

    return Agent(
        edge=edge,
        agent_user=persona,
        instructions="You're a helpful voice AI assistant. Keep your responses concise and friendly.",
        tts=speech_out,
        stt=speech_in,
        llm=language_model,
        turn_detection=turn_detector,
    )


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Connect the agent to a call, greet the user, and stay until the call ends.

    Args:
        agent: The agent to connect.
        call_type: Call type forwarded to ``agent.create_call``.
        call_id: Call identifier forwarded to ``agent.create_call``.
        **kwargs: Accepted but not used by this example.
    """
    # Make sure the agent's user exists, then create the call to join.
    await agent.create_user()
    target_call = await agent.create_call(call_type, call_id)

    logger.info("🤖 Starting Inworld AI Agent...")

    # Join the call/room; the returned object is used as a context manager
    # for the duration of the session.
    connection = await agent.join(target_call)
    with connection:
        logger.info("Joining call")
        logger.info("LLM ready")

        # Wait five seconds, then have the LLM produce the opening greeting.
        await asyncio.sleep(5)
        await agent.llm.simple_response(text="Hello! How can I help you today?")

        # Run until the call ends.
        await agent.finish()


if __name__ == "__main__":
    # Give the launcher the two lifecycle callbacks and run it through the CLI.
    launcher = AgentLauncher(create_agent=create_agent, join_call=join_call)
    cli(launcher)

21 changes: 21 additions & 0 deletions plugins/inworld/example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[project]
name = "inworld-tts-example"
version = "0.1.0"
description = "Example using Inworld AI TTS with Vision Agents"
requires-python = ">=3.10"
dependencies = [
"vision-agents-plugins-inworld",
"vision-agents-plugins-getstream",
"vision-agents-plugins-smart-turn",
"vision-agents-plugins-gemini",
"vision-agents-plugins-deepgram",
"python-dotenv",
]

[tool.uv.sources]
vision-agents-plugins-inworld = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-smart-turn = { workspace = true }
vision-agents-plugins-gemini = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }

Empty file added plugins/inworld/py.typed
Empty file.
41 changes: 41 additions & 0 deletions plugins/inworld/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-inworld"
dynamic = ["version"]
description = "Inworld AI TTS integration for Vision Agents"
readme = "README.md"
keywords = ["inworld", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"httpx>=0.27.0",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = [".", "vision_agents"]

[tool.hatch.build.targets.sdist]
include = ["/vision_agents"]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
]

Empty file.
32 changes: 32 additions & 0 deletions plugins/inworld/tests/test_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest
from dotenv import load_dotenv

from vision_agents.plugins import inworld
from vision_agents.core.tts.manual_test import manual_tts_to_wav
from vision_agents.core.tts.testing import TTSSession

# Load environment variables (e.g. API keys) from a local .env file, if one exists
load_dotenv()


class TestInworldTTS:
    """Integration tests for the Inworld TTS plugin."""

    @pytest.fixture
    async def tts(self) -> inworld.TTS:
        """Provide a TTS instance constructed with default arguments."""
        return inworld.TTS()

    @pytest.mark.integration
    async def test_inworld_tts_convert_text_to_audio_manual_test(self, tts: inworld.TTS):
        """Run the manual TTS-to-WAV helper with sample_rate=48000 and two channels."""
        await manual_tts_to_wav(tts, sample_rate=48000, channels=2)

    @pytest.mark.integration
    async def test_inworld_tts_convert_text_to_audio(self, tts: inworld.TTS):
        """Send one sentence and expect at least one speech result and no errors."""
        tts.set_output_format(sample_rate=16000, channels=1)
        # The session is created before sending so it is attached when output arrives.
        recorder = TTSSession(tts)

        await tts.send("Hello from Inworld AI.")
        await recorder.wait_for_result(timeout=15.0)

        assert not recorder.errors
        assert len(recorder.speeches) > 0

4 changes: 4 additions & 0 deletions plugins/inworld/vision_agents/plugins/inworld/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from vision_agents.plugins.inworld.tts import TTS

# Public API of this plugin package: only the TTS class is exported.
__all__ = ["TTS"]

Loading
Loading