GetStream · Nash0x7E2 · Nov 13, 2025 · Nov 12, 2025 · Nov 12, 2025 · Nov 13, 2025
diff --git a/agents-core/pyproject.toml b/agents-core/pyproject.toml
@@ -43,6 +43,7 @@ elevenlabs = ["vision-agents-plugins-elevenlabs"]
 gemini = ["vision-agents-plugins-gemini"]
 getstream = ["vision-agents-plugins-getstream"]
 heygen = ["vision-agents-plugins-heygen"]
+inworld = ["vision-agents-plugins-inworld"]
 kokoro = ["vision-agents-plugins-kokoro"]
 krisp = ["vision-agents-plugins-krisp"]
 moonshine = ["vision-agents-plugins-moonshine"]
@@ -59,6 +60,7 @@ all-plugins = [
   "vision-agents-plugins-gemini",
   "vision-agents-plugins-getstream",
   "vision-agents-plugins-heygen",
+  "vision-agents-plugins-inworld",
   "vision-agents-plugins-kokoro",
   "vision-agents-plugins-krisp",
   "vision-agents-plugins-moonshine",

diff --git a/plugins/inworld/README.md b/plugins/inworld/README.md
@@ -0,0 +1,73 @@
+# Inworld AI Text-to-Speech Plugin
+
+A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
+
+## Installation
+
+```bash
+uv add "vision-agents[inworld]"
+```
+
+## Usage
+
+```python
+from vision_agents.plugins import inworld
+
+# Initialize with API key from environment variable
+tts = inworld.TTS()
+
+# Or specify API key and other options directly
+tts = inworld.TTS(
+    api_key="your_inworld_api_key",
+    voice_id="Dennis",
+    model_id="inworld-tts-1",
+    temperature=1.1
+)
+
+# Use with an Agent
+from vision_agents.core import Agent
+from vision_agents.plugins import getstream, gemini, smart_turn
+
+agent = Agent(
+    edge=getstream.Edge(),
+    tts=inworld.TTS(),
+    llm=gemini.LLM("gemini-2.0-flash"),
+    turn_detection=smart_turn.TurnDetection(),
+)
+```
+
+## Configuration Options
+
+- `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
+- `voice_id`: The voice ID to use for synthesis (default: "Dennis")
+- `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
+- `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
+- `base_url`: Optional custom API endpoint (default: "https://api.inworld.ai")
+- `client`: Optionally pass in your own instance of `httpx.AsyncClient`
+
+## Requirements
+
+- Python 3.10+
+- httpx>=0.27.0
+
+## Getting Started
+
+1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
+2. Set the `INWORLD_API_KEY` environment variable:
+   ```bash
+   export INWORLD_API_KEY="your_api_key_here"
+   ```
+3. Use the plugin in your Vision Agents application
+
+## API Reference
+
+The plugin implements the standard Vision Agents TTS interface:
+
+- `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
+- `stop_audio()`: Stop audio playback (no-op for this plugin)
+- `send(text: str)`: Send text to be converted to speech (inherited from base class)
+
+## Streaming
+
+This plugin only supports streaming mode. Audio chunks are returned as they are processed by the Inworld AI API, providing low-latency audio synthesis.
+
diff --git a/plugins/inworld/example/__init__.py b/plugins/inworld/example/__init__.py
diff --git a/plugins/inworld/example/inworld_tts_example.py b/plugins/inworld/example/inworld_tts_example.py
@@ -0,0 +1,69 @@
+"""
+Inworld AI TTS Example
+
+This example demonstrates Inworld AI TTS integration with Vision Agents.
+
+This example creates an agent that uses:
+- Inworld AI for text-to-speech (TTS)
+- Stream for edge/real-time communication
+- Deepgram for speech-to-text (STT)
+- Gemini for the LLM
+- Smart Turn for turn detection
+
+Requirements:
+- INWORLD_API_KEY environment variable
+- STREAM_API_KEY and STREAM_API_SECRET environment variables
+- DEEPGRAM_API_KEY environment variable
+- Credentials for the Gemini LLM (e.g. a Google API key in the environment)
+"""
+
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+
+from vision_agents.core import User, Agent, cli
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import inworld, getstream, smart_turn, gemini, deepgram
+
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Runs at import time, before any plugin below is constructed.
+load_dotenv()
+
+
+async def create_agent(**kwargs) -> Agent:
+    """Assemble the example agent that speaks through Inworld AI TTS.
+
+    Extra keyword arguments are accepted and ignored.
+    """
+    # Components are built in the same order they are handed to Agent.
+    edge = getstream.Edge()
+    agent_user = User(name="Friendly AI", id="agent")
+    tts = inworld.TTS()
+    stt = deepgram.STT()
+    llm = gemini.LLM("gemini-2.0-flash")
+    turn_detection = smart_turn.TurnDetection()
+    return Agent(
+        edge=edge,
+        agent_user=agent_user,
+        instructions="You're a helpful voice AI assistant. Keep your responses concise and friendly.",
+        tts=tts,
+        stt=stt,
+        llm=llm,
+        turn_detection=turn_detection,
+    )
+
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    """Create a call, join it with the agent, send a greeting, and wait for the end.
+
+    Args:
+        agent: The agent that joins the call.
+        call_type: Call type forwarded to ``agent.create_call``.
+        call_id: Call identifier forwarded to ``agent.create_call``.
+        **kwargs: Accepted but unused.
+    """
+    # Make sure the agent user exists, then create the call.
+    await agent.create_user()
+    call = await agent.create_call(call_type, call_id)
+
+    logger.info("🤖 Starting Inworld AI Agent...")
+
+    # Join the call/room; the awaited result is used as a context manager.
+    session = await agent.join(call)
+    with session:
+        logger.info("Joining call")
+        logger.info("LLM ready")
+
+        # Pause briefly, then send the opening greeting through the LLM.
+        await asyncio.sleep(5)
+        await agent.llm.simple_response(text="Hello! How can I help you today?")
+
+        # Run until the call ends.
+        await agent.finish()
+
+
+# Script entry point: hand both coroutines to the CLI via an AgentLauncher.
+if __name__ == "__main__":
+    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
+
diff --git a/plugins/inworld/example/pyproject.toml b/plugins/inworld/example/pyproject.toml
@@ -0,0 +1,21 @@
+[project]
+name = "inworld-tts-example"
+version = "0.1.0"
+description = "Example using Inworld AI TTS with Vision Agents"
+requires-python = ">=3.10"
+dependencies = [
+    "vision-agents-plugins-inworld",
+    "vision-agents-plugins-getstream",
+    "vision-agents-plugins-smart-turn",
+    "vision-agents-plugins-gemini", 
+    "vision-agents-plugins-deepgram",
+    "python-dotenv",
+]
+
+[tool.uv.sources]
+vision-agents-plugins-inworld = { workspace = true }
+vision-agents-plugins-getstream = { workspace = true }
+vision-agents-plugins-smart-turn = { workspace = true }
+vision-agents-plugins-gemini = { workspace = true }
+vision-agents-plugins-deepgram = { workspace = true }
+
diff --git a/plugins/inworld/py.typed b/plugins/inworld/py.typed
diff --git a/plugins/inworld/pyproject.toml b/plugins/inworld/pyproject.toml
@@ -0,0 +1,41 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "vision-agents-plugins-inworld"
+dynamic = ["version"]
+description = "Inworld AI TTS integration for Vision Agents"
+readme = "README.md"
+keywords = ["inworld", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
+requires-python = ">=3.10"
+license = "MIT"
+dependencies = [
+    "vision-agents",
+    "httpx>=0.27.0",
+]
+
+[project.urls]
+Documentation = "https://visionagents.ai/"
+Website = "https://visionagents.ai/"
+Source = "https://github.com/GetStream/Vision-Agents"
+
+[tool.hatch.version]
+source = "vcs"
+raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
+
+[tool.hatch.build.targets.wheel]
+packages = [".", "vision_agents"]
+
+[tool.hatch.build.targets.sdist]
+include = ["/vision_agents"]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
+]
+
diff --git a/plugins/inworld/tests/__init__.py b/plugins/inworld/tests/__init__.py
diff --git a/plugins/inworld/tests/test_tts.py b/plugins/inworld/tests/test_tts.py
@@ -0,0 +1,32 @@
+import pytest
+from dotenv import load_dotenv
+
+from vision_agents.plugins import inworld
+from vision_agents.core.tts.manual_test import manual_tts_to_wav
+from vision_agents.core.tts.testing import TTSSession
+
+# Load environment variables
+load_dotenv()
+
+
+class TestInworldTTS:
+    """Integration tests for the Inworld AI TTS plugin."""
+
+    @pytest.fixture
+    async def tts(self) -> inworld.TTS:
+        """Provide a TTS instance built with default arguments."""
+        instance = inworld.TTS()
+        return instance
+
+    @pytest.mark.integration
+    async def test_inworld_tts_convert_text_to_audio_manual_test(self, tts: inworld.TTS):
+        """Run the shared manual WAV helper with 48000 Hz, 2-channel output."""
+        await manual_tts_to_wav(tts, sample_rate=48000, channels=2)
+
+    @pytest.mark.integration
+    async def test_inworld_tts_convert_text_to_audio(self, tts: inworld.TTS):
+        """Send a short phrase and expect speech output with no errors."""
+        tts.set_output_format(sample_rate=16000, channels=1)
+        recorder = TTSSession(tts)
+        phrase = "Hello from Inworld AI."
+
+        await tts.send(phrase)
+        await recorder.wait_for_result(timeout=15.0)
+
+        # Errors are checked first so a failed synthesis is reported as such.
+        assert not recorder.errors
+        speech_count = len(recorder.speeches)
+        assert speech_count > 0
+
diff --git a/plugins/inworld/vision_agents/plugins/inworld/__init__.py b/plugins/inworld/vision_agents/plugins/inworld/__init__.py
@@ -0,0 +1,4 @@
+from vision_agents.plugins.inworld.tts import TTS
+
+__all__ = ["TTS"]
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from vision_agents.plugins.inworld.tts import TTS

		__all__ = ["TTS"]