8 changes: 8 additions & 0 deletions agents-core/vision_agents/core/agents/agent_launcher.py
@@ -92,6 +92,14 @@ async def warmup(self, **kwargs) -> None:
if agent.turn_detection and hasattr(agent.turn_detection, 'warmup'):
logger.debug("Warming up turn detection: %s", agent.turn_detection.__class__.__name__)
warmup_tasks.append(agent.turn_detection.warmup())

# Warmup processors
if agent.processors:
logger.debug("Warming up processors")
for processor in agent.processors:
if hasattr(processor, 'warmup'):
logger.debug("Warming up processor: %s", processor.__class__.__name__)
warmup_tasks.append(processor.warmup())

# Run all warmups in parallel
if warmup_tasks:
5 changes: 2 additions & 3 deletions agents-core/vision_agents/core/agents/agents.py
@@ -5,12 +5,11 @@
import time
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
from uuid import uuid4

import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.async_call import Call
from getstream.video.rtc import Call

from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
@@ -697,7 +696,7 @@ async def create_user(self) -> None:
async def create_call(self, call_type: str, call_id: str) -> Call:
"""Shortcut for creating a call/room etc."""
call = self.edge.client.video.call(call_type, call_id)
response = await call.get_or_create(data={"created_by_id": self.agent_user.id})
await call.get_or_create(data={"created_by_id": self.agent_user.id})

return call

160 changes: 151 additions & 9 deletions plugins/moondream/README.md
@@ -1,27 +1,48 @@
# Moondream Plugin

This plugin provides Moondream 3 detection capabilities for vision-agents, enabling real-time zero-shot object detection on video streams. Choose between cloud-hosted or local processing depending on your needs.
This plugin provides Moondream 3 vision capabilities for vision-agents, including:
- **Object Detection**: Real-time zero-shot object detection on video streams
- **Visual Question Answering (VQA)**: Answer questions about video frames
- **Image Captioning**: Generate descriptions of video frames

Choose between cloud-hosted or local processing depending on your needs. When running locally, we recommend doing so on CUDA-enabled devices.

## Installation

```bash
uv add vision-agents-plugins-moondream
uv add vision-agents[moondream]
```

## Choosing the Right Processor
## Choosing the Right Component

### Detection Processors

### CloudDetectionProcessor (Recommended for Most Users)
#### CloudDetectionProcessor (Recommended for Most Users)
- **Use when:** You want a simple setup with no infrastructure management
- **Pros:** No model download, no GPU required, automatic updates
- **Cons:** Requires API key, 2 RPS rate limit by default (can be increased)
- **Best for:** Development, testing, low-to-medium volume applications

### LocalDetectionProcessor (For Advanced Users)
#### LocalDetectionProcessor (For Advanced Users)
- **Use when:** You need higher throughput, have your own GPU infrastructure, or want to avoid rate limits
- **Pros:** No rate limits, no API costs, full control over hardware
- **Cons:** Requires GPU for best performance, model download on first use, infrastructure management
- **Best for:** Production deployments, high-volume applications, Digital Ocean Gradient AI GPUs, or custom infrastructure

### Vision Language Models (VLM)

#### CloudVLM (Recommended for Most Users)
- **Use when:** You want visual question answering or captioning without managing infrastructure
- **Pros:** No model download, no GPU required, automatic updates
- **Cons:** Requires API key, rate limits apply
- **Best for:** Development, testing, applications requiring VQA or captioning

#### LocalVLM (For Advanced Users)
- **Use when:** You need VQA or captioning with higher throughput or want to avoid rate limits
- **Pros:** No rate limits, no API costs, full control over hardware
- **Cons:** Requires GPU for best performance, model download on first use, infrastructure management
- **Best for:** Production deployments, high-volume applications, or custom infrastructure

## Quick Start

### Using CloudDetectionProcessor (Hosted)
@@ -64,7 +85,7 @@ from vision_agents.core import Agent
processor = moondream.LocalDetectionProcessor(
detect_objects=["person", "car", "dog"],
conf_threshold=0.3,
device="cuda", # Auto-detects CUDA, MPS, or CPU
force_cpu=False, # Auto-detects CUDA, MPS, or CPU
fps=30
)

@@ -87,6 +108,107 @@ processor = moondream.CloudDetectionProcessor(
)
```

## Vision Language Model (VLM) Quick Start

### Using CloudVLM (Hosted)

The `CloudVLM` uses Moondream's hosted API for visual question answering and captioning. It automatically processes video frames and responds to questions asked via STT (Speech-to-Text).

```python
import asyncio
import os
from dotenv import load_dotenv
from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream
from vision_agents.core.events import CallSessionParticipantJoinedEvent

load_dotenv()

async def create_agent(**kwargs) -> Agent:
# Create a cloud VLM for visual question answering
llm = moondream.CloudVLM(
api_key=os.getenv("MOONDREAM_API_KEY"), # or set MOONDREAM_API_KEY env var
mode="vqa", # or "caption" for image captioning
)

agent = Agent(
edge=getstream.Edge(),
agent_user=User(name="My happy AI friend", id="agent"),
llm=llm,
tts=elevenlabs.TTS(),
stt=deepgram.STT(),
)
return agent

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
await agent.create_user()
call = await agent.create_call(call_type, call_id)

@agent.events.subscribe
async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
if event.participant.user.id != "agent":
await asyncio.sleep(2)
# Ask the agent to describe what it sees
await agent.simple_response("Describe what you currently see")

with await agent.join(call):
await agent.edge.open_demo(call)
await agent.finish()

if __name__ == "__main__":
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

### Using LocalVLM (On-Device)

The `LocalVLM` downloads the model from HuggingFace and runs on device. It supports both VQA and captioning modes.

**Note:** The moondream3-preview model is gated and requires HuggingFace authentication:
- Request access at https://huggingface.co/moondream/moondream3-preview
- Set `HF_TOKEN` environment variable: `export HF_TOKEN=your_token_here`
- Or run: `huggingface-cli login`
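
If you prefer to authenticate from Python instead of the CLI, here is a minimal sketch using `huggingface_hub` (pulled in by `transformers`); it assumes `HF_TOKEN` is already set in your environment:

```python
import os
from huggingface_hub import login

# Log in to HuggingFace so the gated moondream3-preview weights can be downloaded
login(token=os.environ["HF_TOKEN"])
```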

```python
from vision_agents.plugins import moondream
from vision_agents.core import Agent

# Create a local VLM (no API key needed)
llm = moondream.LocalVLM(
mode="vqa", # or "caption" for image captioning
force_cpu=False, # Auto-detects CUDA, MPS, or CPU
)

# Use in an agent
agent = Agent(
llm=llm,
tts=your_tts,
stt=your_stt,
# ... other components
)
```

### VLM Modes

The VLM supports two modes:

- **`"vqa"`** (Visual Question Answering): Answers questions about video frames. Questions come from STT transcripts.
- **`"caption"`** (Image Captioning): Generates descriptions of video frames automatically.

```python
# VQA mode - answers questions about frames
llm = moondream.CloudVLM(
api_key="your-api-key",
mode="vqa"
)

# Caption mode - generates automatic descriptions
llm = moondream.CloudVLM(
api_key="your-api-key",
mode="caption"
)
```

## Configuration

### CloudDetectionProcessor Parameters
@@ -107,12 +229,30 @@ processor = moondream.CloudDetectionProcessor(
- `fps`: int - Frame processing rate (default: 30)
- `interval`: int - Processing interval in seconds (default: 0)
- `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10)
- `device`: str - Device to run inference on ('cuda', 'mps', or 'cpu'). Auto-detects CUDA, then MPS (Apple Silicon), then defaults to CPU. Default: `None` (auto-detect)
- `force_cpu`: bool - If True, forces CPU even when CUDA/MPS is available; otherwise the device is auto-detected (CUDA, then MPS on Apple Silicon, then CPU). We recommend CUDA for best performance. (default: False)
- `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview")
- `options`: AgentOptions - Model directory configuration. If not provided, uses the default, which stores models under `tempfile.gettempdir()`

**Performance:** Performance varies with your hardware configuration. CUDA is recommended for best results on NVIDIA GPUs. The model is downloaded from HuggingFace on first use.
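
A configuration sketch for `LocalDetectionProcessor` combining the parameters above with the `detect_objects` and `conf_threshold` arguments from the Quick Start; the values are illustrative, not recommendations:

```python
from vision_agents.plugins import moondream

processor = moondream.LocalDetectionProcessor(
    detect_objects=["person", "car", "dog"],    # zero-shot labels to detect
    conf_threshold=0.3,                         # discard low-confidence detections
    fps=30,                                     # frame processing rate
    interval=0,                                 # processing interval in seconds
    max_workers=10,                             # thread pool for CPU-intensive work
    force_cpu=False,                            # auto-detect CUDA/MPS; True pins to CPU
    model_name="moondream/moondream3-preview",  # gated HuggingFace model
)
```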

### CloudVLM Parameters

- `api_key`: str - API key for Moondream Cloud API. If not provided, will attempt to read from `MOONDREAM_API_KEY` environment variable.
- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa")
- `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10)

**Rate Limits:** By default, the Moondream Cloud API has rate limits. Contact the Moondream team to request higher limits.
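
A short sketch of a fully specified `CloudVLM`; the API key falls back to the `MOONDREAM_API_KEY` environment variable when omitted, and the `max_workers` value here is arbitrary:

```python
import os
from vision_agents.plugins import moondream

vlm = moondream.CloudVLM(
    api_key=os.getenv("MOONDREAM_API_KEY"),  # optional; read from env when omitted
    mode="caption",                          # or "vqa" (default)
    max_workers=4,                           # thread pool for CPU-intensive work
)
```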

### LocalVLM Parameters

- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa")
- `max_workers`: int - Thread pool size for async operations (default: 10)
- `force_cpu`: bool - If True, forces CPU even when CUDA/MPS is available; otherwise the device is auto-detected (CUDA, then MPS on Apple Silicon, then CPU). Note: MPS is automatically converted to CPU due to model compatibility. We recommend CUDA for best performance. (default: False)
- `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview")
- `options`: AgentOptions - Model directory configuration. If not provided, uses `default_agent_options()`

**Performance:** Performance varies with your hardware configuration. CUDA is recommended for best results on NVIDIA GPUs. The model is downloaded from HuggingFace on first use.
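
And the equivalent sketch for `LocalVLM`, leaving `options` unset so the default model directory from `default_agent_options()` is used:

```python
from vision_agents.plugins import moondream

llm = moondream.LocalVLM(
    mode="vqa",                                 # or "caption"
    max_workers=10,                             # thread pool for async operations
    force_cpu=False,                            # auto-detect CUDA; MPS falls back to CPU
    model_name="moondream/moondream3-preview",  # gated HuggingFace model
)
```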

## Video Publishing

The processor publishes annotated video frames with bounding boxes drawn on detected objects:
@@ -146,16 +286,18 @@ pytest plugins/moondream/tests/ -k "annotation" -v

### Required
- `vision-agents` - Core framework
- `moondream` - Moondream SDK for cloud API (CloudDetectionProcessor only)
- `moondream` - Moondream SDK for cloud API (CloudDetectionProcessor and CloudVLM)
- `numpy>=2.0.0` - Array operations
- `pillow>=10.0.0` - Image processing
- `opencv-python>=4.8.0` - Video annotation
- `aiortc` - WebRTC support

### LocalDetectionProcessor Additional Dependencies
### Local Components Additional Dependencies
- `torch` - PyTorch for model inference
- `transformers` - HuggingFace transformers library for model loading

**Note:** Both `LocalDetectionProcessor` and `LocalVLM` require these dependencies. We recommend running the models locally only on CUDA devices.

## Links

- [Moondream Documentation](https://docs.moondream.ai/)
2 changes: 2 additions & 0 deletions plugins/moondream/example/README.md
@@ -0,0 +1,2 @@
## Moondream example
Please see the root README for details.
Empty file.
53 changes: 53 additions & 0 deletions plugins/moondream/example/moondream_vlm_example.py
@@ -0,0 +1,53 @@
import asyncio
import logging
from dotenv import load_dotenv

from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream
from vision_agents.core.events import CallSessionParticipantJoinedEvent
import os

logger = logging.getLogger(__name__)

load_dotenv()

async def create_agent(**kwargs) -> Agent:
llm = moondream.CloudVLM(
api_key=os.getenv("MOONDREAM_API_KEY"),
)
# create an agent to run with Stream's edge and the Moondream Cloud VLM
agent = Agent(
edge=getstream.Edge(), # low latency edge. clients for React, iOS, Android, RN, Flutter etc.
agent_user=User(
name="My happy AI friend", id="agent"
),
llm=llm,
tts=elevenlabs.TTS(),
stt=deepgram.STT(),
)
return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
# ensure the agent user is created
await agent.create_user()
# Create a call
call = await agent.create_call(call_type, call_id)

@agent.events.subscribe
async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
if event.participant.user.id != "agent":
await asyncio.sleep(2)
await agent.simple_response("Describe what you currently see")

# Have the agent join the call/room
with await agent.join(call):
# Open the demo UI
await agent.edge.open_demo(call)
# run till the call ends
await agent.finish()


if __name__ == "__main__":
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
22 changes: 22 additions & 0 deletions plugins/moondream/example/pyproject.toml
@@ -0,0 +1,22 @@
[project]
name = "moondream-example"
version = "0.1.0"
description = "Example using Moondream Detect and VLM with Vision Agents"
requires-python = ">=3.10"
dependencies = [
"vision-agents",
"vision-agents-plugins-moondream",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-elevenlabs",
"vision-agents-plugins-vogent",
"python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-moondream = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-vogent = { workspace = true }
8 changes: 4 additions & 4 deletions plugins/moondream/tests/test_moondream_local.py
@@ -41,7 +41,7 @@ def golf_image(self, assets_dir) -> Iterator[Image.Image]:
@pytest.fixture
def moondream_processor(self) -> Iterator[LocalDetectionProcessor]:
"""Create and manage MoondreamLocalProcessor lifecycle."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
yield processor
finally:
@@ -261,7 +261,7 @@ def is_available():
processor.close()

# Also test explicit MPS parameter
processor2 = LocalDetectionProcessor(device="mps")
processor2 = LocalDetectionProcessor(force_cpu=True)
try:
# Verify explicit MPS is also converted to CPU
assert processor2.device == "cpu"
@@ -270,7 +270,7 @@

def test_device_explicit_cpu(self):
"""Test explicit CPU device selection."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
assert processor.device == "cpu"
finally:
@@ -282,7 +282,7 @@ def test_device_explicit_cpu(self):
)
def test_device_explicit_cuda(self):
"""Test explicit CUDA device selection (only if CUDA available)."""
processor = LocalDetectionProcessor(device="cuda")
processor = LocalDetectionProcessor()
try:
assert processor.device == "cuda"
finally: