8 changes: 8 additions & 0 deletions agents-core/vision_agents/core/agents/agent_launcher.py
@@ -92,6 +92,14 @@ async def warmup(self, **kwargs) -> None:
if agent.turn_detection and hasattr(agent.turn_detection, 'warmup'):
logger.debug("Warming up turn detection: %s", agent.turn_detection.__class__.__name__)
warmup_tasks.append(agent.turn_detection.warmup())

# Warmup processors
if agent.processors:
logger.debug("Warming up processors")
for processor in agent.processors:
if hasattr(processor, 'warmup'):
logger.debug("Warming up processor: %s", processor.__class__.__name__)
warmup_tasks.append(processor.warmup())

# Run all warmups in parallel
if warmup_tasks:
5 changes: 2 additions & 3 deletions agents-core/vision_agents/core/agents/agents.py
@@ -5,12 +5,11 @@
import time
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
from uuid import uuid4

import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.async_call import Call
from getstream.video.rtc import Call

from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
@@ -697,7 +696,7 @@ async def create_user(self) -> None:
async def create_call(self, call_type: str, call_id: str) -> Call:
"""Shortcut for creating a call/room etc."""
call = self.edge.client.video.call(call_type, call_id)
response = await call.get_or_create(data={"created_by_id": self.agent_user.id})
await call.get_or_create(data={"created_by_id": self.agent_user.id})

return call

160 changes: 151 additions & 9 deletions plugins/moondream/README.md
@@ -1,27 +1,48 @@
# Moondream Plugin

This plugin provides Moondream 3 detection capabilities for vision-agents, enabling real-time zero-shot object detection on video streams. Choose between cloud-hosted or local processing depending on your needs.
This plugin provides Moondream 3 vision capabilities for vision-agents, including:
- **Object Detection**: Real-time zero-shot object detection on video streams
- **Visual Question Answering (VQA)**: Answer questions about video frames
- **Image Captioning**: Generate descriptions of video frames

Choose between cloud-hosted or local processing depending on your needs. When running locally, we recommend doing so on CUDA-enabled devices.

## Installation

```bash
uv add vision-agents-plugins-moondream
uv add vision-agents[moondream]
```

## Choosing the Right Processor
## Choosing the Right Component

### Detection Processors

### CloudDetectionProcessor (Recommended for Most Users)
#### CloudDetectionProcessor (Recommended for Most Users)
- **Use when:** You want a simple setup with no infrastructure management
- **Pros:** No model download, no GPU required, automatic updates
- **Cons:** Requires API key, 2 RPS rate limit by default (can be increased)
- **Best for:** Development, testing, low-to-medium volume applications

### LocalDetectionProcessor (For Advanced Users)
#### LocalDetectionProcessor (For Advanced Users)
- **Use when:** You need higher throughput, have your own GPU infrastructure, or want to avoid rate limits
- **Pros:** No rate limits, no API costs, full control over hardware
- **Cons:** Requires GPU for best performance, model download on first use, infrastructure management
- **Best for:** Production deployments, high-volume applications, Digital Ocean Gradient AI GPUs, or custom infrastructure

### Vision Language Models (VLM)

#### CloudVLM (Recommended for Most Users)
- **Use when:** You want visual question answering or captioning without managing infrastructure
- **Pros:** No model download, no GPU required, automatic updates
- **Cons:** Requires API key, rate limits apply
- **Best for:** Development, testing, applications requiring VQA or captioning

#### LocalVLM (For Advanced Users)
- **Use when:** You need VQA or captioning with higher throughput or want to avoid rate limits
- **Pros:** No rate limits, no API costs, full control over hardware
- **Cons:** Requires GPU for best performance, model download on first use, infrastructure management
- **Best for:** Production deployments, high-volume applications, or custom infrastructure

## Quick Start

### Using CloudDetectionProcessor (Hosted)
@@ -64,7 +85,7 @@ from vision_agents.core import Agent
processor = moondream.LocalDetectionProcessor(
detect_objects=["person", "car", "dog"],
conf_threshold=0.3,
device="cuda", # Auto-detects CUDA, MPS, or CPU
force_cpu=False, # Auto-detects CUDA, MPS, or CPU
fps=30
)

@@ -87,6 +108,107 @@ processor = moondream.CloudDetectionProcessor(
)
```

## Vision Language Model (VLM) Quick Start

### Using CloudVLM (Hosted)

The `CloudVLM` uses Moondream's hosted API for visual question answering and captioning. It automatically processes video frames and responds to questions asked via STT (Speech-to-Text).

```python
import asyncio
import os
from dotenv import load_dotenv
from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream
from vision_agents.core.events import CallSessionParticipantJoinedEvent

load_dotenv()

async def create_agent(**kwargs) -> Agent:
# Create a cloud VLM for visual question answering
llm = moondream.CloudVLM(
api_key=os.getenv("MOONDREAM_API_KEY"), # or set MOONDREAM_API_KEY env var
mode="vqa", # or "caption" for image captioning
)

agent = Agent(
edge=getstream.Edge(),
agent_user=User(name="My happy AI friend", id="agent"),
llm=llm,
tts=elevenlabs.TTS(),
stt=deepgram.STT(),
)
return agent

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
await agent.create_user()
call = await agent.create_call(call_type, call_id)

@agent.events.subscribe
async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
if event.participant.user.id != "agent":
await asyncio.sleep(2)
# Ask the agent to describe what it sees
await agent.simple_response("Describe what you currently see")

with await agent.join(call):
await agent.edge.open_demo(call)
await agent.finish()

if __name__ == "__main__":
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
```

### Using LocalVLM (On-Device)

The `LocalVLM` downloads the model from HuggingFace and runs on device. It supports both VQA and captioning modes.

**Note:** The moondream3-preview model is gated and requires HuggingFace authentication:
- Request access at https://huggingface.co/moondream/moondream3-preview
- Set `HF_TOKEN` environment variable: `export HF_TOKEN=your_token_here`
- Or run: `huggingface-cli login`
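
If you prefer to authenticate from Python instead of the CLI, here is a minimal sketch using `huggingface_hub` (pulled in by `transformers`); it assumes `HF_TOKEN` is already set in your environment:

```python
import os
from huggingface_hub import login

# Log in to HuggingFace so the gated moondream3-preview weights can be downloaded
login(token=os.environ["HF_TOKEN"])
```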

```python
from vision_agents.plugins import moondream
from vision_agents.core import Agent

# Create a local VLM (no API key needed)
llm = moondream.LocalVLM(
mode="vqa", # or "caption" for image captioning
force_cpu=False, # Auto-detects CUDA, MPS, or CPU
)

# Use in an agent
agent = Agent(
llm=llm,
tts=your_tts,
stt=your_stt,
# ... other components
)
```

### VLM Modes

The VLM supports two modes:

- **`"vqa"`** (Visual Question Answering): Answers questions about video frames. Questions come from STT transcripts.
- **`"caption"`** (Image Captioning): Generates descriptions of video frames automatically.

```python
# VQA mode - answers questions about frames
llm = moondream.CloudVLM(
api_key="your-api-key",
mode="vqa"
)

# Caption mode - generates automatic descriptions
llm = moondream.CloudVLM(
api_key="your-api-key",
mode="caption"
)
```

## Configuration

### CloudDetectionProcessor Parameters
@@ -107,12 +229,30 @@ processor = moondream.CloudDetectionProcessor(
- `fps`: int - Frame processing rate (default: 30)
- `interval`: int - Processing interval in seconds (default: 0)
- `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10)
- `device`: str - Device to run inference on ('cuda', 'mps', or 'cpu'). Auto-detects CUDA, then MPS (Apple Silicon), then defaults to CPU. Default: `None` (auto-detect)
- `force_cpu`: bool - If True, forces CPU even when CUDA/MPS is available; otherwise the device is auto-detected (CUDA, then MPS on Apple Silicon, then CPU). We recommend CUDA for best performance. (default: False)
- `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview")
- `options`: AgentOptions - Model directory configuration. If not provided, uses the default, which stores models under `tempfile.gettempdir()`

**Performance:** Performance varies with your hardware configuration. CUDA is recommended for best results on NVIDIA GPUs. The model is downloaded from HuggingFace on first use.
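
A configuration sketch for `LocalDetectionProcessor` combining the parameters above with the `detect_objects` and `conf_threshold` arguments from the Quick Start; the values are illustrative, not recommendations:

```python
from vision_agents.plugins import moondream

processor = moondream.LocalDetectionProcessor(
    detect_objects=["person", "car", "dog"],    # zero-shot labels to detect
    conf_threshold=0.3,                         # discard low-confidence detections
    fps=30,                                     # frame processing rate
    interval=0,                                 # processing interval in seconds
    max_workers=10,                             # thread pool for CPU-intensive work
    force_cpu=False,                            # auto-detect CUDA/MPS; True pins to CPU
    model_name="moondream/moondream3-preview",  # gated HuggingFace model
)
```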

### CloudVLM Parameters

- `api_key`: str - API key for Moondream Cloud API. If not provided, will attempt to read from `MOONDREAM_API_KEY` environment variable.
- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa")
- `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10)

**Rate Limits:** By default, the Moondream Cloud API has rate limits. Contact the Moondream team to request higher limits.
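
A short sketch of a fully specified `CloudVLM`; the API key falls back to the `MOONDREAM_API_KEY` environment variable when omitted, and the `max_workers` value here is arbitrary:

```python
import os
from vision_agents.plugins import moondream

vlm = moondream.CloudVLM(
    api_key=os.getenv("MOONDREAM_API_KEY"),  # optional; read from env when omitted
    mode="caption",                          # or "vqa" (default)
    max_workers=4,                           # thread pool for CPU-intensive work
)
```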

### LocalVLM Parameters

- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa")
- `max_workers`: int - Thread pool size for async operations (default: 10)
- `force_cpu`: bool - If True, forces CPU even when CUDA/MPS is available; otherwise the device is auto-detected (CUDA, then MPS on Apple Silicon, then CPU). Note: MPS is automatically converted to CPU due to model compatibility. We recommend CUDA for best performance. (default: False)
- `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview")
- `options`: AgentOptions - Model directory configuration. If not provided, uses `default_agent_options()`

**Performance:** Performance varies with your hardware configuration. CUDA is recommended for best results on NVIDIA GPUs. The model is downloaded from HuggingFace on first use.
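
And the equivalent sketch for `LocalVLM`, leaving `options` unset so the default model directory from `default_agent_options()` is used:

```python
from vision_agents.plugins import moondream

llm = moondream.LocalVLM(
    mode="vqa",                                 # or "caption"
    max_workers=10,                             # thread pool for async operations
    force_cpu=False,                            # auto-detect CUDA; MPS falls back to CPU
    model_name="moondream/moondream3-preview",  # gated HuggingFace model
)
```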

## Video Publishing

The processor publishes annotated video frames with bounding boxes drawn on detected objects:
@@ -146,16 +286,18 @@ pytest plugins/moondream/tests/ -k "annotation" -v

### Required
- `vision-agents` - Core framework
- `moondream` - Moondream SDK for cloud API (CloudDetectionProcessor only)
- `moondream` - Moondream SDK for cloud API (CloudDetectionProcessor and CloudVLM)
- `numpy>=2.0.0` - Array operations
- `pillow>=10.0.0` - Image processing
- `opencv-python>=4.8.0` - Video annotation
- `aiortc` - WebRTC support

### LocalDetectionProcessor Additional Dependencies
### Local Components Additional Dependencies
- `torch` - PyTorch for model inference
- `transformers` - HuggingFace transformers library for model loading

**Note:** Both `LocalDetectionProcessor` and `LocalVLM` require these dependencies. We recommend running the models locally only on CUDA devices.

## Links

- [Moondream Documentation](https://docs.moondream.ai/)
2 changes: 2 additions & 0 deletions plugins/moondream/example/README.md
@@ -0,0 +1,2 @@
## Moondream example
Please see the root README for details.
Empty file.
53 changes: 53 additions & 0 deletions plugins/moondream/example/moondream_vlm_example.py
@@ -0,0 +1,53 @@
import asyncio
import logging
from dotenv import load_dotenv

from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream
from vision_agents.core.events import CallSessionParticipantJoinedEvent
import os

logger = logging.getLogger(__name__)

load_dotenv()

async def create_agent(**kwargs) -> Agent:
llm = moondream.CloudVLM(
api_key=os.getenv("MOONDREAM_API_KEY"),
)
# create an agent to run with Stream's edge and the Moondream Cloud VLM
agent = Agent(
edge=getstream.Edge(), # low latency edge. clients for React, iOS, Android, RN, Flutter etc.
agent_user=User(
name="My happy AI friend", id="agent"
),
llm=llm,
tts=elevenlabs.TTS(),
stt=deepgram.STT(),
)
return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
# ensure the agent user is created
await agent.create_user()
# Create a call
call = await agent.create_call(call_type, call_id)

@agent.events.subscribe
async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
if event.participant.user.id != "agent":
await asyncio.sleep(2)
await agent.simple_response("Describe what you currently see")

# Have the agent join the call/room
with await agent.join(call):
# Open the demo UI
await agent.edge.open_demo(call)
# run till the call ends
await agent.finish()


if __name__ == "__main__":
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
22 changes: 22 additions & 0 deletions plugins/moondream/example/pyproject.toml
@@ -0,0 +1,22 @@
[project]
name = "moondream-example"
version = "0.1.0"
description = "Example using Moondream Detect and VLM with Vision Agents"
requires-python = ">=3.10"
dependencies = [
"vision-agents",
"vision-agents-plugins-moondream",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-elevenlabs",
"vision-agents-plugins-vogent",
"python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-moondream = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-vogent = { workspace = true }
8 changes: 4 additions & 4 deletions plugins/moondream/tests/test_moondream_local.py
@@ -41,7 +41,7 @@ def golf_image(self, assets_dir) -> Iterator[Image.Image]:
@pytest.fixture
def moondream_processor(self) -> Iterator[LocalDetectionProcessor]:
"""Create and manage MoondreamLocalProcessor lifecycle."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
yield processor
finally:
@@ -261,7 +261,7 @@ def is_available():
processor.close()

# Also test explicit MPS parameter
processor2 = LocalDetectionProcessor(device="mps")
processor2 = LocalDetectionProcessor(force_cpu=True)
try:
# Verify explicit MPS is also converted to CPU
assert processor2.device == "cpu"
@@ -270,7 +270,7 @@

def test_device_explicit_cpu(self):
"""Test explicit CPU device selection."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
assert processor.device == "cpu"
finally:
@@ -282,7 +282,7 @@ def test_device_explicit_cpu(self):
)
def test_device_explicit_cuda(self):
"""Test explicit CUDA device selection (only if CUDA available)."""
processor = LocalDetectionProcessor(device="cuda")
processor = LocalDetectionProcessor()
try:
assert processor.device == "cuda"
finally: