Commit e12112d

wip
1 parent d2853cd commit e12112d

File tree: 8 files changed (+815, -12 lines)

agents-core/vision_agents/core/llm/llm.py

Lines changed: 2 additions & 1 deletion
@@ -23,9 +23,10 @@
 
 
 class LLMResponseEvent(Generic[T]):
-    def __init__(self, original: T, text: str):
+    def __init__(self, original: T, text: str, exception: Optional[Exception] = None):
         self.original = original
         self.text = text
+        self.exception = exception
 
 
 BeforeCb = Callable[[List[Any]], None]
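
With this change, `LLMResponseEvent` can carry an `exception` alongside `text`, which is what the new `assert_response_successful` helper in the Bedrock tests checks for. A minimal caller-side sketch, assuming `simple_response` returns the event with `exception` set instead of raising (the `ask` helper and the logging are illustrative, not part of this commit):

```python
import logging
from typing import Optional

from vision_agents.plugins import bedrock

logger = logging.getLogger(__name__)


async def ask(llm: bedrock.LLM, prompt: str) -> Optional[str]:
    # Illustrative sketch: with this commit a failed call can surface as
    # LLMResponseEvent.exception rather than (or in addition to) a raise.
    response = await llm.simple_response(prompt)
    if response.exception is not None:
        logger.error("Bedrock call failed: %s", response.exception)
        return None
    return response.text
```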

conftest.py

Lines changed: 11 additions & 0 deletions
@@ -77,6 +77,17 @@ def mia_audio_16khz():
     return pcm
 
 
+@pytest.fixture
+def golf_swing_image():
+    """Load golf_swing.png image and return as bytes."""
+    image_file_path = os.path.join(get_assets_dir(), "golf_swing.png")
+
+    with open(image_file_path, "rb") as f:
+        image_bytes = f.read()
+
+    return image_bytes
+
+
 @pytest.fixture
 async def bunny_video_track():
     """Create RealVideoTrack from video file."""

plugins/bedrock/README.md

Lines changed: 43 additions & 1 deletion
@@ -1,6 +1,6 @@
 # AWS Bedrock Plugin for Vision Agents
 
-AWS Bedrock LLM integration for Vision Agents framework.
+AWS Bedrock LLM integration for the Vision Agents framework, with support for both standard and realtime interactions.
 
 ## Installation
 
@@ -10,6 +10,8 @@ pip install vision-agents-plugins-bedrock
 
 ## Usage
 
+### Standard LLM Usage
+
 ```python
 from vision_agents.plugins import bedrock
 
@@ -24,6 +26,35 @@ response = await llm.simple_response("Hello, how are you?")
 print(response.text)
 ```
 
+### Realtime Audio/Video Usage
+
+```python
+from vision_agents.plugins import bedrock
+
+# Initialize Bedrock Realtime with Nova Sonic for speech-to-speech
+realtime = bedrock.Realtime(
+    model="us.amazon.nova-sonic-v1:0",
+    region_name="us-east-1",
+    sample_rate=16000
+)
+
+# Connect to the session
+await realtime.connect()
+
+# Send text message
+await realtime.simple_response("Describe what you see")
+
+# Send audio
+pcm_data = PcmData(...)  # Your audio data
+await realtime.simple_audio_response(pcm_data)
+
+# Watch video track
+await realtime._watch_video_track(video_track)
+
+# Close when done
+await realtime.close()
+```
+
 ## Configuration
 
 The plugin uses boto3 for AWS authentication. You can configure credentials using:
@@ -33,11 +64,22 @@ The plugin uses boto3 for AWS authentication. You can configure credentials using:
 
 ## Supported Models
 
+### Standard Models (LLM class)
 All AWS Bedrock models are supported, including:
 - Claude 3.5 models (anthropic.claude-*)
 - Amazon Titan models (amazon.titan-*)
 - Meta Llama models (meta.llama-*)
 - And more
 
+### Realtime Models (Realtime class)
+Realtime audio/video models optimized for speech-to-speech:
+- **Amazon Nova Sonic (us.amazon.nova-sonic-v1:0)** - Primary model for realtime interactions with ultra-low latency
+- Amazon Nova Lite (us.amazon.nova-lite-v1:0)
+- Amazon Nova Micro (us.amazon.nova-micro-v1:0)
+- Amazon Nova Pro (us.amazon.nova-pro-v1:0)
+- And other Nova models
+
+**Note:** Nova Sonic is specifically designed for realtime speech-to-speech conversations and is the recommended default for the Realtime class.
+
 See [AWS Bedrock documentation](https://docs.aws.amazon.com/bedrock/) for available models.
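
The credential options referenced under "## Configuration" fall outside the diff hunks above, so they are not shown here. For orientation, a minimal sketch of one common setup, assuming boto3's standard environment-variable credential chain (the variable values and the Claude model ID are placeholders, not taken from this commit):

```python
# Sketch: boto3's standard environment-variable credential chain.
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_DEFAULT_REGION are the usual
# boto3 variables; the values and the model ID below are placeholders.
import os

from vision_agents.plugins import bedrock

os.environ.setdefault("AWS_ACCESS_KEY_ID", "<your-access-key-id>")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "<your-secret-access-key>")
os.environ.setdefault("AWS_DEFAULT_REGION", "us-east-1")

llm = bedrock.LLM(
    model="anthropic.claude-3-5-sonnet-20241022-v2:0",
    region_name="us-east-1",
)
```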

plugins/bedrock/tests/test_bedrock.py

Lines changed: 82 additions & 5 deletions
@@ -1,25 +1,52 @@
 """Tests for AWS Bedrock plugin."""
+import os
+
 import pytest
 from dotenv import load_dotenv
 
-from plugins.bedrock.vision_agents.plugins.bedrock.bedrock_llm import BedrockLLM
 from vision_agents.core.agents.conversation import InMemoryConversation
 from vision_agents.core.agents.conversation import Message
 from vision_agents.core.llm.events import LLMResponseChunkEvent
+from vision_agents.core.utils.utils import Instructions
+from vision_agents.plugins.bedrock.bedrock_llm import BedrockLLM
 
 load_dotenv()
 
+"""
+TODO:
+- Cleanup how we do llm.parsed_instructions
+- Remove duplication between streaming and non streaming
+"""
+
 
 class TestBedrockLLM:
     """Test suite for BedrockLLM class with real API calls."""
 
+    def assert_response_successful(self, response):
+        """
+        Utility method to verify a response is successful.
+
+        A successful response has:
+        - response.text is set (not None and not empty)
+        - response.exception is None
+
+        Args:
+            response: LLMResponseEvent to check
+        """
+        assert response.text is not None, "Response text should not be None"
+        assert len(response.text) > 0, "Response text should not be empty"
+        assert not hasattr(response, 'exception') or response.exception is None, f"Response should not have an exception, got: {getattr(response, 'exception', None)}"
+
     @pytest.fixture
     async def llm(self) -> BedrockLLM:
         """Test BedrockLLM initialization with a provided client."""
         llm = BedrockLLM(
-            model="anthropic.claude-3-5-sonnet-20241022-v2:0",
+            model="qwen.qwen3-32b-v1:0",
            region_name="us-east-1"
         )
+        if not os.environ.get("AWS_BEARER_TOKEN_BEDROCK"):
+            raise Exception("Please set AWS_BEARER_TOKEN_BEDROCK")
+
         llm._conversation = InMemoryConversation("be friendly", [])
         return llm
 
@@ -45,16 +72,15 @@ async def test_simple(self, llm: BedrockLLM):
         response = await llm.simple_response(
             "Explain quantum computing in 1 paragraph",
         )
-        assert response.text
+        self.assert_response_successful(response)
 
     @pytest.mark.integration
     async def test_native_api(self, llm: BedrockLLM):
         response = await llm.converse(
             messages=[{"role": "user", "content": [{"text": "say hi"}]}],
         )
 
-        # Assertions
-        assert response.text
+        self.assert_response_successful(response)
 
     @pytest.mark.integration
     async def test_stream(self, llm: BedrockLLM):
@@ -96,3 +122,54 @@ async def test_native_memory(self, llm: BedrockLLM):
         )
         assert "8" in response.text or "eight" in response.text
 
+    @pytest.mark.integration
+    async def test_image_description(self, golf_swing_image):
+        # Use a vision-capable model (Claude 3 Haiku supports images and is widely available)
+        vision_llm = BedrockLLM(
+            model="anthropic.claude-3-haiku-20240307-v1:0",
+            region_name="us-east-1"
+        )
+
+        image_bytes = golf_swing_image
+        response = await vision_llm.converse(
+            messages=[{
+                "role": "user",
+                "content": [
+                    {
+                        "image": {
+                            "format": "png",
+                            "source": {
+                                "bytes": image_bytes
+                            }
+                        }
+                    },
+                    {
+                        "text": "What sport do you see in this image?"
+                    }
+                ]
+            }]
+        )
+
+        self.assert_response_successful(response)
+        assert "golf" in response.text.lower()
+
+    @pytest.mark.integration
+    async def test_instruction_following(self, llm: BedrockLLM):
+        llm = BedrockLLM(
+            model="qwen.qwen3-32b-v1:0",
+            region_name="us-east-1",
+        )
+        llm.parsed_instructions = Instructions(
+            input_text="only reply in 2 letter country shortcuts",
+            markdown_contents={}
+        )
+
+        response = await llm.simple_response(
+            text="Which country is rainy, protected from water with dikes and below sea level?",
+        )
+
+        self.assert_response_successful(response)
+        assert "nl" in response.text.lower()
+
+

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+"""Tests for AWS Bedrock Realtime plugin."""
+import asyncio
+import pytest
+from dotenv import load_dotenv
+
+from vision_agents.plugins.bedrock import Realtime
+from vision_agents.core.llm.events import RealtimeAudioOutputEvent
+
+# Load environment variables
+load_dotenv()
+
+
+class TestBedrockRealtime:
+    """Integration tests for Bedrock Realtime connect flow"""
+
+    @pytest.fixture
+    async def realtime(self):
+        """Create and manage Realtime connection lifecycle"""
+        realtime = Realtime(
+            model="us.amazon.nova-sonic-v1:0",
+            region_name="us-east-1",
+        )
+        try:
+            yield realtime
+        finally:
+            await realtime.close()
+
+    @pytest.mark.integration
+    async def test_simple_response_flow(self, realtime):
+        """Test sending a simple text message and receiving response"""
+        # Send a simple message
+        events = []
+
+        @realtime.events.subscribe
+        async def on_audio(event: RealtimeAudioOutputEvent):
+            events.append(event)
+
+        await asyncio.sleep(0.01)
+        await realtime.connect()
+        await realtime.simple_response("Hello, can you hear me? Please respond with a short greeting.")
+
+        # Wait for response
+        await asyncio.sleep(5.0)
+
+        # Note: Depending on model capabilities, audio events may or may not be generated
+        # The test passes if no exceptions are raised
+        assert True
+
+    @pytest.mark.integration
+    async def test_audio_sending_flow(self, realtime, mia_audio_16khz):
+        """Test sending real audio data and verify connection remains stable"""
+        events = []
+
+        @realtime.events.subscribe
+        async def on_audio(event: RealtimeAudioOutputEvent):
+            events.append(event)
+
+        await asyncio.sleep(0.01)
+        await realtime.connect()
+
+        await realtime.simple_response("Listen to the following story, what is Mia looking for?")
+        await asyncio.sleep(10.0)
+        await realtime.simple_audio_response(mia_audio_16khz)
+
+        # Wait a moment to ensure processing
+        await asyncio.sleep(10.0)
+
+        # Test passes if no exceptions are raised
+        assert True
+
+    @pytest.mark.integration
+    async def test_video_sending_flow(self, realtime, bunny_video_track):
+        """Test sending real video data and verify connection remains stable"""
+        events = []
+
+        @realtime.events.subscribe
+        async def on_audio(event: RealtimeAudioOutputEvent):
+            events.append(event)
+
+        await asyncio.sleep(0.01)
+        await realtime.connect()
+        await realtime.simple_response("Describe what you see in this video please")
+        await asyncio.sleep(5.0)
+
+        # Start video sender with low FPS to avoid overwhelming the connection
+        await realtime._watch_video_track(bunny_video_track)
+
+        # Let it run for a few seconds
+        await asyncio.sleep(10.0)
+
+        # Stop video sender
+        await realtime._stop_watching_video_track()
+
+        # Test passes if no exceptions are raised
+        assert True
+
+    @pytest.mark.integration
+    async def test_connection_lifecycle(self, realtime):
+        """Test that connection can be established and closed properly"""
+        # Connect
+        await realtime.connect()
+        assert realtime._connected is True
+
+        # Send a simple message
+        await realtime.simple_response("Test message")
+        await asyncio.sleep(2.0)
+
+        # Close
+        await realtime.close()
+        assert realtime._connected is False
+

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from .bedrock_llm import BedrockLLM as LLM
+from .bedrock_realtime import Realtime
 
-__all__ = ["LLM"]
+__all__ = ["LLM", "Realtime"]
 
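
With the new export, both classes can be imported from the plugin package, matching the imports used in the tests above. A minimal sketch (model IDs taken from the tests in this commit; otherwise illustrative):

```python
# LLM is the alias for BedrockLLM; Realtime is the export added by this commit.
from vision_agents.plugins.bedrock import LLM, Realtime

llm = LLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1")
realtime = Realtime(model="us.amazon.nova-sonic-v1:0", region_name="us-east-1")
```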
