refact openai v2v
TomasBack2Future committed Nov 25, 2024
1 parent 45049bd commit c19ec72
Showing 19 changed files with 569 additions and 779 deletions.
4 changes: 2 additions & 2 deletions agents/examples/demo/property.json
@@ -518,7 +518,7 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10
"max_history": 10
}
},
{
@@ -848,7 +848,7 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10
"max_history": 10
}
},
{
10 changes: 5 additions & 5 deletions agents/examples/experimental/property.json
@@ -233,11 +233,11 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10,
"max_history": 10,
"vendor": "azure",
"base_uri": "${env:AZURE_OPENAI_REALTIME_BASE_URI}",
"path": "/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview",
"system_message": ""
"prompt": ""
}
},
{
@@ -2444,7 +2444,7 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10
"max_history": 10
}
},
{
@@ -2566,7 +2566,7 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10
"max_history": 10
}
},
{
@@ -2724,7 +2724,7 @@
"language": "en-US",
"server_vad": true,
"dump": true,
"history": 10,
"max_history": 10,
"enable_storage": true
}
},
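Both property.json graphs now use max_history instead of history and prompt instead of system_message. As a minimal sketch of how a config dataclass on the extension side would pick up the renamed keys, assuming a plain dataclass loader (the real extensions use ten_ai_base's BaseConfig; V2VConfig and load_config below are hypothetical names for illustration):

```python
from dataclasses import dataclass, fields

@dataclass
class V2VConfig:
    # Field names follow the renamed property.json keys in this commit.
    language: str = "en-US"
    server_vad: bool = True
    dump: bool = False
    max_history: int = 10
    prompt: str = ""

def load_config(props: dict) -> V2VConfig:
    # Keep only keys that match declared fields; unknown keys are ignored.
    known = {f.name for f in fields(V2VConfig)}
    return V2VConfig(**{k: v for k, v in props.items() if k in known})

cfg = load_config({"language": "en-US", "server_vad": True, "max_history": 10, "prompt": ""})
print(cfg.max_history, cfg.prompt)
```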
@@ -14,11 +14,6 @@
from fastapi import Depends, FastAPI, HTTPException, Request
import asyncio

- # Enable Pydantic debug mode
- from pydantic import BaseConfig
-
- BaseConfig.debug = True
-
# Set up logging
logging.config.dictConfig({
"version": 1,
208 changes: 142 additions & 66 deletions agents/ten_packages/extension/glue_python_async/extension.py
@@ -7,10 +7,12 @@
import traceback
import aiohttp
import json
import time
import re

from datetime import datetime
import numpy as np
from typing import List, Any, AsyncGenerator
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
from pydantic import BaseModel

from ten import (
@@ -23,7 +25,7 @@
Data,
)

- from ten_ai_base import BaseConfig, ChatMemory
+ from ten_ai_base import BaseConfig, ChatMemory, LLMUsage, LLMCompletionTokensDetails, LLMPromptTokensDetails, EVENT_MEMORY_APPENDED
from ten_ai_base.llm import AsyncLLMBaseExtension, LLMCallCompletionArgs, LLMDataCompletionArgs, LLMToolMetadata
from ten_ai_base.types import LLMChatCompletionUserMessageParam, LLMToolResult

@@ -84,27 +86,9 @@ class Choice(BaseModel):
index: int
finish_reason: str | None

- class CompletionTokensDetails(BaseModel):
- accepted_prediction_tokens: int = 0
- audio_tokens: int = 0
- reasoning_tokens: int = 0
- rejected_prediction_tokens: int = 0
-
- class PromptTokensDetails(BaseModel):
- audio_tokens: int = 0
- cached_tokens: int = 0
-
- class Usage(BaseModel):
- completion_tokens: int = 0
- prompt_tokens: int = 0
- total_tokens: int = 0
-
- completion_tokens_details: CompletionTokensDetails | None = None
- prompt_tokens_details: PromptTokensDetails | None = None
-
class ResponseChunk(BaseModel):
choices: List[Choice]
- usage: Usage | None = None
+ usage: LLMUsage | None = None

@dataclass
class GlueConfig(BaseConfig):
@@ -113,17 +97,29 @@ class GlueConfig(BaseConfig):
prompt: str = ""
max_history: int = 10
greeting: str = ""
failure_info: str = ""
modalities: List[str] = field(default_factory=lambda: ["text"])
rtm_enabled: bool = True
ssml_enabled: bool = False
context_enabled: bool = False
extra_context: dict = field(default_factory=dict)
enable_storage: bool = False

class AsyncGlueExtension(AsyncLLMBaseExtension):
config : GlueConfig = None
sentence_fragment: str = ""
ten_env: AsyncTenEnv = None
loop: asyncio.AbstractEventLoop = None
stopped: bool = False
memory: ChatMemory = None
- total_usage: Usage = Usage()
+ total_usage: LLMUsage = LLMUsage()
users_count = 0

completion_times = []
connect_times = []
first_token_times = []

remote_stream_id: int = 999 # TODO

async def on_init(self, ten_env: AsyncTenEnv) -> None:
await super().on_init(ten_env)
ten_env.log_debug("on_init")
Expand All @@ -138,6 +134,7 @@ async def on_start(self, ten_env: AsyncTenEnv) -> None:
ten_env.log_info(f"config: {self.config}")

self.memory = ChatMemory(self.config.max_history)
self.memory.on(EVENT_MEMORY_APPENDED, self._on_memory_appended)

self.ten_env = ten_env
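on_start now also subscribes _on_memory_appended to the memory's EVENT_MEMORY_APPENDED event. ChatMemory's emitter API is not shown in this diff beyond the on(...) call, so the following is only a generic, standalone sketch of the pattern assumed here: register an async callback for a named event and schedule it when the event fires.

```python
import asyncio

EVENT_MEMORY_APPENDED = "memory_appended"

class TinyEmitter:
    # Hypothetical stand-in for ChatMemory's event hook, illustration only.
    def __init__(self):
        self._handlers = {}

    def on(self, event: str, handler):
        self._handlers.setdefault(event, []).append(handler)

    def emit(self, event: str, payload: dict):
        # Schedule each async handler without blocking the caller.
        for handler in self._handlers.get(event, []):
            asyncio.create_task(handler(payload))

async def on_appended(message: dict):
    print("appended:", message)

async def main():
    emitter = TinyEmitter()
    emitter.on(EVENT_MEMORY_APPENDED, on_appended)
    emitter.emit(EVENT_MEMORY_APPENDED, {"role": "user", "content": "hi"})
    await asyncio.sleep(0)  # let the scheduled handler run

asyncio.run(main())
```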

@@ -187,7 +184,21 @@ async def on_data_chat_completion(self, ten_env: AsyncTenEnv, **kargs: LLMDataCo
messages = []
if self.config.prompt:
messages.append({"role": "system", "content": self.config.prompt})
- messages.extend(self.memory.get())

history = self.memory.get()
while history:
if history[0].get("role") == "tool":
history = history[1:]
continue
if history[0].get("role") == "assistant" and history[0].get("tool_calls"):
history = history[1:]
continue

# Skip the first tool role
break

messages.extend(history)

if not input:
ten_env.log_warn("No message in data")
else:
@@ -220,6 +231,10 @@ def tool_dict(tool: LLMToolMetadata):
json["function"]["parameters"]["required"].append(param.name)

return json

def trim_xml(input_string):
return re.sub(r'<[^>]+>', '', input_string).strip()

tools = []
for tool in self.available_tools:
tools.append(tool_dict(tool))
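The new trim_xml helper strips any XML/SSML tags from a chunk before it is forwarded as spoken text (used below when ssml_enabled responses begin with <speak>). A standalone sketch of the same regex:

```python
import re

def trim_xml(input_string: str) -> str:
    # Remove anything that looks like an XML/SSML tag, then trim outer whitespace.
    return re.sub(r'<[^>]+>', '', input_string).strip()

print(trim_xml("<speak>Hello <break time='200ms'/> world</speak>"))
# -> "Hello  world" (only the tags are removed; inner whitespace is preserved)
```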
@@ -229,16 +244,25 @@ def tool_dict(tool: LLMToolMetadata):
calls = {}

sentences = []
start_time = time.time()
first_token_time = None
response = self._stream_chat(messages=messages, tools=tools)
async for message in response:
self.ten_env.log_info(f"content: {message}")
self.ten_env.log_debug(f"content: {message}")
# TODO: handle tool call
try:
c = ResponseChunk(**message)
if c.choices:
if c.choices[0].delta.content:
- total_output += c.choices[0].delta.content
- sentences, sentence_fragment = parse_sentences(sentence_fragment, c.choices[0].delta.content)
if first_token_time is None:
first_token_time = time.time()
self.first_token_times.append(first_token_time - start_time)

content = c.choices[0].delta.content
if self.config.ssml_enabled and content.startswith("<speak>"):
content = trim_xml(content)
total_output += content
sentences, sentence_fragment = parse_sentences(sentence_fragment, content)
for s in sentences:
await self._send_text(s)
if c.choices[0].delta.tool_calls:
@@ -252,10 +276,14 @@ def tool_dict(tool: LLMToolMetadata):
calls[call.index].function.arguments += call.function.arguments
if c.usage:
self.ten_env.log_info(f"usage: {c.usage}")
- self._update_usage(c.usage)
+ await self._update_usage(c.usage)
except Exception as e:
self.ten_env.log_error(f"Failed to parse response: {message} {e}")
traceback.print_exc()
if sentence_fragment:
await self._send_text(sentence_fragment)
end_time = time.time()
self.completion_times.append(end_time - start_time)

if total_output:
self.memory.put({"role": "assistant", "content": total_output})
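The streaming loop above now records first-token and completion latencies around the response generator. A minimal sketch of the same measurement pattern around an arbitrary async token stream (the fake stream below is only a stand-in for the real chat completion):

```python
import asyncio
import time

async def fake_stream():
    for token in ["Hello", ", ", "world", "!"]:
        await asyncio.sleep(0.05)  # stand-in for network delay
        yield token

async def measure():
    first_token_times, completion_times = [], []
    start = time.time()
    first_token_time = None
    async for _ in fake_stream():
        if first_token_time is None:
            # Time to the very first streamed chunk.
            first_token_time = time.time()
            first_token_times.append(first_token_time - start)
    # Time until the stream is fully consumed.
    completion_times.append(time.time() - start)
    print(first_token_times, completion_times)

asyncio.run(measure())
```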
@@ -343,48 +371,67 @@ async def _send_text(self, text: str) -> None:
self.ten_env.send_data(data)

async def _stream_chat(self, messages: List[Any], tools: List[Any]) -> AsyncGenerator[dict, None]:
- session = aiohttp.ClientSession()
- try:
- payload = {
- "messages": messages,
- "tools": tools,
- "tools_choice": "auto" if tools else "none",
- "model": "gpt-3.5-turbo",
- "stream": True,
- "stream_options": {"include_usage": True}
- }
- self.ten_env.log_info(f"payload before sending: {json.dumps(payload)}")
- headers = {
- "Authorization": f"Bearer {self.config.token}",
- "Content-Type": "application/json"
- }
-
- async with session.post(self.config.api_url, json=payload, headers=headers) as response:
- if response.status != 200:
- r = await response.json()
- self.ten_env.log_error(f"Received unexpected status {r} from the server.")
- return
async with aiohttp.ClientSession() as session:
try:
payload = {
"messages": messages,
"tools": tools,
"tools_choice": "auto" if tools else "none",
"model": "gpt-3.5-turbo",
"stream": True,
"stream_options": {"include_usage": True},
"ssml_enabled": self.config.ssml_enabled
}
if self.config.context_enabled:
payload["context"] = {
**self.config.extra_context
}
self.ten_env.log_info(f"payload before sending: {json.dumps(payload)}")
headers = {
"Authorization": f"Bearer {self.config.token}",
"Content-Type": "application/json"
}

async for line in response.content:
if line:
l = line.decode('utf-8').strip()
if l.startswith("data:"):
content = l[5:].strip()
if content == "[DONE]":
break
self.ten_env.log_info(f"content: {content}")
yield json.loads(content)
except Exception as e:
self.ten_env.log_error(f"Failed to handle {e}")
finally:
await session.close()
session = None
start_time = time.time()
async with session.post(self.config.api_url, json=payload, headers=headers) as response:
if response.status != 200:
r = await response.json()
self.ten_env.log_error(f"Received unexpected status {r} from the server.")
if self.config.failure_info:
await self._send_text(self.config.failure_info)
return
end_time = time.time()
self.connect_times.append(end_time - start_time)

async for line in response.content:
if line:
l = line.decode('utf-8').strip()
if l.startswith("data:"):
content = l[5:].strip()
if content == "[DONE]":
break
self.ten_env.log_debug(f"content: {content}")
yield json.loads(content)
except Exception as e:
traceback.print_exc()
self.ten_env.log_error(f"Failed to handle {e}")
finally:
await session.close()
session = None

async def _update_usage(self, usage: LLMUsage) -> None:
if not self.config.rtm_enabled:
return

- async def _update_usage(self, usage: Usage) -> None:
self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens

if self.total_usage.completion_tokens_details is None:
self.total_usage.completion_tokens_details = LLMCompletionTokensDetails()
if self.total_usage.prompt_tokens_details is None:
self.total_usage.prompt_tokens_details = LLMPromptTokensDetails()

if usage.completion_tokens_details:
self.total_usage.completion_tokens_details.accepted_prediction_tokens += usage.completion_tokens_details.accepted_prediction_tokens
self.total_usage.completion_tokens_details.audio_tokens += usage.completion_tokens_details.audio_tokens
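Earlier in this hunk, the rewritten _stream_chat reads the response body line by line, keeping only data: lines and stopping at the [DONE] sentinel, which is how OpenAI-style streaming responses are framed. A network-free sketch of that framing logic, assuming the same byte-line input the aiohttp response yields:

```python
import json

def parse_sse_lines(lines):
    # Each event line looks like b'data: {...}'; a literal '[DONE]' ends the stream.
    for line in lines:
        l = line.decode("utf-8").strip()
        if not l.startswith("data:"):
            continue
        content = l[5:].strip()
        if content == "[DONE]":
            break
        yield json.loads(content)

chunks = [
    b'data: {"choices": [{"delta": {"content": "Hi"}}]}',
    b'data: [DONE]',
]
for chunk in parse_sse_lines(chunks):
    print(chunk)
```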
@@ -395,4 +442,33 @@ async def _update_usage(self, usage: Usage) -> None:
self.total_usage.prompt_tokens_details.audio_tokens += usage.prompt_tokens_details.audio_tokens
self.total_usage.prompt_tokens_details.cached_tokens += usage.prompt_tokens_details.cached_tokens

self.ten_env.log_info(f"total usage: {self.total_usage}")
self.ten_env.log_info(f"total usage: {self.total_usage}")

data = Data.create("llm_stat")
data.set_property_from_json("usage", json.dumps(self.total_usage.model_dump()))
if self.connect_times and self.completion_times and self.first_token_times:
data.set_property_from_json("latency", json.dumps({
"connection_latency_95": np.percentile(self.connect_times, 95),
"completion_latency_95": np.percentile(self.completion_times, 95),
"first_token_latency_95": np.percentile(self.first_token_times, 95),
"connection_latency_99": np.percentile(self.connect_times, 99),
"completion_latency_99": np.percentile(self.completion_times, 99),
"first_token_latency_99": np.percentile(self.first_token_times, 99)
}))
self.ten_env.send_data(data)

async def _on_memory_appended(self, message: dict) -> None:
self.ten_env.log_info(f"Memory appended: {message}")
if not self.config.enable_storage:
return

role = message.get("role")
stream_id = self.remote_stream_id if role == "user" else 0
try:
d = Data.create("append")
d.set_property_string("text", message.get("content"))
d.set_property_string("role", role)
d.set_property_int("stream_id", stream_id)
self.ten_env.send_data(d)
except Exception as e:
self.ten_env.log_error(f"Error send append_context data {message} {e}")