From 82b057327d13c34e9461269750b75db9a25907cf Mon Sep 17 00:00:00 2001 From: zhangqianze Date: Tue, 19 Nov 2024 00:16:46 +0800 Subject: [PATCH] feat: support vision for multimodel --- agents/property.json | 304 ++++++++++++++++++ .../bingsearch_tool_python/extension.py | 2 +- .../openai_chatgpt_python/extension.py | 27 +- .../extension/openai_chatgpt_python/openai.py | 2 + .../extension/openai_v2v_python/extension.py | 1 + .../vision_analyze_tool_python/extension.py | 10 +- .../extension/vision_tool_python/extension.py | 1 + .../weatherapi_tool_python/extension.py | 2 +- .../interface/ten_ai_base/types.py | 2 + 9 files changed, 334 insertions(+), 17 deletions(-) diff --git a/agents/property.json b/agents/property.json index 852c904d..095e0bfa 100644 --- a/agents/property.json +++ b/agents/property.json @@ -2335,6 +2335,310 @@ } ] }, + { + "name": "tool2_va_openai_azure", + "auto_start": false, + "nodes": [ + { + "type": "extension", + "extension_group": "default", + "addon": "agora_rtc", + "name": "agora_rtc", + "property": { + "app_id": "${env:AGORA_APP_ID}", + "token": "", + "channel": "ten_agent_test", + "stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "subscribe_video": true, + "publish_audio": true, + "publish_data": true, + "enable_agora_asr": true, + "agora_asr_vendor_name": "microsoft", + "agora_asr_language": "en-US", + "agora_asr_vendor_key": "${env:AZURE_STT_KEY}", + "agora_asr_vendor_region": "${env:AZURE_STT_REGION}", + "agora_asr_session_control_file_path": "session_control.conf", + "subscribe_video_pix_fmt": 4 + } + }, + { + "type": "extension", + "extension_group": "default", + "addon": "interrupt_detector", + "name": "interrupt_detector" + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "openai_chatgpt_python", + "name": "openai_chatgpt", + "property": { + "base_url": "${env:OPENAI_API_BASE}", + "api_key": "${env:OPENAI_API_KEY}", + "frequency_penalty": 0.9, + "model": "gpt-4o", + "max_tokens": 512, + "prompt": "", + "proxy_url": "${env:OPENAI_PROXY_URL}", + "greeting": "TEN Agent connected. How can I help you today?", + "checking_vision_text_items": "[\"Let me take a look...\",\"Let me check your camera...\",\"Please wait for a second...\"]", + "max_memory_length": 10 + } + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "openai_chatgpt_python", + "name": "qwen_vl", + "property": { + "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "api_key": "${env:QWEN_API_KEY}", + "frequency_penalty": 0.9, + "model": "qwen-vl-plus", + "max_tokens": 512, + "prompt": "You will analyze images, keep your response less than 20 words.", + "proxy_url": "${env:OPENAI_PROXY_URL}", + "greeting": "TEN Agent connected. How can I help you today?", + "checking_vision_text_items": "[\"Let me take a look...\",\"Let me check your camera...\",\"Please wait for a second...\"]", + "max_memory_length": 10 + } + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "vision_analyze_tool_python", + "name": "vision_tool" + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "weatherapi_tool_python", + "name": "weatherapi_tool_python", + "property": { + "api_key": "${env:WEATHERAPI_API_KEY}" + } + }, + { + "type": "extension", + "extension_group": "tts", + "addon": "azure_tts", + "name": "azure_tts", + "property": { + "azure_subscription_key": "${env:AZURE_TTS_KEY}", + "azure_subscription_region": "${env:AZURE_TTS_REGION}", + "azure_synthesis_voice_name": "en-US-AndrewMultilingualNeural" + } + }, + { + "type": "extension", + "extension_group": "transcriber", + "addon": "message_collector", + "name": "message_collector" + } + ], + "connections": [ + { + "extension_group": "default", + "extension": "agora_rtc", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "default", + "extension": "interrupt_detector" + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + }, + { + "extension_group": "transcriber", + "extension": "message_collector" + } + ] + } + ], + "video_frame": [ + { + "name": "video_frame", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "vision_tool" + } + ] + } + ], + "cmd": [ + { + "name": "on_user_joined", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + }, + { + "name": "on_user_left", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + }, + { + "extension_group": "transcriber", + "extension": "message_collector" + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + } + ] + }, + { + "name": "tool_call", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "vision_tool" + }, + { + "extension_group": "chatgpt", + "extension": "weatherapi_tool_python" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "vision_tool", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "vision_tool", + "cmd": [ + { + "name": "chat_completion_call", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "qwen_vl" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "weatherapi_tool_python", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + }, + { + "extension_group": "tts", + "extension": "azure_tts", + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "transcriber", + "extension": "message_collector", + "data": [ + { + "name": "data", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "default", + "extension": "interrupt_detector", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + } + ] + }, { "name": "va_gemini_azure", "auto_start": false, diff --git a/agents/ten_packages/extension/bingsearch_tool_python/extension.py b/agents/ten_packages/extension/bingsearch_tool_python/extension.py index 59d20305..e77c03f2 100644 --- a/agents/ten_packages/extension/bingsearch_tool_python/extension.py +++ b/agents/ten_packages/extension/bingsearch_tool_python/extension.py @@ -128,7 +128,7 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool if name == TOOL_NAME: result = await self._do_search(args) # result = LLMCompletionContentItemText(text="I see something") - return {"content": json.dumps(result)} + return {"type":"query", "content": json.dumps(result)} async def _do_search(self, args: dict) -> Any: if "query" not in args: diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 0ffe243f..8ae974b8 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -166,6 +166,7 @@ async def on_tools_update(self, ten_env: TenEnv, tool: LLMToolMetadata) -> None: async def on_call_chat_completion(self, ten_env: TenEnv, **kargs: LLMCallCompletionArgs) -> any: kmessages: LLMChatCompletionUserMessageParam = kargs.get( "messages", []) + stream: bool = kargs.get("stream", False) ten_env.log_info(f"on_call_chat_completion: {kmessages}") response = await self.openai_chatgpt.get_chat_completions( @@ -204,8 +205,9 @@ async def on_data_chat_completion(self, ten_env: TenEnv, **kargs: LLMDataComplet tools = [] if not no_tool and len( self.available_tools) > 0 else None - for tool in self.available_tools: - tools.append(self._convert_tools_to_dict(tool)) + if tools is not None: + for tool in self.available_tools: + tools.append(self._convert_tools_to_dict(tool)) self.sentence_fragment = "" @@ -236,14 +238,19 @@ async def handle_tool_call(tool_call): # self.memory_cache = [] self.memory_cache.pop() result_content = tool_result["content"] - nonlocal message - new_message = { - "role": "user", - "content": self._convert_to_content_parts(message["content"]) - } - new_message["content"] = new_message["content"] + \ - self._convert_to_content_parts(result_content) - await self.queue_input_item(True, messages=[new_message]) + type = tool_result["type"] + + if type == "direct_reply": + self.send_text_output(ten_env, result_content, True) + else: + nonlocal message + new_message = { + "role": "user", + "content": self._convert_to_content_parts(message["content"]) + } + new_message["content"] = new_message["content"] + \ + self._convert_to_content_parts(result_content) + await self.queue_input_item(True, messages=[new_message], no_tool=True) else: ten_env.log_error(f"Tool call failed") self.tool_task_future.set_result(None) diff --git a/agents/ten_packages/extension/openai_chatgpt_python/openai.py b/agents/ten_packages/extension/openai_chatgpt_python/openai.py index 284127ae..fcfed03d 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/openai.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/openai.py @@ -94,6 +94,8 @@ async def get_chat_completions(self, messages, tools = None) -> ChatCompletion: "seed": self.config.seed, } + logger.info(f"get_chat_completions: {req}") + try: response = await self.client.chat.completions.create(**req) except Exception as e: diff --git a/agents/ten_packages/extension/openai_v2v_python/extension.py b/agents/ten_packages/extension/openai_v2v_python/extension.py index 38922d00..49408d8a 100644 --- a/agents/ten_packages/extension/openai_v2v_python/extension.py +++ b/agents/ten_packages/extension/openai_v2v_python/extension.py @@ -724,6 +724,7 @@ async def _on_tool_output(self, tool_call_id:str, result:CmdResult): logger.info(f"_on_tool_output {tool_call_id} {tool_result}") result_content = tool_result["content"] + # v2v always treat result as query output = json.dumps(self._convert_to_content_parts(result_content)) tool_response = ItemCreate( item=FunctionCallOutputItemParam( diff --git a/agents/ten_packages/extension/vision_analyze_tool_python/extension.py b/agents/ten_packages/extension/vision_analyze_tool_python/extension.py index eef5f680..ecc92e27 100644 --- a/agents/ten_packages/extension/vision_analyze_tool_python/extension.py +++ b/agents/ten_packages/extension/vision_analyze_tool_python/extension.py @@ -173,10 +173,10 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool cmd.set_property_from_json("arguments", json.dumps({"messages":[message]})) ten_env.log_info("send_cmd {}".format(message)) cmd_result: CmdResult = await ten_env.send_cmd(cmd) - result = cmd_result.get_property_to_json("response") + result = json.loads(cmd_result.get_property_to_json("response")) + + response_text = result["choices"][0]["message"]["content"] return { - "content": [{ - "type": "text", - "text": result - }] + "type": "direct_reply", + "content": response_text } \ No newline at end of file diff --git a/agents/ten_packages/extension/vision_tool_python/extension.py b/agents/ten_packages/extension/vision_tool_python/extension.py index d0096467..27ddcf4d 100644 --- a/agents/ten_packages/extension/vision_tool_python/extension.py +++ b/agents/ten_packages/extension/vision_tool_python/extension.py @@ -145,6 +145,7 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool self.image_data, self.image_width, self.image_height) # return LLMToolResult(message=LLMCompletionArgsMessage(role="user", content=[result])) return { + "type": "query", "content": [{ "type": "image_url", "image_url": { diff --git a/agents/ten_packages/extension/weatherapi_tool_python/extension.py b/agents/ten_packages/extension/weatherapi_tool_python/extension.py index f9d38543..b0ebd552 100644 --- a/agents/ten_packages/extension/weatherapi_tool_python/extension.py +++ b/agents/ten_packages/extension/weatherapi_tool_python/extension.py @@ -181,7 +181,7 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool elif name == FORECAST_TOOL_NAME: result = await self._get_future_weather(args) # result = LLMCompletionContentItemText(text="I see something") - return {"content": json.dumps(result)} + return {"type": "query", "content": json.dumps(result)} async def _get_current_weather(self, args: dict) -> Any: if "location" not in args: diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py index 04fb7f50..0a058be6 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py @@ -86,10 +86,12 @@ class LLMChatCompletionUserMessageParam(TypedDict, total=False): ] class LLMToolResult(TypedDict, total=False): + type: Union[Literal["query"], Literal["direct_reply"]] content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] class LLMCallCompletionArgs(TypedDict, total=False): messages: Iterable[LLMChatCompletionMessageParam] + stream: Optional[bool] = False class LLMDataCompletionArgs(TypedDict, total=False): messages: Iterable[LLMChatCompletionMessageParam]