From 14f7d3c8fa28a5aff89b5f35dff6b6b1d736ded1 Mon Sep 17 00:00:00 2001
From: Zhang Qianze
Date: Wed, 31 Jul 2024 03:03:48 +0800
Subject: [PATCH 1/9] feat: support vision for openai

---
 .../openai_chatgpt_python/manifest.json       |  3 +
 .../openai_chatgpt_extension.py               | 93 ++++++++++++++++++-
 .../openai_chatgpt_python/requirements.txt    |  4 +-
 3 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/agents/addon/extension/openai_chatgpt_python/manifest.json b/agents/addon/extension/openai_chatgpt_python/manifest.json
index 9fc1b362..52643aa5 100644
--- a/agents/addon/extension/openai_chatgpt_python/manifest.json
+++ b/agents/addon/extension/openai_chatgpt_python/manifest.json
@@ -47,6 +47,9 @@
     },
     "max_memory_length": {
       "type": "int64"
+    },
+    "enable_vision": {
+      "type": "bool"
     }
   },
   "data_in": [
diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
index 6cabe6cb..cf238a94 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
@@ -5,6 +5,7 @@
 # Copyright (c) 2024 Agora IO. All rights reserved.
 #
 #
+from rte.image_frame import ImageFrame
 from .openai_chatgpt import OpenAIChatGPT, OpenAIChatGPTConfig
 from datetime import datetime
 from threading import Thread
@@ -20,6 +21,10 @@
     MetadataInfo,
 )
 from .log import logger
+from base64 import b64encode
+import numpy as np
+from io import BytesIO
+from PIL import Image


 CMD_IN_FLUSH = "flush"
@@ -39,6 +44,7 @@
 PROPERTY_TOP_P = "top_p"  # Optional
 PROPERTY_MAX_TOKENS = "max_tokens"  # Optional
 PROPERTY_GREETING = "greeting"  # Optional
+PROPERTY_ENABLE_VISION = "enable_vision"  # Optional
 PROPERTY_PROXY_URL = "proxy_url"  # Optional
 PROPERTY_MAX_MEMORY_LENGTH = "max_memory_length"  # Optional

@@ -73,11 +79,69 @@ def parse_sentence(sentence, content):
     return sentence, remain, found_punc


+def yuv420_to_rgb(yuv_data, width, height):
+    # Calculate the size of each plane
+    frame_size = width * height
+    chroma_size = frame_size // 4
+
+    y_plane = yuv_data[0:frame_size].reshape((height, width))
+    u_plane = yuv_data[frame_size:frame_size + chroma_size].reshape((height // 2, width // 2))
+    v_plane = yuv_data[frame_size + chroma_size:].reshape((height // 2, width // 2))
+
+    u_plane = u_plane.repeat(2, axis=0).repeat(2, axis=1)
+    v_plane = v_plane.repeat(2, axis=0).repeat(2, axis=1)
+
+    # Ensure calculations are done in a wider data type to prevent overflow
+    y_plane = y_plane.astype(np.int16)
+    u_plane = u_plane.astype(np.int16)
+    v_plane = v_plane.astype(np.int16)
+
+    # Convert YUV to RGB using the standard conversion formula
+    r_plane = y_plane + 1.402 * (v_plane - 128)
+    g_plane = y_plane - 0.344136 * (u_plane - 128) - 0.714136 * (v_plane - 128)
+    b_plane = y_plane + 1.772 * (u_plane - 128)
+
+    # Clip values to the 0-255 range and convert to uint8
+    r_plane = np.clip(r_plane, 0, 255).astype(np.uint8)
+    g_plane = np.clip(g_plane, 0, 255).astype(np.uint8)
+    b_plane = np.clip(b_plane, 0, 255).astype(np.uint8)
+
+    # Stack the RGB planes into an image
+    rgb_image = np.stack([r_plane, g_plane, b_plane], axis=-1)
+
+    return rgb_image
+
+def yuv2base64png(yuv_data, width, height):
+    # Convert YUV to RGB
+    rgb_image = yuv420_to_rgb(np.frombuffer(yuv_data, dtype=np.uint8), width, height)
+
+    # Convert the RGB image to a PIL Image
+    pil_image = Image.fromarray(rgb_image)
+
+    # Save the image to a BytesIO object in PNG format
+    buffered = BytesIO()
+    pil_image.save(buffered, format="JPEG")
+
+    # Get the byte data of the PNG image
+    png_image_data = buffered.getvalue()
+
+    # Convert the PNG byte data to a Base64 encoded string
+    base64_encoded_image = b64encode(png_image_data).decode('utf-8')
+
+    # Create the data URL
+    mime_type = 'image/jpeg'
+    base64_url = f"data:{mime_type};base64,{base64_encoded_image}"
+    return base64_url
+
 class OpenAIChatGPTExtension(Extension):
     memory = []
     max_memory_length = 10
     outdate_ts = 0
     openai_chatgpt = None
+    enable_vision = False
+    image_data = None
+    image_width = 0
+    image_height = 0

     def on_init(
         self, rte: RteEnv, manifest: MetadataInfo, property: MetadataInfo
@@ -168,6 +232,11 @@ def on_start(self, rte: RteEnv) -> None:
         except Exception as err:
             logger.info(f"GetProperty optional {PROPERTY_GREETING} failed, err: {err}")

+        try:
+            self.enable_vision = rte.get_property_bool(PROPERTY_ENABLE_VISION)
+        except Exception as err:
+            logger.info(f"GetProperty optional {PROPERTY_ENABLE_VISION} failed, err: {err}")
+
         try:
             prop_max_memory_length = rte.get_property_int(PROPERTY_MAX_MEMORY_LENGTH)
             if prop_max_memory_length > 0:
@@ -233,6 +302,13 @@ def on_cmd(self, rte: RteEnv, cmd: Cmd) -> None:
         cmd_result.set_property_string("detail", "success")
         rte.return_result(cmd_result, cmd)

+    def on_image_frame(self, rte_env: RteEnv, image_frame: ImageFrame) -> None:
+        # logger.info(f"OpenAIChatGPTExtension on_image_frame {image_frame.get_width()} {image_frame.get_height()}")
+        self.image_data = image_frame.get_buf()
+        self.image_width = image_frame.get_width()
+        self.image_height = image_frame.get_height()
+        return
+
     def on_data(self, rte: RteEnv, data: Data) -> None:
         """
         on_data receives data from rte graph.
@@ -271,7 +347,22 @@ def on_data(self, rte: RteEnv, data: Data) -> None:
         # Prepare memory
         if len(self.memory) > self.max_memory_length:
             self.memory.pop(0)
-        self.memory.append({"role": "user", "content": input_text})
+        if self.image_data is not None and self.enable_vision is True:
+            url = yuv2base64png(self.image_data, self.image_width, self.image_height)
+            # logger.info(f"image url: {url}")
+            self.memory.append({"role": "user", "content": [
+                {"type": "text", "text": input_text},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url,
+                    }
+                }
+            ]})
+            # clear image after use
+            self.image_data = None
+        else:
+            self.memory.append({"role": "user", "content": input_text})

         def chat_completions_stream_worker(start_time, input_text, memory):
             try:
diff --git a/agents/addon/extension/openai_chatgpt_python/requirements.txt b/agents/addon/extension/openai_chatgpt_python/requirements.txt
index de1e7f46..288b987d 100644
--- a/agents/addon/extension/openai_chatgpt_python/requirements.txt
+++ b/agents/addon/extension/openai_chatgpt_python/requirements.txt
@@ -1,2 +1,4 @@
 openai==1.35.13
-requests==2.32.3
\ No newline at end of file
+requests==2.32.3
+numpy==2.0.1
+pillow==10.4.0
\ No newline at end of file

From c1951b1eed53926f3c312c9f7dccfecfcacbcbc2 Mon Sep 17 00:00:00 2001
From: zhangqianze
Date: Fri, 9 Aug 2024 00:37:22 +0800
Subject: [PATCH 2/9] feat: add tool support

---
 .../openai_chatgpt_python/manifest.json       |   7 +-
 .../openai_chatgpt_python/openai_chatgpt.py   |   3 +-
 .../openai_chatgpt_extension.py               | 398 +++++++++++++-----
 3 files changed, 292 insertions(+), 116 deletions(-)

diff --git a/agents/addon/extension/openai_chatgpt_python/manifest.json b/agents/addon/extension/openai_chatgpt_python/manifest.json
index 3f6e520b..f325cdd8 100644
--- a/agents/addon/extension/openai_chatgpt_python/manifest.json
+++ b/agents/addon/extension/openai_chatgpt_python/manifest.json
@@ -81,6 +81,11 @@
     {
       "name": "flush"
     }
+  ],
+  "image_frame_in": [
+    {
+      "name": "image_frame"
+    }
   ]
 }
-}
\ No newline at end of file
+}
diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt.py
index 5ad5b6cc..7f2a659c 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt.py
@@ -65,7 +65,7 @@ def __init__(self, config: OpenAIChatGPTConfig):
             self.session.proxies.update(proxies)
         self.client.session = self.session

-    def get_chat_completions_stream(self, messages):
+    def get_chat_completions_stream(self, messages, tools = None):
         req = {
             "model": self.config.model,
             "messages": [
@@ -75,6 +75,7 @@ def get_chat_completions_stream(self, messages):
                 },
                 *messages,
             ],
+            "tools": tools,
             "temperature": self.config.temperature,
             "top_p": self.config.top_p,
             "presence_penalty": self.config.presence_penalty,
diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
index b27d9a7a..00eac652 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
@@ -111,16 +111,38 @@ def yuv420_to_rgb(yuv_data, width, height):
     return rgb_image


+def rgb2base64jpeg(rgb_data, width, height):
+    # Convert the RGB image to a PIL Image
+    pil_image = Image.fromarray(rgb_data)
+
+    # Save the image to a BytesIO object in JPEG format
+    buffered = BytesIO()
+    pil_image.save(buffered, format="JPEG")
+    pil_image.save("test.jpg", format="JPEG")
+
+    # Get the byte data of the JPEG image
+    jpeg_image_data = buffered.getvalue()
+
+    # Convert the JPEG byte data to a Base64 encoded string
+    base64_encoded_image = b64encode(jpeg_image_data).decode('utf-8')
+
+    # Create the data URL
+    mime_type = 'image/jpeg'
+    base64_url = f"data:{mime_type};base64,{base64_encoded_image}"
+    return base64_url
+
 def yuv2base64png(yuv_data, width, height):
     # Convert YUV to RGB
     rgb_image = yuv420_to_rgb(np.frombuffer(yuv_data, dtype=np.uint8), width, height)

     # Convert the RGB image to a PIL Image
     pil_image = Image.fromarray(rgb_image)
+    pil_image = pil_image.resize((width//2, height//2))

     # Save the image to a BytesIO object in PNG format
     buffered = BytesIO()
     pil_image.save(buffered, format="JPEG")
+    pil_image.save("test.jpg", format="JPEG")

     # Get the byte data of the PNG image
     png_image_data = buffered.getvalue()
@@ -143,6 +165,15 @@ class OpenAIChatGPTExtension(Extension):
     image_width = 0
     image_height = 0

+    available_tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_camera_image",
+            "description": "Get the camera image which is being used. 
Call this whenever you need to understand the camera video like you have an eye, for example when a customer asks 'What can you see?'", + }, + "strict": True, + }] + def on_start(self, rte: RteEnv) -> None: logger.info("OpenAIChatGPTExtension on_start") # Prepare configuration @@ -269,6 +300,240 @@ def on_stop(self, rte: RteEnv) -> None: logger.info("OpenAIChatGPTExtension on_stop") rte.on_stop_done() + def append_memory(self, message): + if len(self.memory) > self.max_memory_length: + self.memory.pop(0) + self.memory.append(message) + + def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): + try: + logger.info( + f"for input text: [{input_text}] memory: {memory}" + ) + + message = {"role": "user", "content": input_text} + + if self.image_data is not None: + url = yuv2base64png(self.image_data, self.image_width, self.image_height) + # logger.info(f"image url: {url}") + message = {"role": "user", "content": [ + {"type": "text", "text": input_text}, + { + "type": "image_url", + "image_url": { + "url": url, + } + } + ]} + + + # Get result from AI + resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message]) + self.append_memory({"role": "user", "content": input_text}) + if resp is None: + logger.info( + f"for input text: [{input_text}] failed" + ) + return + + sentence = "" + full_content = "" + first_sentence_sent = False + + for chat_completions in resp: + if start_time < self.outdate_ts: + logger.info( + f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" + ) + break + + if ( + len(chat_completions.choices) > 0 + ): + if chat_completions.choices[0].delta.tool_calls is not None: + for tool_call in chat_completions.choices[0].delta.tool_calls: + logger.info(f"tool_call: {tool_call}") + if tool_call.function.name == "get_camera_image": + self.chat_completion(rte, start_time, input_text, memory, True) + return + elif chat_completions.choices[0].delta.content is not None: + content = chat_completions.choices[0].delta.content + else: + content = "" + + full_content += content + + while True: + sentence, content, sentence_is_final = parse_sentence( + sentence, content + ) + if len(sentence) == 0 or not sentence_is_final: + logger.info(f"sentence {sentence} is empty or not final") + break + logger.info( + f"recv for input text: [{input_text}] got sentence: [{sentence}]" + ) + + # send sentence + try: + output_data = Data.create("text_data") + output_data.set_property_string( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + ) + output_data.set_property_bool( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False + ) + rte.send_data(output_data) + logger.info( + f"recv for input text: [{input_text}] sent sentence [{sentence}]" + ) + except Exception as err: + logger.info( + f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" + ) + break + + sentence = "" + if not first_sentence_sent: + first_sentence_sent = True + logger.info( + f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" + ) + + # remember response as assistant content in memory + self.append_memory({"role": "assistant", "content": full_content}) + + # send end of segment + try: + output_data = Data.create("text_data") + output_data.set_property_string( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + ) + output_data.set_property_bool( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True + ) + rte.send_data(output_data) + logger.info( 
+ f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" + ) + except Exception as err: + logger.info( + f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" + ) + + except Exception as e: + logger.info( + f"for input text: [{input_text}] failed, err: {e}" + ) + + def chat_completion(self, rte: RteEnv, start_time, input_text, memory): + try: + logger.info( + f"for input text: [{input_text}] memory: {memory}" + ) + + message = {"role": "user", "content": input_text} + + + # Get result from AI + resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) + self.append_memory({"role": "user", "content": input_text}) + if resp is None: + logger.info( + f"for input text: [{input_text}] failed" + ) + return + + sentence = "" + full_content = "" + first_sentence_sent = False + + for chat_completions in resp: + if start_time < self.outdate_ts: + logger.info( + f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" + ) + break + + if ( + len(chat_completions.choices) > 0 + ): + if chat_completions.choices[0].delta.tool_calls is not None: + for tool_call in chat_completions.choices[0].delta.tool_calls: + logger.info(f"tool_call: {tool_call}") + if tool_call.function.name == "get_camera_image": + self.chat_completion_with_vision(rte, start_time, "tell me about this image using language in previous context", memory) + return + elif chat_completions.choices[0].delta.content is not None: + content = chat_completions.choices[0].delta.content + else: + content = "" + + full_content += content + + while True: + sentence, content, sentence_is_final = parse_sentence( + sentence, content + ) + if len(sentence) == 0 or not sentence_is_final: + logger.info(f"sentence {sentence} is empty or not final") + break + logger.info( + f"recv for input text: [{input_text}] got sentence: [{sentence}]" + ) + + # send sentence + try: + output_data = Data.create("text_data") + output_data.set_property_string( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + ) + output_data.set_property_bool( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False + ) + rte.send_data(output_data) + logger.info( + f"recv for input text: [{input_text}] sent sentence [{sentence}]" + ) + except Exception as err: + logger.info( + f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" + ) + break + + sentence = "" + if not first_sentence_sent: + first_sentence_sent = True + logger.info( + f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" + ) + + # remember response as assistant content in memory + self.append_memory({"role": "assistant", "content": full_content}) + + # send end of segment + try: + output_data = Data.create("text_data") + output_data.set_property_string( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + ) + output_data.set_property_bool( + DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True + ) + rte.send_data(output_data) + logger.info( + f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" + ) + except Exception as err: + logger.info( + f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" + ) + + except Exception as e: + logger.info( + f"for input text: [{input_text}] failed, err: {e}" + ) + def on_cmd(self, rte: RteEnv, cmd: Cmd) -> None: logger.info("OpenAIChatGPTExtension on_cmd") cmd_json = 
cmd.to_json() @@ -335,122 +600,27 @@ def on_data(self, rte: RteEnv, data: Data) -> None: return # Prepare memory - if len(self.memory) > self.max_memory_length: - self.memory.pop(0) - if self.image_data is not None and self.enable_vision is True: - url = yuv2base64png(self.image_data, self.image_width, self.image_height) - # logger.info(f"image url: {url}") - self.memory.append({"role": "user", "content": [ - {"type": "text", "text": input_text}, - { - "type": "image_url", - "image_url": { - "url": url, - } - } - ]}) - # clear image after use - self.image_data = None - else: - self.memory.append({"role": "user", "content": input_text}) + # if len(self.memory) > self.max_memory_length: + # self.memory.pop(0) + # if self.image_data is not None and self.enable_vision is True: + # url = yuv2base64png(self.image_data, self.image_width, self.image_height) + # # logger.info(f"image url: {url}") + # self.memory.append({"role": "user", "content": [ + # {"type": "text", "text": input_text}, + # { + # "type": "image_url", + # "image_url": { + # "url": url, + # } + # } + # ]}) + # # clear image after use + # self.image_data = None + # else: + # self.memory.append({"role": "user", "content": input_text}) def chat_completions_stream_worker(start_time, input_text, memory): - try: - logger.info( - f"GetChatCompletionsStream for input text: [{input_text}] memory: {memory}" - ) - - # Get result from AI - resp = self.openai_chatgpt.get_chat_completions_stream(memory) - if resp is None: - logger.info( - f"GetChatCompletionsStream for input text: [{input_text}] failed" - ) - return - - sentence = "" - full_content = "" - first_sentence_sent = False - - for chat_completions in resp: - if start_time < self.outdate_ts: - logger.info( - f"GetChatCompletionsStream recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" - ) - break - - if ( - len(chat_completions.choices) > 0 - and chat_completions.choices[0].delta.content is not None - ): - content = chat_completions.choices[0].delta.content - else: - content = "" - - full_content += content - - while True: - sentence, content, sentence_is_final = parse_sentence( - sentence, content - ) - if len(sentence) == 0 or not sentence_is_final: - logger.info(f"sentence {sentence} is empty or not final") - break - logger.info( - f"GetChatCompletionsStream recv for input text: [{input_text}] got sentence: [{sentence}]" - ) - - # send sentence - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False - ) - rte.send_data(output_data) - logger.info( - f"GetChatCompletionsStream recv for input text: [{input_text}] sent sentence [{sentence}]" - ) - except Exception as err: - logger.info( - f"GetChatCompletionsStream recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" - ) - break - - sentence = "" - if not first_sentence_sent: - first_sentence_sent = True - logger.info( - f"GetChatCompletionsStream recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" - ) - - # remember response as assistant content in memory - memory.append({"role": "assistant", "content": full_content}) - - # send end of segment - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - 
DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True - ) - rte.send_data(output_data) - logger.info( - f"GetChatCompletionsStream for input text: [{input_text}] end of segment with sentence [{sentence}] sent" - ) - except Exception as err: - logger.info( - f"GetChatCompletionsStream for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" - ) - - except Exception as e: - logger.info( - f"GetChatCompletionsStream for input text: [{input_text}] failed, err: {e}" - ) + self.chat_completion(rte, start_time, input_text, memory) # Start thread to request and read responses from OpenAI start_time = get_current_time() From d7e838cd0e78aa0df80c146930b9faa836524418 Mon Sep 17 00:00:00 2001 From: Zhang Qianze Date: Fri, 9 Aug 2024 01:30:44 +0800 Subject: [PATCH 3/9] updates --- .../openai_chatgpt_extension.py | 541 ++++++++++-------- agents/property.json.example | 231 ++++++++ 2 files changed, 528 insertions(+), 244 deletions(-) diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py index 00eac652..ec11d11d 100644 --- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py +++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py @@ -305,234 +305,228 @@ def append_memory(self, message): self.memory.pop(0) self.memory.append(message) - def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): - try: - logger.info( - f"for input text: [{input_text}] memory: {memory}" - ) - - message = {"role": "user", "content": input_text} - - if self.image_data is not None: - url = yuv2base64png(self.image_data, self.image_width, self.image_height) - # logger.info(f"image url: {url}") - message = {"role": "user", "content": [ - {"type": "text", "text": input_text}, - { - "type": "image_url", - "image_url": { - "url": url, - } - } - ]} + # def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): + # try: + # logger.info( + # f"for input text: [{input_text}] memory: {memory}" + # ) + + # message = {"role": "user", "content": input_text} + + # if self.image_data is not None: + # url = yuv2base64png(self.image_data, self.image_width, self.image_height) + # # logger.info(f"image url: {url}") + # message = {"role": "user", "content": [ + # {"type": "text", "text": input_text}, + # { + # "type": "image_url", + # "image_url": { + # "url": url, + # } + # } + # ]} - # Get result from AI - resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message]) - self.append_memory({"role": "user", "content": input_text}) - if resp is None: - logger.info( - f"for input text: [{input_text}] failed" - ) - return - - sentence = "" - full_content = "" - first_sentence_sent = False - - for chat_completions in resp: - if start_time < self.outdate_ts: - logger.info( - f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" - ) - break - - if ( - len(chat_completions.choices) > 0 - ): - if chat_completions.choices[0].delta.tool_calls is not None: - for tool_call in chat_completions.choices[0].delta.tool_calls: - logger.info(f"tool_call: {tool_call}") - if tool_call.function.name == "get_camera_image": - self.chat_completion(rte, start_time, input_text, memory, True) - return - elif chat_completions.choices[0].delta.content is not None: - content = chat_completions.choices[0].delta.content - else: - content = "" - - full_content += content 
- - while True: - sentence, content, sentence_is_final = parse_sentence( - sentence, content - ) - if len(sentence) == 0 or not sentence_is_final: - logger.info(f"sentence {sentence} is empty or not final") - break - logger.info( - f"recv for input text: [{input_text}] got sentence: [{sentence}]" - ) - - # send sentence - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False - ) - rte.send_data(output_data) - logger.info( - f"recv for input text: [{input_text}] sent sentence [{sentence}]" - ) - except Exception as err: - logger.info( - f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" - ) - break - - sentence = "" - if not first_sentence_sent: - first_sentence_sent = True - logger.info( - f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" - ) - - # remember response as assistant content in memory - self.append_memory({"role": "assistant", "content": full_content}) - - # send end of segment - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True - ) - rte.send_data(output_data) - logger.info( - f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" - ) - except Exception as err: - logger.info( - f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" - ) - - except Exception as e: - logger.info( - f"for input text: [{input_text}] failed, err: {e}" - ) - - def chat_completion(self, rte: RteEnv, start_time, input_text, memory): - try: - logger.info( - f"for input text: [{input_text}] memory: {memory}" - ) - - message = {"role": "user", "content": input_text} + # # Get result from AI + # resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message]) + # self.append_memory({"role": "user", "content": input_text}) + # if resp is None: + # logger.info( + # f"for input text: [{input_text}] failed" + # ) + # return + + # sentence = "" + # full_content = "" + # first_sentence_sent = False + + # for chat_completions in resp: + # if start_time < self.outdate_ts: + # logger.info( + # f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" + # ) + # break + + # if ( + # len(chat_completions.choices) > 0 + # ): + # if chat_completions.choices[0].delta.content is not None: + # content = chat_completions.choices[0].delta.content + # else: + # content = "" + + # full_content += content + + # while True: + # sentence, content, sentence_is_final = parse_sentence( + # sentence, content + # ) + # if len(sentence) == 0 or not sentence_is_final: + # logger.info(f"sentence {sentence} is empty or not final") + # break + # logger.info( + # f"recv for input text: [{input_text}] got sentence: [{sentence}]" + # ) + + # # send sentence + # try: + # output_data = Data.create("text_data") + # output_data.set_property_string( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + # ) + # output_data.set_property_bool( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False + # ) + # rte.send_data(output_data) + # logger.info( + # f"recv for input text: [{input_text}] sent sentence [{sentence}]" + # ) + # except Exception as err: + # logger.info( + # f"recv for 
input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" + # ) + # break + + # sentence = "" + # if not first_sentence_sent: + # first_sentence_sent = True + # logger.info( + # f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" + # ) + + # # remember response as assistant content in memory + # self.append_memory({"role": "assistant", "content": full_content}) + + # # send end of segment + # try: + # output_data = Data.create("text_data") + # output_data.set_property_string( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + # ) + # output_data.set_property_bool( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True + # ) + # rte.send_data(output_data) + # logger.info( + # f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" + # ) + # except Exception as err: + # logger.info( + # f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" + # ) + + # except Exception as e: + # logger.info( + # f"for input text: [{input_text}] failed, err: {e}" + # ) + + # def chat_completion(self, rte: RteEnv, start_time, input_text, memory): + # try: + # logger.info( + # f"for input text: [{input_text}] memory: {memory}" + # ) + + # message = {"role": "user", "content": input_text} - # Get result from AI - resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) - self.append_memory({"role": "user", "content": input_text}) - if resp is None: - logger.info( - f"for input text: [{input_text}] failed" - ) - return - - sentence = "" - full_content = "" - first_sentence_sent = False - - for chat_completions in resp: - if start_time < self.outdate_ts: - logger.info( - f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" - ) - break - - if ( - len(chat_completions.choices) > 0 - ): - if chat_completions.choices[0].delta.tool_calls is not None: - for tool_call in chat_completions.choices[0].delta.tool_calls: - logger.info(f"tool_call: {tool_call}") - if tool_call.function.name == "get_camera_image": - self.chat_completion_with_vision(rte, start_time, "tell me about this image using language in previous context", memory) - return - elif chat_completions.choices[0].delta.content is not None: - content = chat_completions.choices[0].delta.content - else: - content = "" - - full_content += content - - while True: - sentence, content, sentence_is_final = parse_sentence( - sentence, content - ) - if len(sentence) == 0 or not sentence_is_final: - logger.info(f"sentence {sentence} is empty or not final") - break - logger.info( - f"recv for input text: [{input_text}] got sentence: [{sentence}]" - ) - - # send sentence - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False - ) - rte.send_data(output_data) - logger.info( - f"recv for input text: [{input_text}] sent sentence [{sentence}]" - ) - except Exception as err: - logger.info( - f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" - ) - break - - sentence = "" - if not first_sentence_sent: - first_sentence_sent = True - logger.info( - f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" - ) - - # remember response as assistant content in memory - 
self.append_memory({"role": "assistant", "content": full_content}) - - # send end of segment - try: - output_data = Data.create("text_data") - output_data.set_property_string( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - ) - output_data.set_property_bool( - DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True - ) - rte.send_data(output_data) - logger.info( - f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" - ) - except Exception as err: - logger.info( - f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" - ) - - except Exception as e: - logger.info( - f"for input text: [{input_text}] failed, err: {e}" - ) + # # Get result from AI + # resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) + # self.append_memory({"role": "user", "content": input_text}) + # if resp is None: + # logger.info( + # f"for input text: [{input_text}] failed" + # ) + # return + + # sentence = "" + # full_content = "" + # first_sentence_sent = False + + # for chat_completions in resp: + # if start_time < self.outdate_ts: + # logger.info( + # f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" + # ) + # break + + # if ( + # len(chat_completions.choices) > 0 + # ): + # if chat_completions.choices[0].delta.tool_calls is not None: + # for tool_call in chat_completions.choices[0].delta.tool_calls: + # logger.info(f"tool_call: {tool_call}") + # if tool_call.function.name == "get_camera_image": + # self.chat_completion_with_vision(rte, start_time, "tell me about this image using language in previous context", memory) + # return + # elif chat_completions.choices[0].delta.content is not None: + # content = chat_completions.choices[0].delta.content + # else: + # content = "" + + # full_content += content + + # while True: + # sentence, content, sentence_is_final = parse_sentence( + # sentence, content + # ) + # if len(sentence) == 0 or not sentence_is_final: + # logger.info(f"sentence {sentence} is empty or not final") + # break + # logger.info( + # f"recv for input text: [{input_text}] got sentence: [{sentence}]" + # ) + + # # send sentence + # try: + # output_data = Data.create("text_data") + # output_data.set_property_string( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + # ) + # output_data.set_property_bool( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False + # ) + # rte.send_data(output_data) + # logger.info( + # f"recv for input text: [{input_text}] sent sentence [{sentence}]" + # ) + # except Exception as err: + # logger.info( + # f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" + # ) + # break + + # sentence = "" + # if not first_sentence_sent: + # first_sentence_sent = True + # logger.info( + # f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" + # ) + + # # remember response as assistant content in memory + # self.append_memory({"role": "assistant", "content": full_content}) + + # # send end of segment + # try: + # output_data = Data.create("text_data") + # output_data.set_property_string( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence + # ) + # output_data.set_property_bool( + # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True + # ) + # rte.send_data(output_data) + # logger.info( + # f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" + # ) + # except Exception as err: + # logger.info( 
+ # f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" + # ) + + # except Exception as e: + # logger.info( + # f"for input text: [{input_text}] failed, err: {e}" + # ) def on_cmd(self, rte: RteEnv, cmd: Cmd) -> None: logger.info("OpenAIChatGPTExtension on_cmd") @@ -599,26 +593,6 @@ def on_data(self, rte: RteEnv, data: Data) -> None: ) return - # Prepare memory - # if len(self.memory) > self.max_memory_length: - # self.memory.pop(0) - # if self.image_data is not None and self.enable_vision is True: - # url = yuv2base64png(self.image_data, self.image_width, self.image_height) - # # logger.info(f"image url: {url}") - # self.memory.append({"role": "user", "content": [ - # {"type": "text", "text": input_text}, - # { - # "type": "image_url", - # "image_url": { - # "url": url, - # } - # } - # ]}) - # # clear image after use - # self.image_data = None - # else: - # self.memory.append({"role": "user", "content": input_text}) - def chat_completions_stream_worker(start_time, input_text, memory): self.chat_completion(rte, start_time, input_text, memory) @@ -631,6 +605,85 @@ def chat_completions_stream_worker(start_time, input_text, memory): thread.start() logger.info(f"OpenAIChatGPTExtension on_data end") + def send_data(rte, sentence, end_of_segment, input_text, logger): + try: + output_data = Data.create("text_data") + output_data.set_property_string(DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence) + output_data.set_property_bool(DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, end_of_segment) + rte.send_data(output_data) + logger.info(f"for input text: [{input_text}] {'end of segment ' if end_of_segment else ''}sent sentence [{sentence}]") + except Exception as err: + logger.info(f"for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}") + + def process_completions(self, chat_completions, rte, start_time, input_text, memory, logger): + sentence = "" + full_content = "" + first_sentence_sent = False + + for chat_completion in chat_completions: + if start_time < self.outdate_ts: + logger.info(f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}") + break + + content = chat_completion.choices[0].delta.content if len(chat_completion.choices) > 0 and chat_completion.choices[0].delta.content is not None else "" + + full_content += content + + while True: + sentence, content, sentence_is_final = parse_sentence(sentence, content) + if len(sentence) == 0 or not sentence_is_final: + logger.info(f"sentence {sentence} is empty or not final") + break + logger.info(f"recv for input text: [{input_text}] got sentence: [{sentence}]") + send_data(rte, sentence, False, input_text, logger) + sentence = "" + + if not first_sentence_sent: + first_sentence_sent = True + logger.info(f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms") + + self.append_memory({"role": "assistant", "content": full_content}) + send_data(rte, sentence, True, input_text, logger) + + def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): + try: + logger.info(f"for input text: [{input_text}] memory: {memory}") + message = {"role": "user", "content": input_text} + + if self.image_data is not None: + url = yuv2base64png(self.image_data, self.image_width, self.image_height) + message = {"role": "user", "content": [ + {"type": "text", "text": input_text}, + {"type": "image_url", "image_url": {"url": url}} + ]} + + resp = 
self.openai_chatgpt.get_chat_completions_stream(memory + [message]) + self.append_memory({"role": "user", "content": input_text}) + if resp is None: + log_and_return(logger, input_text, "Response is None") + return + + process_completions(resp, rte, start_time, input_text, memory, logger) + + except Exception as e: + log_and_return(logger, input_text, str(e)) + + def chat_completion(self, rte: RteEnv, start_time, input_text, memory): + try: + logger.info(f"for input text: [{input_text}] memory: {memory}") + message = {"role": "user", "content": input_text} + + resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) + self.append_memory({"role": "user", "content": input_text}) + if resp is None: + log_and_return(logger, input_text, "Response is None") + return + + process_completions(resp, rte, start_time, input_text, memory, logger) + + except Exception as e: + log_and_return(logger, input_text, str(e)) + @register_addon_as_extension("openai_chatgpt_python") class OpenAIChatGPTExtensionAddon(Addon): def on_create_instance(self, rte: RteEnv, addon_name: str, context) -> None: diff --git a/agents/property.json.example b/agents/property.json.example index e3af5b44..b25bc1f1 100644 --- a/agents/property.json.example +++ b/agents/property.json.example @@ -1165,6 +1165,237 @@ ] } ] + }, + { + "name": "camera.va.openai.azure", + "auto_start": true, + "nodes": [ + { + "type": "extension", + "extension_group": "default", + "addon": "agora_rtc", + "name": "agora_rtc", + "property": { + "app_id": "", + "token": "", + "channel": "astra_agents_test", + "stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "subscribe_video": true, + "publish_audio": true, + "publish_data": true, + "enable_agora_asr": true, + "agora_asr_vendor_name": "microsoft", + "agora_asr_language": "en-US", + "agora_asr_vendor_key": "", + "agora_asr_vendor_region": "", + "agora_asr_session_control_file_path": "session_control.conf" + } + }, + { + "type": "extension", + "extension_group": "default", + "addon": "interrupt_detector", + "name": "interrupt_detector" + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "openai_chatgpt_python", + "name": "openai_chatgpt", + "property": { + "base_url": "", + "api_key": "", + "frequency_penalty": 0.9, + "model": "gpt-4o-mini", + "max_tokens": 512, + "prompt": "", + "proxy_url": "", + "greeting": "ASTRA agent connected. 
How can i help you today?", + "max_memory_length": 10 + } + }, + { + "type": "extension", + "extension_group": "tts", + "addon": "azure_tts", + "name": "azure_tts", + "property": { + "azure_subscription_key": "", + "azure_subscription_region": "", + "azure_synthesis_voice_name": "en-US-JaneNeural" + } + }, + { + "type": "extension", + "extension_group": "transcriber", + "addon": "chat_transcriber", + "name": "chat_transcriber" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "default" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "chatgpt" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "tts" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "transcriber" + } + ], + "connections": [ + { + "extension_group": "default", + "extension": "agora_rtc", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "default", + "extension": "interrupt_detector" + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber" + } + ] + } + ], + "image_frame": [ + { + "name": "image_frame", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber", + "cmd_conversions": [ + { + "cmd": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "is_final", + "type": "fixed_value", + "value": "bool(true)" + }, + { + "path": "stream_id", + "type": "fixed_value", + "value": "uint32(999)" + } + ] + } + } + ] + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + } + ] + } + ] + }, + { + "extension_group": "tts", + "extension": "azure_tts", + "pcm_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber", + "data": [ + { + "name": "data", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "default", + "extension": "interrupt_detector", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + } + ] } ] } From 44bbb9b19c52182e6a317c78d4bade42262fa7d7 Mon Sep 17 00:00:00 2001 From: zhangqianze Date: Sat, 10 Aug 2024 22:42:22 +0800 Subject: [PATCH 4/9] feat: update property.json.example --- agents/property.json.example | 231 +++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) diff --git a/agents/property.json.example b/agents/property.json.example index 899e7b73..80e0d74a 100644 --- a/agents/property.json.example +++ b/agents/property.json.example @@ -1617,6 +1617,237 @@ ] } ] + }, + { + "name": "camera.va.openai.azure", + "auto_start": true, + "nodes": [ + { + "type": "extension", + "extension_group": "default", + "addon": "agora_rtc", + "name": "agora_rtc", + "property": { + "app_id": "", + "token": "", + "channel": "astra_agents_test", + 
"stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "subscribe_video": true, + "publish_audio": true, + "publish_data": true, + "enable_agora_asr": true, + "agora_asr_vendor_name": "microsoft", + "agora_asr_language": "en-US", + "agora_asr_vendor_key": "", + "agora_asr_vendor_region": "", + "agora_asr_session_control_file_path": "session_control.conf" + } + }, + { + "type": "extension", + "extension_group": "default", + "addon": "interrupt_detector", + "name": "interrupt_detector" + }, + { + "type": "extension", + "extension_group": "chatgpt", + "addon": "openai_chatgpt_python", + "name": "openai_chatgpt", + "property": { + "base_url": "", + "api_key": "", + "frequency_penalty": 0.9, + "model": "gpt-4o-mini", + "max_tokens": 512, + "prompt": "", + "proxy_url": "", + "greeting": "ASTRA agent connected. How can i help you today?", + "max_memory_length": 10 + } + }, + { + "type": "extension", + "extension_group": "tts", + "addon": "azure_tts", + "name": "azure_tts", + "property": { + "azure_subscription_key": "", + "azure_subscription_region": "", + "azure_synthesis_voice_name": "en-US-JaneNeural" + } + }, + { + "type": "extension", + "extension_group": "transcriber", + "addon": "chat_transcriber", + "name": "chat_transcriber" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "default" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "chatgpt" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "tts" + }, + { + "type": "extension_group", + "addon": "default_extension_group", + "name": "transcriber" + } + ], + "connections": [ + { + "extension_group": "default", + "extension": "agora_rtc", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "default", + "extension": "interrupt_detector" + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber" + } + ] + } + ], + "image_frame": [ + { + "name": "image_frame", + "dest": [ + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt" + } + ] + } + ] + }, + { + "extension_group": "chatgpt", + "extension": "openai_chatgpt", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber", + "cmd_conversions": [ + { + "cmd": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "is_final", + "type": "fixed_value", + "value": "bool(true)" + }, + { + "path": "stream_id", + "type": "fixed_value", + "value": "uint32(999)" + } + ] + } + } + ] + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "tts", + "extension": "azure_tts" + } + ] + } + ] + }, + { + "extension_group": "tts", + "extension": "azure_tts", + "pcm_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "transcriber", + "extension": "chat_transcriber", + "data": [ + { + "name": "data", + "dest": [ + { + "extension_group": "default", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "default", + "extension": "interrupt_detector", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "chatgpt", + 
"extension": "openai_chatgpt" + } + ] + } + ] + } + ] } ] } From 1e30d1b3790fcc1c3de057fac9699fd5fcb1fc3b Mon Sep 17 00:00:00 2001 From: zhangqianze Date: Sun, 11 Aug 2024 01:03:23 +0800 Subject: [PATCH 5/9] feat: finalize camera video feature --- .../openai_chatgpt_python/manifest.json | 2 +- .../openai_chatgpt_extension.py | 379 ++++-------------- 2 files changed, 78 insertions(+), 303 deletions(-) diff --git a/agents/addon/extension/openai_chatgpt_python/manifest.json b/agents/addon/extension/openai_chatgpt_python/manifest.json index f325cdd8..8193a637 100644 --- a/agents/addon/extension/openai_chatgpt_python/manifest.json +++ b/agents/addon/extension/openai_chatgpt_python/manifest.json @@ -48,7 +48,7 @@ "max_memory_length": { "type": "int64" }, - "enable_vision": { + "enable_tools": { "type": "bool" } }, diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py index ec11d11d..1964d997 100644 --- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py +++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py @@ -5,6 +5,7 @@ # Copyright (c) 2024 Agora IO. All rights reserved. # # +import traceback from rte.image_frame import ImageFrame from .openai_chatgpt import OpenAIChatGPT, OpenAIChatGPTConfig from datetime import datetime @@ -44,7 +45,7 @@ PROPERTY_TOP_P = "top_p" # Optional PROPERTY_MAX_TOKENS = "max_tokens" # Optional PROPERTY_GREETING = "greeting" # Optional -PROPERTY_ENABLE_VISION = "enable_vision" # Optional +PROPERTY_ENABLE_TOOLS = "enable_tools" # Optional PROPERTY_PROXY_URL = "proxy_url" # Optional PROPERTY_MAX_MEMORY_LENGTH = "max_memory_length" # Optional @@ -78,42 +79,13 @@ def parse_sentence(sentence, content): return sentence, remain, found_punc - -def yuv420_to_rgb(yuv_data, width, height): - # Calculate the size of each plane - frame_size = width * height - chroma_size = frame_size // 4 - - y_plane = yuv_data[0:frame_size].reshape((height, width)) - u_plane = yuv_data[frame_size:frame_size + chroma_size].reshape((height // 2, width // 2)) - v_plane = yuv_data[frame_size + chroma_size:].reshape((height // 2, width // 2)) - - u_plane = u_plane.repeat(2, axis=0).repeat(2, axis=1) - v_plane = v_plane.repeat(2, axis=0).repeat(2, axis=1) - - # Ensure calculations are done in a wider data type to prevent overflow - y_plane = y_plane.astype(np.int16) - u_plane = u_plane.astype(np.int16) - v_plane = v_plane.astype(np.int16) - - # Convert YUV to RGB using the standard conversion formula - r_plane = y_plane + 1.402 * (v_plane - 128) - g_plane = y_plane - 0.344136 * (u_plane - 128) - 0.714136 * (v_plane - 128) - b_plane = y_plane + 1.772 * (u_plane - 128) - - # Clip values to the 0-255 range and convert to uint8 - r_plane = np.clip(r_plane, 0, 255).astype(np.uint8) - g_plane = np.clip(g_plane, 0, 255).astype(np.uint8) - b_plane = np.clip(b_plane, 0, 255).astype(np.uint8) - - # Stack the RGB planes into an image - rgb_image = np.stack([r_plane, g_plane, b_plane], axis=-1) - - return rgb_image - def rgb2base64jpeg(rgb_data, width, height): # Convert the RGB image to a PIL Image - pil_image = Image.fromarray(rgb_data) + pil_image = Image.frombytes('RGBA', (width, height), bytes(rgb_data)) + pil_image = pil_image.convert('RGB') + + # Resize the image while maintaining its aspect ratio + pil_image = resize_image_keep_aspect(pil_image, 320) # Save the image to a BytesIO object in JPEG format buffered = BytesIO() @@ -131,36 +103,44 @@ def 
rgb2base64jpeg(rgb_data, width, height): base64_url = f"data:{mime_type};base64,{base64_encoded_image}" return base64_url -def yuv2base64png(yuv_data, width, height): - # Convert YUV to RGB - rgb_image = yuv420_to_rgb(np.frombuffer(yuv_data, dtype=np.uint8), width, height) - - # Convert the RGB image to a PIL Image - pil_image = Image.fromarray(rgb_image) - pil_image = pil_image.resize((width//2, height//2)) - - # Save the image to a BytesIO object in PNG format - buffered = BytesIO() - pil_image.save(buffered, format="JPEG") - pil_image.save("test.jpg", format="JPEG") - - # Get the byte data of the PNG image - png_image_data = buffered.getvalue() - - # Convert the PNG byte data to a Base64 encoded string - base64_encoded_image = b64encode(png_image_data).decode('utf-8') - - # Create the data URL - mime_type = 'image/jpeg' - base64_url = f"data:{mime_type};base64,{base64_encoded_image}" - return base64_url +def resize_image_keep_aspect(image, max_size=512): + """ + Resize an image while maintaining its aspect ratio, ensuring the larger dimension is max_size. + If both dimensions are smaller than max_size, the image is not resized. + + :param image: A PIL Image object + :param max_size: The maximum size for the larger dimension (width or height) + :return: A PIL Image object (resized or original) + """ + # Get current width and height + width, height = image.size + + # If both dimensions are already smaller than max_size, return the original image + if width <= max_size and height <= max_size: + return image + + # Calculate the aspect ratio + aspect_ratio = width / height + + # Determine the new dimensions + if width > height: + new_width = max_size + new_height = int(max_size / aspect_ratio) + else: + new_height = max_size + new_width = int(max_size * aspect_ratio) + + # Resize the image with the new dimensions + resized_image = image.resize((new_width, new_height)) + + return resized_image class OpenAIChatGPTExtension(Extension): memory = [] max_memory_length = 10 outdate_ts = 0 openai_chatgpt = None - enable_vision = False + enable_tools = False image_data = None image_width = 0 image_height = 0 @@ -168,8 +148,8 @@ class OpenAIChatGPTExtension(Extension): available_tools = [{ "type": "function", "function": { - "name": "get_camera_image", - "description": "Get the camera image which is being used. Call this whenever you need to understand the camera video like you have an eye, for example when a customer asks 'What can you see?'", + "name": "get_vision_image", + "description": "Get the image from camera. Call this whenever you need to understand the input camera image like you have vision capability, for example when user asks 'What can you see?' 
or 'Can you see me?'", }, "strict": True, }] @@ -258,9 +238,9 @@ def on_start(self, rte: RteEnv) -> None: logger.info(f"GetProperty optional {PROPERTY_GREETING} failed, err: {err}") try: - self.enable_vision = rte.get_property_bool(PROPERTY_ENABLE_VISION) + self.enable_tools = rte.get_property_bool(PROPERTY_ENABLE_TOOLS) except Exception as err: - logger.info(f"GetProperty optional {PROPERTY_ENABLE_VISION} failed, err: {err}") + logger.info(f"GetProperty optional {PROPERTY_ENABLE_TOOLS} failed, err: {err}") try: prop_max_memory_length = rte.get_property_int(PROPERTY_MAX_MEMORY_LENGTH) @@ -305,229 +285,6 @@ def append_memory(self, message): self.memory.pop(0) self.memory.append(message) - # def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): - # try: - # logger.info( - # f"for input text: [{input_text}] memory: {memory}" - # ) - - # message = {"role": "user", "content": input_text} - - # if self.image_data is not None: - # url = yuv2base64png(self.image_data, self.image_width, self.image_height) - # # logger.info(f"image url: {url}") - # message = {"role": "user", "content": [ - # {"type": "text", "text": input_text}, - # { - # "type": "image_url", - # "image_url": { - # "url": url, - # } - # } - # ]} - - - # # Get result from AI - # resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message]) - # self.append_memory({"role": "user", "content": input_text}) - # if resp is None: - # logger.info( - # f"for input text: [{input_text}] failed" - # ) - # return - - # sentence = "" - # full_content = "" - # first_sentence_sent = False - - # for chat_completions in resp: - # if start_time < self.outdate_ts: - # logger.info( - # f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" - # ) - # break - - # if ( - # len(chat_completions.choices) > 0 - # ): - # if chat_completions.choices[0].delta.content is not None: - # content = chat_completions.choices[0].delta.content - # else: - # content = "" - - # full_content += content - - # while True: - # sentence, content, sentence_is_final = parse_sentence( - # sentence, content - # ) - # if len(sentence) == 0 or not sentence_is_final: - # logger.info(f"sentence {sentence} is empty or not final") - # break - # logger.info( - # f"recv for input text: [{input_text}] got sentence: [{sentence}]" - # ) - - # # send sentence - # try: - # output_data = Data.create("text_data") - # output_data.set_property_string( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - # ) - # output_data.set_property_bool( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False - # ) - # rte.send_data(output_data) - # logger.info( - # f"recv for input text: [{input_text}] sent sentence [{sentence}]" - # ) - # except Exception as err: - # logger.info( - # f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" - # ) - # break - - # sentence = "" - # if not first_sentence_sent: - # first_sentence_sent = True - # logger.info( - # f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" - # ) - - # # remember response as assistant content in memory - # self.append_memory({"role": "assistant", "content": full_content}) - - # # send end of segment - # try: - # output_data = Data.create("text_data") - # output_data.set_property_string( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - # ) - # output_data.set_property_bool( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True - # ) - # 
rte.send_data(output_data) - # logger.info( - # f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" - # ) - # except Exception as err: - # logger.info( - # f"for input text: [{input_text}] end of segment with sentence [{sentence}] send failed, err: {err}" - # ) - - # except Exception as e: - # logger.info( - # f"for input text: [{input_text}] failed, err: {e}" - # ) - - # def chat_completion(self, rte: RteEnv, start_time, input_text, memory): - # try: - # logger.info( - # f"for input text: [{input_text}] memory: {memory}" - # ) - - # message = {"role": "user", "content": input_text} - - - # # Get result from AI - # resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) - # self.append_memory({"role": "user", "content": input_text}) - # if resp is None: - # logger.info( - # f"for input text: [{input_text}] failed" - # ) - # return - - # sentence = "" - # full_content = "" - # first_sentence_sent = False - - # for chat_completions in resp: - # if start_time < self.outdate_ts: - # logger.info( - # f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}" - # ) - # break - - # if ( - # len(chat_completions.choices) > 0 - # ): - # if chat_completions.choices[0].delta.tool_calls is not None: - # for tool_call in chat_completions.choices[0].delta.tool_calls: - # logger.info(f"tool_call: {tool_call}") - # if tool_call.function.name == "get_camera_image": - # self.chat_completion_with_vision(rte, start_time, "tell me about this image using language in previous context", memory) - # return - # elif chat_completions.choices[0].delta.content is not None: - # content = chat_completions.choices[0].delta.content - # else: - # content = "" - - # full_content += content - - # while True: - # sentence, content, sentence_is_final = parse_sentence( - # sentence, content - # ) - # if len(sentence) == 0 or not sentence_is_final: - # logger.info(f"sentence {sentence} is empty or not final") - # break - # logger.info( - # f"recv for input text: [{input_text}] got sentence: [{sentence}]" - # ) - - # # send sentence - # try: - # output_data = Data.create("text_data") - # output_data.set_property_string( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - # ) - # output_data.set_property_bool( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, False - # ) - # rte.send_data(output_data) - # logger.info( - # f"recv for input text: [{input_text}] sent sentence [{sentence}]" - # ) - # except Exception as err: - # logger.info( - # f"recv for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}" - # ) - # break - - # sentence = "" - # if not first_sentence_sent: - # first_sentence_sent = True - # logger.info( - # f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms" - # ) - - # # remember response as assistant content in memory - # self.append_memory({"role": "assistant", "content": full_content}) - - # # send end of segment - # try: - # output_data = Data.create("text_data") - # output_data.set_property_string( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence - # ) - # output_data.set_property_bool( - # DATA_OUT_TEXT_DATA_PROPERTY_TEXT_END_OF_SEGMENT, True - # ) - # rte.send_data(output_data) - # logger.info( - # f"for input text: [{input_text}] end of segment with sentence [{sentence}] sent" - # ) - # except Exception as err: - # logger.info( - # f"for input text: [{input_text}] end of segment with sentence 
[{sentence}] send failed, err: {err}" - # ) - - # except Exception as e: - # logger.info( - # f"for input text: [{input_text}] failed, err: {e}" - # ) - def on_cmd(self, rte: RteEnv, cmd: Cmd) -> None: logger.info("OpenAIChatGPTExtension on_cmd") cmd_json = cmd.to_json() @@ -605,7 +362,7 @@ def chat_completions_stream_worker(start_time, input_text, memory): thread.start() logger.info(f"OpenAIChatGPTExtension on_data end") - def send_data(rte, sentence, end_of_segment, input_text, logger): + def send_data(self, rte, sentence, end_of_segment, input_text): try: output_data = Data.create("text_data") output_data.set_property_string(DATA_OUT_TEXT_DATA_PROPERTY_TEXT, sentence) @@ -615,17 +372,34 @@ def send_data(rte, sentence, end_of_segment, input_text, logger): except Exception as err: logger.info(f"for input text: [{input_text}] send sentence [{sentence}] failed, err: {err}") - def process_completions(self, chat_completions, rte, start_time, input_text, memory, logger): + def process_completions(self, chat_completions, rte, start_time, input_text, memory): sentence = "" full_content = "" first_sentence_sent = False for chat_completion in chat_completions: + content = "" if start_time < self.outdate_ts: logger.info(f"recv interrupt and flushing for input text: [{input_text}], startTs: {start_time}, outdateTs: {self.outdate_ts}") break - content = chat_completion.choices[0].delta.content if len(chat_completion.choices) > 0 and chat_completion.choices[0].delta.content is not None else "" + # content = chat_completion.choices[0].delta.content if len(chat_completion.choices) > 0 and chat_completion.choices[0].delta.content is not None else "" + if ( + len(chat_completion.choices) > 0 + ): + if chat_completion.choices[0].delta.tool_calls is not None: + for tool_call in chat_completion.choices[0].delta.tool_calls: + logger.info(f"tool_call: {tool_call}") + if tool_call.function.name == "get_vision_image": + self.chat_completion_with_vision(rte, start_time, input_text, memory) + return + elif chat_completion.choices[0].delta.content is not None: + content = chat_completion.choices[0].delta.content + else: + content = "" + + # memory is only confirmed when tool is confirmed + self.append_memory({"role": "user", "content": input_text}) full_content += content @@ -635,7 +409,7 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem logger.info(f"sentence {sentence} is empty or not final") break logger.info(f"recv for input text: [{input_text}] got sentence: [{sentence}]") - send_data(rte, sentence, False, input_text, logger) + self.send_data(rte, sentence, False, input_text) sentence = "" if not first_sentence_sent: @@ -643,7 +417,7 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem logger.info(f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms") self.append_memory({"role": "assistant", "content": full_content}) - send_data(rte, sentence, True, input_text, logger) + self.send_data(rte, sentence, True, input_text) def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memory): try: @@ -651,38 +425,39 @@ def chat_completion_with_vision(self, rte: RteEnv, start_time, input_text, memor message = {"role": "user", "content": input_text} if self.image_data is not None: - url = yuv2base64png(self.image_data, self.image_width, self.image_height) + url = rgb2base64jpeg(self.image_data, self.image_width, self.image_height) message = {"role": "user", "content": [ 
{"type": "text", "text": input_text}, {"type": "image_url", "image_url": {"url": url}} ]} + logger.info(f"msg: {message}") resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message]) - self.append_memory({"role": "user", "content": input_text}) if resp is None: - log_and_return(logger, input_text, "Response is None") + logger.error(f"get_chat_completions_stream Response is None: {input_text}") return - process_completions(resp, rte, start_time, input_text, memory, logger) + self.process_completions(resp, rte, start_time, input_text, memory) except Exception as e: - log_and_return(logger, input_text, str(e)) + logger.error(f"err: {str(e)}: {input_text}") def chat_completion(self, rte: RteEnv, start_time, input_text, memory): try: logger.info(f"for input text: [{input_text}] memory: {memory}") message = {"role": "user", "content": input_text} - - resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], self.available_tools) - self.append_memory({"role": "user", "content": input_text}) + + tools = self.available_tools if self.enable_tools else None + logger.info(f"chat_completion tools: {tools}") + resp = self.openai_chatgpt.get_chat_completions_stream(memory + [message], tools) if resp is None: - log_and_return(logger, input_text, "Response is None") + logger.error(f"get_chat_completions_stream Response is None: {input_text}") return - process_completions(resp, rte, start_time, input_text, memory, logger) + self.process_completions(resp, rte, start_time, input_text, memory) except Exception as e: - log_and_return(logger, input_text, str(e)) + logger.error(f"err: {traceback.format_exc()}: {input_text}") @register_addon_as_extension("openai_chatgpt_python") class OpenAIChatGPTExtensionAddon(Addon): From 0ebe90903cb4b4b21d936128951367af2808a187 Mon Sep 17 00:00:00 2001 From: zhangqianze Date: Sun, 11 Aug 2024 01:08:25 +0800 Subject: [PATCH 6/9] feat: update propery.json.example --- .../openai_chatgpt_python/openai_chatgpt_extension.py | 1 + agents/property.json.example | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py index 1964d997..f8d82e8e 100644 --- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py +++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py @@ -148,6 +148,7 @@ class OpenAIChatGPTExtension(Extension): available_tools = [{ "type": "function", "function": { + # ensure you use gpt-4o or later model if you need image recognition, gpt-4o-mini does not work quite well in this case "name": "get_vision_image", "description": "Get the image from camera. Call this whenever you need to understand the input camera image like you have vision capability, for example when user asks 'What can you see?' or 'Can you see me?'", }, diff --git a/agents/property.json.example b/agents/property.json.example index b6483236..86c3bdc7 100644 --- a/agents/property.json.example +++ b/agents/property.json.example @@ -1443,7 +1443,8 @@ "prompt": "", "proxy_url": "", "greeting": "ASTRA agent connected. 
From 4616fcc10eb4feadb4dad7bcc22e05061f2c9468 Mon Sep 17 00:00:00 2001
From: zhangqianze
Date: Sun, 11 Aug 2024 02:04:47 +0800
Subject: [PATCH 7/9] feat: finalize camera change

---
 .../openai_chatgpt_python/openai_chatgpt_extension.py | 5 ++++-
 agents/property.json.example                          | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
index f8d82e8e..146215b6 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
@@ -90,7 +90,7 @@ def rgb2base64jpeg(rgb_data, width, height):
     # Save the image to a BytesIO object in JPEG format
     buffered = BytesIO()
     pil_image.save(buffered, format="JPEG")
-    pil_image.save("test.jpg", format="JPEG")
+    # pil_image.save("test.jpg", format="JPEG")
 
     # Get the byte data of the JPEG image
     jpeg_image_data = buffered.getvalue()
@@ -392,6 +392,9 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem
                 for tool_call in chat_completion.choices[0].delta.tool_calls:
                     logger.info(f"tool_call: {tool_call}")
                     if tool_call.function.name == "get_vision_image":
+                        if full_content == "":
+                            # if no text content yet, send a message asking the user to wait
+                            self.send_data(rte, "Let me take a look...", True, input_text)
                         self.chat_completion_with_vision(rte, start_time, input_text, memory)
                         return
                 elif chat_completion.choices[0].delta.content is not None:
diff --git a/agents/property.json.example b/agents/property.json.example
index 86c3bdc7..91b1424d 100644
--- a/agents/property.json.example
+++ b/agents/property.json.example
@@ -1420,7 +1420,8 @@
                         "agora_asr_language": "en-US",
                         "agora_asr_vendor_key": "",
                         "agora_asr_vendor_region": "",
-                        "agora_asr_session_control_file_path": "session_control.conf"
+                        "agora_asr_session_control_file_path": "session_control.conf",
+                        "subscribe_video_pix_fmt": 4
                     }
                 },
                 {

From be456d8a84dd3a4d14b2929bce00ee2763f64bd6 Mon Sep 17 00:00:00 2001
From: Zhang Qianze
Date: Sun, 11 Aug 2024 11:53:23 +0800
Subject: [PATCH 8/9] fix: avoid duplicate memory appending

---
 .../openai_chatgpt_python/openai_chatgpt_extension.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
index 146215b6..e22f5e29 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
@@ -402,9 +402,6 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem
             else:
                 content = ""
 
-            # memory is only confirmed when tool is confirmed
-            self.append_memory({"role": "user", "content": input_text})
-
             full_content += content
 
             while True:
@@ -420,6 +417,9 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem
             first_sentence_sent = True
             logger.info(f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms")
 
+
+        # memory is only confirmed when tool is confirmed
+        self.append_memory({"role": "user", "content": input_text})
         self.append_memory({"role": "assistant", "content": full_content})
         self.send_data(rte, sentence, True, input_text)
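Note on the memory rule this patch establishes: nothing is appended while tokens stream; exactly one user/assistant pair is recorded once the completion finishes, and the history stays bounded. Below is a standalone sketch of that invariant, simplified from append_memory and process_completions (illustrative only, not part of the patch).

max_memory_length = 10
memory = []

def append_memory(message):
    # bound the history the same way the extension does
    if len(memory) > max_memory_length:
        memory.pop(0)
    memory.append(message)

def record_turn(input_text, full_content):
    # called exactly once, after the stream has fully completed;
    # the tool-call branch returns early and never reaches this point
    append_memory({"role": "user", "content": input_text})
    append_memory({"role": "assistant", "content": full_content})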
From 074145e99f3997a6c53e20052e2c4cc826747c72 Mon Sep 17 00:00:00 2001
From: Zhang Qianze
Date: Sun, 11 Aug 2024 11:56:38 +0800
Subject: [PATCH 9/9] fix: adjust comments

---
 .../openai_chatgpt_python/openai_chatgpt_extension.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
index e22f5e29..e8f670e9 100644
--- a/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
+++ b/agents/addon/extension/openai_chatgpt_python/openai_chatgpt_extension.py
@@ -395,6 +395,7 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem
                         if full_content == "":
                             # if no text content yet, send a message asking the user to wait
                             self.send_data(rte, "Let me take a look...", True, input_text)
+                        # for get_vision_image, re-run the completion with vision; memory should not be affected
                         self.chat_completion_with_vision(rte, start_time, input_text, memory)
                         return
                 elif chat_completion.choices[0].delta.content is not None:
@@ -418,7 +419,7 @@ def process_completions(self, chat_completions, rte, start_time, input_text, mem
             logger.info(f"recv for input text: [{input_text}] first sentence sent, first_sentence_latency {get_current_time() - start_time}ms")
 
 
-        # memory is only confirmed when tool is confirmed
+        # memory is recorded only when the completion is fully done, as a single pair of user and assistant messages
         self.append_memory({"role": "user", "content": input_text})
         self.append_memory({"role": "assistant", "content": full_content})
         self.send_data(rte, sentence, True, input_text)
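Putting the vision pieces of this series together: resize with the aspect ratio kept (resize_image_keep_aspect), JPEG-encode and wrap as a base64 data URL (rgb2base64jpeg), then attach the result as an image_url part next to the text. The following is a condensed standalone sketch under the assumption that the frame arrives as a packed RGB888 buffer; the helper name frame_to_vision_message is illustrative and not part of the patch.

from base64 import b64encode
from io import BytesIO

import numpy as np
from PIL import Image

def frame_to_vision_message(rgb_data, width, height, text, max_size=512):
    # assumes a packed height x width x 3 uint8 RGB buffer
    array = np.frombuffer(rgb_data, dtype=np.uint8).reshape((height, width, 3))
    pil_image = Image.fromarray(array)

    # keep the aspect ratio, capping the larger side at max_size
    if max(width, height) > max_size:
        scale = max_size / max(width, height)
        pil_image = pil_image.resize((int(width * scale), int(height * scale)))

    # JPEG-encode and wrap as a base64 data URL
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    url = f"data:image/jpeg;base64,{b64encode(buffered.getvalue()).decode('utf-8')}"

    # two-part user message: the text plus the encoded frame
    return {"role": "user", "content": [
        {"type": "text", "text": text},
        {"type": "image_url", "image_url": {"url": url}},
    ]}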