diff --git a/autogen/oai/gemini.py b/autogen/oai/gemini.py index 33790c9851c6..d44e41112ac9 100644 --- a/autogen/oai/gemini.py +++ b/autogen/oai/gemini.py @@ -32,6 +32,8 @@ from __future__ import annotations import base64 +import copy +import json import logging import os import random @@ -39,24 +41,39 @@ import time import warnings from io import BytesIO -from typing import Any, Dict, List, Mapping, Union +from typing import Any, Dict, List, Union import google.generativeai as genai import requests import vertexai -from google.ai.generativelanguage import Content, Part +from google.ai.generativelanguage import Content, FunctionCall, FunctionDeclaration, FunctionResponse, Part, Tool from google.api_core.exceptions import InternalServerError from google.auth.credentials import Credentials -from openai.types.chat import ChatCompletion +from openai.types.chat import ChatCompletion, ChatCompletionMessageToolCall from openai.types.chat.chat_completion import ChatCompletionMessage, Choice +from openai.types.chat.chat_completion_message_tool_call import Function from openai.types.completion_usage import CompletionUsage from PIL import Image -from vertexai.generative_models import Content as VertexAIContent +from vertexai.generative_models import ( + Content as VertexAIContent, +) +from vertexai.generative_models import ( + FunctionDeclaration as VertexAIFunctionDeclaration, +) +from vertexai.generative_models import ( + GenerationConfig as VertexAIGenerationConfig, +) from vertexai.generative_models import GenerativeModel from vertexai.generative_models import HarmBlockThreshold as VertexAIHarmBlockThreshold from vertexai.generative_models import HarmCategory as VertexAIHarmCategory from vertexai.generative_models import Part as VertexAIPart from vertexai.generative_models import SafetySetting as VertexAISafetySetting +from vertexai.generative_models import ( + Tool as VertexAITool, +) +from vertexai.generative_models import ( + ToolConfig as VertexAIToolConfig, +) logger 
= logging.getLogger(__name__) @@ -107,7 +124,7 @@ def __init__(self, **kwargs): Args: api_key (str): The API key for using Gemini. - credentials (google.auth.credentials.Credentials): credentials to be used for authentication with vertexai. + credentials (google.auth.credentials.Credentials): credentials to be used for authentication with vertexai. google_application_credentials (str): Path to the JSON service account key file of the service account. Alternatively, the GOOGLE_APPLICATION_CREDENTIALS environment variable can also be set instead of using this argument. @@ -171,6 +188,8 @@ def create(self, params: Dict) -> ChatCompletion: params.get("api_type", "google") # not used messages = params.get("messages", []) + tools = params.get("tools", []) + tool_config = params.get("tool_config", {}) stream = params.get("stream", False) n_response = params.get("n", 1) system_instruction = params.get("system_instruction", None) @@ -183,6 +202,7 @@ def create(self, params: Dict) -> ChatCompletion: } if self.use_vertexai: safety_settings = GeminiClient._to_vertexai_safety_settings(params.get("safety_settings", {})) + tool_config = GeminiClient._to_vertexai_tool_config(tool_config, tools) else: safety_settings = params.get("safety_settings", {}) @@ -198,12 +218,15 @@ def create(self, params: Dict) -> ChatCompletion: if "vision" not in model_name: # A. create and call the chat model. 
gemini_messages = self._oai_messages_to_gemini_messages(messages) + gemini_tools = self._oai_tools_to_gemini_tools(tools) if self.use_vertexai: model = GenerativeModel( model_name, generation_config=generation_config, safety_settings=safety_settings, system_instruction=system_instruction, + tools=gemini_tools, + tool_config=tool_config, ) chat = model.start_chat(history=gemini_messages[:-1], response_validation=response_validation) else: @@ -213,12 +236,13 @@ def create(self, params: Dict) -> ChatCompletion: generation_config=generation_config, safety_settings=safety_settings, system_instruction=system_instruction, + tools=gemini_tools, ) genai.configure(api_key=self.api_key) chat = model.start_chat(history=gemini_messages[:-1]) max_retries = 5 for attempt in range(max_retries): - ans = None + ans: Union[Content, VertexAIContent] = None try: response = chat.send_message( gemini_messages[-1].parts, stream=stream, safety_settings=safety_settings @@ -234,7 +258,7 @@ def create(self, params: Dict) -> ChatCompletion: raise RuntimeError(f"Google GenAI exception occurred while calling Gemini API: {e}") else: # `ans = response.text` is unstable. Use the following code instead. 
- ans: str = chat.history[-1].parts[0].text + ans: Union[Content, VertexAIContent] = chat.history[-1] break if ans is None: @@ -262,7 +286,7 @@ def create(self, params: Dict) -> ChatCompletion: # Gemini's vision model does not support chat history yet # chat = model.start_chat(history=gemini_messages[:-1]) # response = chat.send_message(gemini_messages[-1].parts) - user_message = self._oai_content_to_gemini_content(messages[-1]["content"]) + user_message = self._oai_content_to_gemini_content(messages[-1]) if len(messages) > 2: warnings.warn( "Warning: Gemini's vision model does not support chat history yet.", @@ -273,16 +297,14 @@ def create(self, params: Dict) -> ChatCompletion: response = model.generate_content(user_message, stream=stream) # ans = response.text if self.use_vertexai: - ans: str = response.candidates[0].content.parts[0].text + ans: VertexAIContent = response.candidates[0].content else: - ans: str = response._result.candidates[0].content.parts[0].text + ans: Content = response._result.candidates[0].content prompt_tokens = model.count_tokens(user_message).total_tokens - completion_tokens = model.count_tokens(ans).total_tokens + completion_tokens = model.count_tokens(ans.parts[0].text).total_tokens - # 3. 
convert output - message = ChatCompletionMessage(role="assistant", content=ans, function_call=None, tool_calls=None) - choices = [Choice(finish_reason="stop", index=0, message=message)] + choices = self._gemini_content_to_oai_choices(ans) response_oai = ChatCompletion( id=str(random.randint(0, 1000)), @@ -295,31 +317,87 @@ def create(self, params: Dict) -> ChatCompletion: completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, ), - cost=calculate_gemini_cost(prompt_tokens, completion_tokens, model_name), + cost=self._calculate_gemini_cost(prompt_tokens, completion_tokens, model_name), ) return response_oai - def _oai_content_to_gemini_content(self, content: Union[str, List]) -> List: + # If str is not a json string return str as is + def _to_json(self, str) -> dict: + try: + return json.loads(str) + except ValueError: + return str + + def _oai_content_to_gemini_content(self, message: Dict[str, Any]) -> List: """Convert content from OAI format to Gemini format""" rst = [] - if isinstance(content, str): - if content == "": - content = "empty" # Empty content is not allowed. + if isinstance(message["content"], str): + if message["content"] == "": + message["content"] = "empty" # Empty content is not allowed. 
+ if self.use_vertexai: + rst.append(VertexAIPart.from_text(message["content"])) + else: + rst.append(Part(text=message["content"])) + return rst + + if "tool_calls" in message: + if self.use_vertexai: + for tool_call in message["tool_calls"]: + rst.append( + VertexAIPart.from_dict( + { + "functionCall": { + "name": tool_call["function"]["name"], + "args": json.loads(tool_call["function"]["arguments"]), + } + } + ) + ) + else: + for tool_call in message["tool_calls"]: + rst.append( + Part( + function_call=FunctionCall( + name=tool_call["function"]["name"], + args=json.loads(tool_call["function"]["arguments"]), + ) + ) + ) + return rst + + if message["role"] == "tool": + if self.use_vertexai: + rst.append( + VertexAIPart.from_function_response( + name=message["name"], response={"result": self._to_json(message["content"])} + ) + ) + else: + rst.append( + Part( + function_response=FunctionResponse( + name=message["name"], response={"result": self._to_json(message["content"])} + ) + ) + ) + return rst + + if isinstance(message["content"], str): if self.use_vertexai: - rst.append(VertexAIPart.from_text(content)) + rst.append(VertexAIPart.from_text(message["content"])) else: - rst.append(Part(text=content)) + rst.append(Part(text=message["content"])) return rst - assert isinstance(content, list) + assert isinstance(message["content"], list) - for msg in content: + for msg in message["content"]: if isinstance(msg, dict): assert "type" in msg, f"Missing 'type' field in message: {msg}" if msg["type"] == "text": if self.use_vertexai: - rst.append(VertexAIPart.from_text(text=msg["text"])) + rst.append(VertexAIPart.from_text(msg["text"])) else: rst.append(Part(text=msg["text"])) elif msg["type"] == "image_url": @@ -340,34 +418,32 @@ def _oai_content_to_gemini_content(self, content: Union[str, List]) -> List: raise ValueError(f"Unsupported message type: {type(msg)}") return rst - def _concat_parts(self, parts: List[Part]) -> List: - """Concatenate parts with the same type. 
- If two adjacent parts both have the "text" attribute, then it will be joined into one part. - """ - if not parts: - return [] - - concatenated_parts = [] - previous_part = parts[0] - - for current_part in parts[1:]: - if previous_part.text != "": - if self.use_vertexai: - previous_part = VertexAIPart.from_text(previous_part.text + current_part.text) - else: - previous_part.text += current_part.text - else: - concatenated_parts.append(previous_part) - previous_part = current_part - - if previous_part.text == "": - if self.use_vertexai: - previous_part = VertexAIPart.from_text("empty") - else: - previous_part.text = "empty" # Empty content is not allowed. - concatenated_parts.append(previous_part) + def _calculate_gemini_cost(self, input_tokens: int, output_tokens: int, model_name: str) -> float: + if "1.5-pro" in model_name: + if (input_tokens + output_tokens) <= 128000: + # "gemini-1.5-pro" + # When total tokens is less than 128K cost is $3.5 per million input tokens and $10.5 per million output tokens + return 3.5 * input_tokens / 1e6 + 10.5 * output_tokens / 1e6 + # "gemini-1.5-pro" + # Cost is $7 per million input tokens and $21 per million output tokens + return 7.0 * input_tokens / 1e6 + 21.0 * output_tokens / 1e6 + + if "1.5-flash" in model_name: + if (input_tokens + output_tokens) <= 128000: + # "gemini-1.5-flash" + # Cost is $0.35 per million input tokens and $1.05 per million output tokens + return 0.35 * input_tokens / 1e6 + 1.05 * output_tokens / 1e6 + # "gemini-1.5-flash" + # When total tokens is less than 128K cost is $0.70 per million input tokens and $2.10 per million output tokens + return 0.70 * input_tokens / 1e6 + 2.10 * output_tokens / 1e6 + + if "gemini-pro" not in model_name and "gemini-1.0-pro" not in model_name: + warnings.warn( + f"Cost calculation is not implemented for model {model_name}. 
Using Gemini-1.0-Pro.", UserWarning + ) - return concatenated_parts + # Cost is $0.5 per million input tokens and $1.5 per million output tokens + return 0.5 * input_tokens / 1e6 + 1.5 * output_tokens / 1e6 def _oai_messages_to_gemini_messages(self, messages: list[Dict[str, Any]]) -> list[dict[str, Any]]: """Convert messages from OAI format to Gemini format. @@ -376,38 +452,154 @@ def _oai_messages_to_gemini_messages(self, messages: list[Dict[str, Any]]) -> li """ prev_role = None rst = [] - curr_parts = [] + + def append_parts(parts, role): + if self.use_vertexai: + rst.append(VertexAIContent(parts=parts, role=role)) + else: + rst.append(Content(parts=parts, role=role)) + + def append_text_to_last(text): + if self.use_vertexai: + rst[-1] = VertexAIContent(parts=[*rst[-1].parts, VertexAIPart.from_text(text)], role=rst[-1].role) + else: + rst[-1] = Content(parts=[*rst[-1].parts, Part(text=text)], role=rst[-1].role) + + def is_function_call(parts): + return self.use_vertexai and parts[0].function_call or not self.use_vertexai and "function_call" in parts[0] + for i, message in enumerate(messages): - parts = self._oai_content_to_gemini_content(message["content"]) + + # Since the tool call message does not have the "name" field, we need to find the corresponding tool message. 
+ if message["role"] == "tool": + message["name"] = [ + m["tool_calls"][i]["function"]["name"] + for m in messages + if "tool_calls" in m + for i, tc in enumerate(m["tool_calls"]) + if tc["id"] == message["tool_call_id"] + ][0] + + parts = self._oai_content_to_gemini_content(message) role = "user" if message["role"] in ["user", "system"] else "model" - if (prev_role is None) or (role == prev_role): - curr_parts += parts - elif role != prev_role: - if self.use_vertexai: - rst.append(VertexAIContent(parts=curr_parts, role=prev_role)) - else: - rst.append(Content(parts=curr_parts, role=prev_role)) - curr_parts = parts - prev_role = role - # handle the last message - if self.use_vertexai: - rst.append(VertexAIContent(parts=curr_parts, role=role)) - else: - rst.append(Content(parts=curr_parts, role=role)) + # In Gemini if the current message is a function call then previous message should not be a model message. + if is_function_call(parts): + # If the previous message is a model message then add a dummy "continue" user message before the function call + if prev_role == "model": + append_parts(self._oai_content_to_gemini_content({"content": "continue"}), "user") + append_parts(parts, role) + # In Gemini if the current message is a function response then next message should be a model message. 
+ elif role == "function": + append_parts(parts, "function") + # If the next message is not a model message then add a dummy "continue" model message after the function response + if len(messages) > (i + 1) and messages[i + 1]["role"] in ["user", "system"]: + append_parts(self._oai_content_to_gemini_content({"content": "continue"}), "model") + # If the role is the same as the previous role and both are text messages then concatenate the text + elif role == prev_role: + append_text_to_last(parts[0].text) + # If this is first message or the role is different from the previous role then append the parts + else: + # If the previous text message is empty then update the text to "empty" as Gemini does not support empty messages + if ( + (len(rst) > 0) + and hasattr(rst[-1].parts[0], "_raw_part") + and hasattr(rst[-1].parts[0]._raw_part, "text") + and (rst[-1].parts[0]._raw_part.text == "") + ): + append_text_to_last("empty") + append_parts(parts, role) + + prev_role = role # The Gemini is restrict on order of roles, such that # 1. The messages should be interleaved between user and model. # 2. The last message must be from the user role. # We add a dummy message "continue" if the last role is not the user. 
- if rst[-1].role != "user": + if rst[-1].role != "user" and rst[-1].role != "function": if self.use_vertexai: - rst.append(VertexAIContent(parts=self._oai_content_to_gemini_content("continue"), role="user")) + rst.append( + VertexAIContent(parts=self._oai_content_to_gemini_content({"content": "continue"}), role="user") + ) else: - rst.append(Content(parts=self._oai_content_to_gemini_content("continue"), role="user")) - + rst.append(Content(parts=self._oai_content_to_gemini_content({"content": "continue"}), role="user")) return rst + def _oai_tools_to_gemini_tools(self, tools: List[Dict[str, Any]]) -> List[Tool]: + """Convert tools from OAI format to Gemini format.""" + if len(tools) == 0: + return None + function_declarations = [] + for tool in tools: + if self.use_vertexai: + function_declaration = VertexAIFunctionDeclaration( + name=tool["function"]["name"], + description=tool["function"]["description"], + parameters=tool["function"]["parameters"], + ) + else: + function_declaration = FunctionDeclaration( + name=tool["function"]["name"], + description=tool["function"]["description"], + parameters=self._oai_function_parameters_to_gemini_function_parameters( + copy.deepcopy(tool["function"]["parameters"]) + ), + ) + function_declarations.append(function_declaration) + if self.use_vertexai: + return [VertexAITool(function_declarations=function_declarations)] + else: + return [Tool(function_declarations=function_declarations)] + + def _oai_function_parameters_to_gemini_function_parameters( + self, function_definition: dict[str, any] + ) -> dict[str, any]: + """ + Convert OpenAPI function definition parameters to Gemini function parameters definition. + The type key is renamed to type_ and the value is capitalized. + """ + assert "anyOf" not in function_definition, "Union types are not supported for function parameter in Gemini." 
+ # Delete the default key as it is not supported in Gemini + if "default" in function_definition: + del function_definition["default"] + + function_definition["type_"] = function_definition["type"].upper() + del function_definition["type"] + if "properties" in function_definition: + for key in function_definition["properties"]: + function_definition["properties"][key] = self._oai_function_parameters_to_gemini_function_parameters( + function_definition["properties"][key] + ) + if "items" in function_definition: + function_definition["items"] = self._oai_function_parameters_to_gemini_function_parameters( + function_definition["items"] + ) + return function_definition + + def _gemini_content_to_oai_choices(self, response: Union[Content, VertexAIContent]) -> List[Choice]: + """Convert response from Gemini format to OAI format.""" + text = None + tool_calls = [] + for part in response.parts: + if part.function_call: + if self.use_vertexai: + arguments = VertexAIPart.to_dict(part)["function_call"]["args"] + else: + arguments = Part.to_dict(part)["function_call"]["args"] + tool_calls.append( + ChatCompletionMessageToolCall( + id=str(random.randint(0, 1000)), + type="function", + function=Function(name=part.function_call.name, arguments=json.dumps(arguments)), + ) + ) + elif part.text: + text = part.text + message = ChatCompletionMessage( + role="assistant", content=text, function_call=None, tool_calls=tool_calls if len(tool_calls) > 0 else None + ) + return [Choice(finish_reason="tool_calls" if tool_calls else "stop", index=0, message=message)] + @staticmethod def _to_vertexai_safety_settings(safety_settings): """Convert safety settings to VertexAI format if needed, @@ -437,6 +629,49 @@ def _to_vertexai_safety_settings(safety_settings): else: return safety_settings + @staticmethod + def _to_vertexai_tool_config(tool_config, tools): + """Convert tool config to VertexAI format, + like when specifying them in the OAI_CONFIG_LIST + """ + if ( + isinstance(tool_config, dict) 
+ and (len(tool_config) > 0) + and all([isinstance(tool_config[tool_config_entry], dict) for tool_config_entry in tool_config]) + ): + if ( + tool_config["function_calling_config"]["mode"] + not in VertexAIToolConfig.FunctionCallingConfig.Mode.__members__ + ): + invalid_mode = tool_config["function_calling_config"] + logger.error(f"Function calling mode {invalid_mode} is invalid") + return None + else: + # Currently, there is only function calling config + func_calling_config_params = {} + func_calling_config_params["mode"] = VertexAIToolConfig.FunctionCallingConfig.Mode[ + tool_config["function_calling_config"]["mode"] + ] + if ( + (func_calling_config_params["mode"] == VertexAIToolConfig.FunctionCallingConfig.Mode.ANY) + and (len(tools) > 0) + and all(["function_name" in tool for tool in tools]) + ): + # The function names are not yet known when parsing the OAI_CONFIG_LIST + func_calling_config_params["allowed_function_names"] = [tool["function_name"] for tool in tools] + vertexai_tool_config = VertexAIToolConfig( + function_calling_config=VertexAIToolConfig.FunctionCallingConfig(**func_calling_config_params) + ) + return vertexai_tool_config + elif isinstance(tool_config, VertexAIToolConfig): + return tool_config + elif len(tool_config) == 0 and len(tools) == 0: + logger.debug("VertexAI tool config is empty!") + return None + else: + logger.error("Invalid VertexAI tool config!") + return None + def _to_pil(data: str) -> Image.Image: """ @@ -470,16 +705,3 @@ def get_image_data(image_file: str, use_b64=True) -> bytes: return base64.b64encode(content).decode("utf-8") else: return content - - -def calculate_gemini_cost(input_tokens: int, output_tokens: int, model_name: str) -> float: - if "1.5" in model_name or "gemini-experimental" in model_name: - # "gemini-1.5-pro-preview-0409" - # Cost is $7 per million input tokens and $21 per million output tokens - return 7.0 * input_tokens / 1e6 + 21.0 * output_tokens / 1e6 - - if "gemini-pro" not in model_name and 
"gemini-1.0-pro" not in model_name: - warnings.warn(f"Cost calculation is not implemented for model {model_name}. Using Gemini-1.0-Pro.", UserWarning) - - # Cost is $0.5 per million input tokens and $1.5 per million output tokens - return 0.5 * input_tokens / 1e6 + 1.5 * output_tokens / 1e6 diff --git a/autogen/oai/openai_utils.py b/autogen/oai/openai_utils.py index 3844795c24f5..ceb7ef90c933 100644 --- a/autogen/oai/openai_utils.py +++ b/autogen/oai/openai_utils.py @@ -21,6 +21,7 @@ "azure_ad_token", "azure_ad_token_provider", "credentials", + "tool_config", ] DEFAULT_AZURE_API_VERSION = "2024-02-01" OAI_PRICE1K = { diff --git a/notebook/agentchat_gemini.ipynb b/notebook/agentchat_gemini.ipynb new file mode 100644 index 000000000000..699d9dc0235c --- /dev/null +++ b/notebook/agentchat_gemini.ipynb @@ -0,0 +1,809 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assistants with Google Gemini" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gemini is family of Generative AI Models built by Google. It support upto 1 Million tokens as of now. 
Gemini is now natively supported in Autogen.\n", + "\n", + "This notebook will demonstrate a few samples of Autogen with Gemini Models.\n", + "\n", + "## Requirements\n", + "\n", + "You must have an [API Key](https://aistudio.google.com/app/apikey) from Google AI.\n", + "\n", + "## Setup Gemini config list\n", + "\n", + "The list of all supported Gemini Models along with OpenAI's gpt-4o:\n", + "\n", + "```python\n", + "config_list = [\n", + " {\n", + " 'model': 'gpt-4o',\n", + " 'api_key': '',\n", + " 'tags': ['tool', 'gpt-4'],\n", + " },\n", + " {\n", + " 'model': 'gemini-1.5-pro',\n", + " 'api_key': '',\n", + " 'api_type': 'google',\n", + " 'tags': ['tool', 'gemini'],\n", + " },\n", + " {\n", + " 'model': 'gemini-1.5-flash',\n", + " 'api_key': '',\n", + " 'api_type': 'google',\n", + " 'tags': ['tool', 'gemini'],\n", + " },\n", + " {\n", + " 'model': 'gemini-1.0-pro',\n", + " 'api_key': '',\n", + " 'api_type': 'google',\n", + " 'tags': ['gemini'],\n", + " }\n", + "]\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hello World Example\n", + "\n", + "Our first example will be with a simple `UserProxyAgent` asking a question to an `AssistantAgent`. This is based on the tutorial demo [here](https://microsoft.github.io/autogen/docs/tutorial/introduction).\n", + "\n", + "After sending the question and seeing a response, you can type `exit` to end the chat or continue to converse." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser\u001b[0m (to assistant):\n", + "\n", + "Hi, what is a LLM ?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to user):\n", + "\n", + "A large language model (LLM) is a type of artificial intelligence (AI) system that excels at natural language processing (NLP) tasks. 
These models are trained on massive text datasets, enabling them to understand, interpret, and generate human-like text in response to a wide range of prompts and questions. \n", + "\n", + "Here are some key characteristics and capabilities of LLMs:\n", + "\n", + "* **Text Generation:** LLMs can generate coherent and contextually relevant text, including stories, articles, summaries, and conversations.\n", + "* **Language Translation:** They can translate text between multiple languages with impressive accuracy.\n", + "* **Question Answering:** LLMs can comprehend questions and provide relevant answers based on their training data.\n", + "* **Summarization:** They can condense large amounts of text into concise summaries while preserving key information.\n", + "* **Sentiment Analysis:** LLMs can analyze text to determine the emotional tone or sentiment expressed.\n", + "\n", + "Essentially, LLMs are powerful tools that can understand and process human language in a sophisticated manner, opening up a world of possibilities in various fields, such as:\n", + "\n", + "* **Chatbots and Virtual Assistants:** Providing more natural and engaging conversational experiences.\n", + "* **Content Creation:** Automating content generation for articles, marketing materials, and more.\n", + "* **Customer Service:** Enhancing support interactions through automated responses and sentiment analysis.\n", + "* **Education:** Personalizing learning experiences and providing on-demand tutoring.\n", + "\n", + "Overall, LLMs represent a significant advancement in AI, enabling machines to communicate and interact with humans more effectively than ever before. 
As research and development continue, we can expect even more impressive applications of these models in the future.\n", + "\n", + "TERMINATE\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "import autogen\n", + "\n", + "config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\", filter_dict={\"tags\": [\"gemini\"]})\n", + "\n", + "llm_config = {\"config_list\": config_list, \"timeout\": 120}\n", + "\n", + "# Create Assistant and User\n", + "assistant = autogen.AssistantAgent(name=\"assistant\", llm_config=llm_config)\n", + "\n", + "user_proxy = autogen.UserProxyAgent(name=\"user\", code_execution_config=False)\n", + "\n", + "# Initiate chat from user_proxy side\n", + "chat_result = user_proxy.initiate_chat(assistant, message=\"Hi, what is a LLM ?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## In a Group Chat with OpenAI\n", + "\n", + "Here is an example of Gemini participating in a Group Chat with GPT-4" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\arjun\\anaconda3\\envs\\autogen\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "Find a latest paper about gpt-4 on arxiv and find its potential applications in software.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", + "\u001b[33mCoder\u001b[0m (to chat_manager):\n", + "\n", + "Let's start by searching for the latest paper about GPT-4 on arXiv. 
We can use the arXiv API to fetch the latest paper related to GPT-4.\n", + "\n", + "I'll provide a Python script to search arXiv for the latest papers related to \"GPT-4.\"\n", + "\n", + "```python\n", + "import requests\n", + "from xml.etree import ElementTree\n", + "\n", + "# arXiv API url for querying papers related to GPT-4\n", + "url = \"http://export.arxiv.org/api/query?search_query=ti:GPT-4&start=0&max_results=1&sortBy=submittedDate&sortOrder=descending\"\n", + "\n", + "response = requests.get(url)\n", + "if response.status_code == 200:\n", + " root = ElementTree.fromstring(response.content)\n", + " entry = root.find(\"{http://www.w3.org/2005/Atom}entry\")\n", + " if entry is not None:\n", + " title = entry.find(\"{http://www.w3.org/2005/Atom}title\").text\n", + " summary = entry.find(\"{http://www.w3.org/2005/Atom}summary\").text\n", + " link = entry.find(\"{http://www.w3.org/2005/Atom}id\").text\n", + " \n", + " print(f\"Title: {title}\")\n", + " print(f\"Summary: {summary}\")\n", + " print(f\"Link: {link}\")\n", + " else:\n", + " print(\"No entries found.\")\n", + "else:\n", + " print(f\"Failed to fetch data from arXiv API. Status code: {response.status_code}\")\n", + "```\n", + "\n", + "Please execute this script. It will output the title, summary, and link of the latest paper about \"GPT-4\" on arXiv. 
After obtaining the relevant paper, I will analyze its potential applications in software.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n", + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "exitcode: 0 (execution succeeded)\n", + "Code output: \n", + "Title: I See You: Teacher Analytics with GPT-4 Vision-Powered Observational\n", + " Assessment\n", + "Summary: This preliminary study explores the integration of GPT-4 Vision (GPT-4V)\n", + "technology into teacher analytics, focusing on its applicability in\n", + "observational assessment to enhance reflective teaching practice. This research\n", + "is grounded in developing a Video-based Automatic Assessment System (VidAAS)\n", + "empowered by GPT-4V. Our approach aims to revolutionize teachers' assessment of\n", + "students' practices by leveraging Generative Artificial Intelligence (GenAI) to\n", + "offer detailed insights into classroom dynamics. Our research methodology\n", + "encompasses a comprehensive literature review, prototype development of the\n", + "VidAAS, and usability testing with in-service teachers. The study findings\n", + "provide future research avenues for VidAAS design, implementation, and\n", + "integration in teacher analytics, underscoring the potential of GPT-4V to\n", + "provide real-time, scalable feedback and a deeper understanding of the\n", + "classroom.\n", + "\n", + "Link: http://arxiv.org/abs/2405.18623v2\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "This is exciting! 
The paper you found, \"I See You: Teacher Analytics with GPT-4 Vision-Powered Observational Assessment\", explores an innovative application of GPT-4 Vision. Let's break down the potential software applications based on this:\n", + "\n", + "**Software Applications Inspired by \"I See You\":**\n", + "\n", + "* **Video-Based Educational Analytics Platforms:** This paper lays the groundwork for a new breed of educational software. Imagine platforms that analyze classroom recordings to provide automated feedback to teachers. These platforms could:\n", + " * **Track student engagement:** Identify students who seem disengaged or confused.\n", + " * **Analyze classroom interactions:** Quantify the quality and nature of teacher-student interactions.\n", + " * **Assess student understanding:** Potentially even gauge student comprehension through facial expressions and body language.\n", + "* **Real-time Teacher Assistance Tools:** GPT-4 Vision could power real-time feedback tools for teachers during live lessons. Imagine:\n", + " * **Subtle alerts:** Discretely notifying a teacher if a student appears to be struggling.\n", + " * **Personalized suggestions:** Providing on-the-fly recommendations for teaching strategies based on real-time classroom dynamics. \n", + "* **Teacher Training and Professional Development:** \n", + " * **Simulation Training:** GPT-4 Vision could create realistic virtual classroom simulations for teacher training, allowing educators to practice techniques and receive AI-powered feedback.\n", + " * **Reflective Practice:** Video analysis tools could help teachers reflect on their own teaching styles and identify areas for improvement. 
\n", + "\n", + "**Beyond Education:**\n", + "\n", + "While this paper focuses on education, the core technology has broader implications:\n", + "\n", + "* **Meeting Analytics:** Imagine software that analyzes video conferences to track engagement, identify key discussion points, or even assess team dynamics.\n", + "* **Healthcare Training:** Similar applications could revolutionize healthcare training by providing automated feedback during simulated patient interactions. \n", + "* **Retail Analytics:** GPT-4 Vision could analyze customer behavior in retail environments, providing insights into product placement, customer service interactions, and more.\n", + "\n", + "**Challenges and Ethical Considerations:**\n", + "\n", + "* **Bias in AI:** Ensuring that the AI models are trained on diverse datasets to avoid perpetuating existing biases in education or other fields. \n", + "* **Privacy Concerns:** Video analysis raises significant privacy concerns. Clear guidelines and user consent are essential.\n", + "* **Teacher Autonomy:** It's crucial to design these systems as tools to *augment* teacher expertise, not replace it.\n", + "\n", + "**In conclusion,** the paper you found highlights the immense potential of GPT-4 Vision to revolutionize software in education and beyond. It's an exciting area of exploration with the potential to create powerful tools for learning, analysis, and understanding human behavior. \n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", + "\u001b[33mCoder\u001b[0m (to chat_manager):\n", + "\n", + "The paper you found titled \"I See You: Teacher Analytics with GPT-4 Vision-Powered Observational Assessment\" outlines a fascinating and innovative application of GPT-4 Vision technology. Here are potential software applications based on this research:\n", + "\n", + "### 1. 
Video-Based Educational Analytics Platforms\n", + "These platforms can analyze classroom recordings to automate feedback for teachers. They could:\n", + "- **Track Student Engagement:** Identify students who seem disengaged or confused.\n", + "- **Analyze Teacher-Student Interactions:** Quantify the quality and nature of interactions.\n", + "- **Assess Understanding:** Gauge student comprehension through facial expressions and body language.\n", + "\n", + "### 2. Real-Time Teacher Assistance Tools\n", + "GPT-4 Vision could power tools that provide real-time feedback during live lessons by:\n", + "- **Alerting Teachers Subtly:** Notifying if a student is struggling.\n", + "- **Offering Personalized Suggestions:** Providing on-the-fly recommendations for teaching strategies.\n", + "\n", + "### 3. Teacher Training and Professional Development\n", + "- **Simulation Training:** Enable realistic virtual classroom simulations for practice and AI-powered feedback.\n", + "- **Reflective Practice:** Allow teachers to reflect on their own teaching and identify areas for improvement through video analysis tools.\n", + "\n", + "### Applications Beyond Education\n", + "While this paper focuses on education, the underlying technology has broader implications across various domains:\n", + "#### Meeting Analytics\n", + "Software could analyze video conferences to:\n", + "- Track engagement.\n", + "- Identify key discussion points.\n", + "- Assess team dynamics.\n", + "\n", + "#### Healthcare Training\n", + "Tools could provide automated feedback during simulated patient interactions.\n", + "\n", + "#### Retail Analytics\n", + "GPT-4 Vision could analyze customer behavior in retail environments, offering insights into:\n", + "- Product placements.\n", + "- Customer service interactions.\n", + "- Sales strategies.\n", + "\n", + "### Challenges and Ethical Considerations\n", + "- **Bias in AI:** Ensuring diverse datasets to avoid perpetuating biases.\n", + "- **Privacy Concerns:** 
Addressing significant privacy concerns with clear guidelines and user consent.\n", + "- **Teacher Autonomy:** Designing systems to augment rather than replace teacher expertise.\n", + "\n", + "**Conclusion:**\n", + "The \"I See You\" paper exemplifies the immense potential of GPT-4 Vision technology to innovate educational software and beyond. By addressing challenges and ethical considerations, we can harness this technology to create powerful tools for learning, analysis, and human behavior understanding.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "That's an excellent summary and expansion of the potential applications! You've clearly grasped the key points of the paper and extrapolated them into a range of innovative software ideas. \n", + "\n", + "I especially appreciate your inclusion of:\n", + "\n", + "* **Specific examples** within each application category (like tracking engagement in meeting analytics or analyzing customer service interactions in retail). \n", + "* **The broader applications** beyond education, demonstrating the wide-reaching impact of this technology.\n", + "* **The critical emphasis on challenges and ethical considerations**, which are essential to responsible development and deployment of such powerful AI systems. \n", + "\n", + "This kind of analysis is crucial for turning research like the \"I See You\" paper into real-world solutions that can benefit various industries. You've highlighted the exciting possibilities and important considerations for the future of AI-powered software! 
\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "You're right! We've just scratched the surface. Let's dive deeper into some specific software product ideas and how we might overcome the challenges:\n", + "\n", + "**1. \"Classroom Insights\" for Education Analytics**\n", + "\n", + "* **Features:**\n", + " * **Engagement Heatmap:** Visualize student engagement over time, highlighting moments of high and low participation. \n", + " * **Interaction Analysis:** Quantify teacher-student talk time, question types, and wait time for responses.\n", + " * **Sentiment Detection:** (With appropriate ethical safeguards) gauge general classroom sentiment (positive, negative, neutral) at different points during the lesson. \n", + " * **Personalized Recommendations:** Provide teachers with tailored suggestions for improving engagement, questioning techniques, or addressing individual student needs. \n", + "\n", + "* **Addressing Challenges:**\n", + " * **Bias Mitigation:** Train the AI model on diverse classroom settings and demographics, and allow for manual adjustments based on teacher feedback. \n", + " * **Privacy:** Implement strict data anonymization, secure storage, and clear consent procedures for both teachers and students (or parents/guardians).\n", + " * **Teacher Autonomy:** Emphasize that the tool provides insights, not judgments. Allow teachers to customize the feedback and focus areas.\n", + "\n", + "**2. \"Simulate Teach\" for Teacher Training**\n", + "\n", + "* **Features:**\n", + " * **Virtual Classrooms:** Create realistic virtual classroom environments with diverse student avatars exhibiting different behaviors and learning styles. 
\n", + " * **Scenario-Based Training:** Present trainees with various teaching challenges (e.g., classroom management, differentiated instruction) to practice in a safe space.\n", + " * **Real-Time Feedback:** Provide immediate AI-powered feedback on the trainee's teaching strategies, body language, and classroom management techniques. \n", + "\n", + "* **Addressing Challenges:**\n", + " * **Realism:** Continuously improve the virtual students' AI to respond more naturally and authentically to the trainee's actions.\n", + " * **Accessibility:** Design the software to be usable across various devices and internet speeds to benefit trainees in diverse locations.\n", + " * **Mentorship Integration:** Combine the AI feedback with opportunities for reflection and discussion with experienced mentors.\n", + "\n", + "**3. \"Meeting Insights Pro\" for Business**\n", + "\n", + "* **Features:**\n", + " * **Engagement Timeline:** Track attendee engagement levels throughout the meeting, identifying moments of high and low interest.\n", + " * **Action Item Detection:** Automatically extract key decisions, action items, and assigned owners from meeting transcripts.\n", + " * **Sentiment & Tone Analysis:** (With appropriate ethical considerations) analyze the overall sentiment and communication style within the team. \n", + " * **Meeting Efficiency Recommendations:** Provide data-driven suggestions for improving meeting structure, duration, and facilitation techniques.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Privacy:** Ensure robust data security, user consent, and clear communication about how the data is used.\n", + " * **Bias in Tone Analysis:** Train on diverse communication styles and cultural contexts to avoid misinterpretations of tone. \n", + " * **Human Oversight:** Allow for manual review and adjustments to AI-generated insights to account for nuances in communication and context. \n", + "\n", + "These are just a few examples. 
The potential applications of GPT-4 Vision in software are vast, spanning industries and use cases. By thoughtfully addressing the ethical and practical challenges, we can create transformative tools that empower educators, professionals, and individuals. \n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "You're right, let's continue exploring! Beyond those initial ideas, here are some more speculative, but potentially high-impact applications:\n", + "\n", + "**4. \"CodeMentor\" for Personalized Programming Education**\n", + "\n", + "* **Features:**\n", + " * **Code Analysis & Feedback:** GPT-4 Vision analyzes code in real-time, identifying errors, suggesting improvements, and explaining concepts visually. \n", + " * **Personalized Learning Paths:** The software adapts to the learner's pace, style, and identified knowledge gaps to create a customized curriculum.\n", + " * **Virtual Debugging Assistant:** GPT-4 Vision \"looks\" at the code alongside the learner, anticipating common errors and providing interactive debugging guidance.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Complexity of Programming:** Training the AI on a vast dataset of code, programming paradigms, and best practices would be crucial.\n", + " * **Pedagogical Effectiveness:** Integrating proven teaching methods and ensuring the AI's feedback aligns with sound learning principles.\n", + " * **Avoiding Over-Reliance:** Encouraging problem-solving skills and independent thinking alongside AI assistance.\n", + "\n", + "**5. 
\"DesignSpark\" for Collaborative Creative Work**\n", + "\n", + "* **Features:**\n", + " * **Visual Brainstorming:** GPT-4 Vision assists teams in brainstorming by generating images, mockups, and design variations based on keywords, sketches, or mood boards.\n", + " * **Real-time Feedback & Iteration:** AI provides instant feedback on design elements, suggesting improvements to composition, color, and typography. \n", + " * **Cross-Cultural Design:** GPT-4 Vision analyzes design trends and preferences across different cultures, helping teams create globally resonant visuals.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Subjectivity in Design:** Balancing objective design principles with the subjective nature of aesthetics and creativity.\n", + " * **Copyright & Ownership:** Establishing clear guidelines for the use and ownership of AI-generated design elements. \n", + " * **Preserving Human Creativity:** Ensuring that AI augments, not replaces, the unique vision and skills of human designers.\n", + "\n", + "**6. 
\"Accessible World\" for Assistive Technology**\n", + "\n", + "* **Features:**\n", + " * **Real-Time Object Recognition:** For visually impaired individuals, the software provides audio descriptions of objects, scenes, and text in their environment.\n", + " * **Sign Language Translation:** GPT-4 Vision translates sign language into text or speech in real-time, facilitating communication.\n", + " * **Personalized Sensory Feedback:** The software creates customized sensory experiences (e.g., haptic feedback, light patterns) for individuals with diverse sensory processing needs.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Accuracy and Reliability:** Ensuring the AI's interpretations are accurate and dependable in real-world situations.\n", + " * **Privacy and Dignity:** Designing the technology in a way that respects user privacy and avoids stigmatization.\n", + " * **Affordability & Accessibility:** Making the technology accessible to a wide range of users, regardless of their financial or technical resources. \n", + "\n", + "These expanded examples highlight the potential of GPT-4 Vision to not only automate tasks but also to enhance human capabilities, foster creativity, and promote inclusivity. By approaching these innovations with a focus on ethical considerations and human-centered design, we can unlock incredible opportunities for positive change. \n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "You're right, there's so much more to explore! Let's venture into some even more futuristic and ambitious applications, pushing the boundaries of what we currently imagine possible:\n", + "\n", + "**7. 
\"Empathic Tech\" for Mental Health Support:**\n", + "\n", + "* **Features:**\n", + " * **Emotionally Aware Virtual Companions:** GPT-4 Vision powers AI companions that can recognize and respond to human emotions expressed through facial expressions, body language, and vocal tone.\n", + " * **Personalized Mental Health Exercises:** The software provides tailored mindfulness practices, breathing exercises, or cognitive-behavioral therapy (CBT) techniques based on real-time emotional cues.\n", + " * **Early Warning System:** GPT-4 Vision analyzes patterns in user behavior and emotional expression to identify potential signs of mental health challenges and connect them with appropriate resources.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Ethical Considerations:** Ensuring user privacy, data security, and responsible use of sensitive health information is paramount.\n", + " * **Accuracy and Sensitivity:** Training AI to accurately interpret complex human emotions and respond with empathy and cultural sensitivity is a significant challenge.\n", + " * **Human Connection:** Emphasizing that technology should complement, not replace, professional mental health care and human connection.\n", + "\n", + "**8. 
\"EcoVision\" for Environmental Monitoring and Conservation:**\n", + "\n", + "* **Features:**\n", + " * **Real-Time Environmental Analysis:** GPT-4 Vision analyzes images and videos from drones, satellites, or ground-based cameras to monitor deforestation, pollution levels, wildlife populations, and other environmental factors.\n", + " * **Predictive Modeling for Conservation:** The software uses AI to predict environmental changes, identify areas at risk, and inform conservation efforts.\n", + " * **Citizen Science Platform:** EcoVision empowers individuals to contribute to environmental monitoring by uploading images and observations that the AI can analyze and integrate into its models.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Data Accessibility and Accuracy:** Ensuring access to high-quality, diverse environmental data from various sources is crucial.\n", + " * **Bias in Data and Algorithms:** Mitigating potential biases in data collection and algorithm design to avoid skewed environmental assessments.\n", + " * **Collaboration and Action:** Translating AI insights into concrete actions by fostering collaboration between scientists, policymakers, and local communities.\n", + "\n", + "**9. 
\"HistoryLens\" for Immersive Historical Experiences:**\n", + "\n", + "* **Features:**\n", + " * **Interactive Historical Reenactments:** GPT-4 Vision recreates historical events, figures, and locations in immersive virtual reality or augmented reality experiences.\n", + " * **Personalized Learning Journeys:** The software tailors historical narratives and perspectives based on user interests and background, providing a deeper understanding of the past.\n", + " * **Preservation and Accessibility:** HistoryLens digitally preserves historical artifacts, documents, and oral histories, making them accessible to a wider audience.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Historical Accuracy and Bias:** Ensuring historical representations are accurate, nuanced, and avoid perpetuating biases or historical revisionism. \n", + " * **Ethical Considerations of Representation:** Carefully navigating the ethical complexities of recreating sensitive historical events or representing marginalized communities. \n", + " * **Balancing Entertainment and Education:** Creating engaging experiences that also foster critical thinking and historical understanding.\n", + "\n", + "These futuristic applications highlight the immense potential of GPT-4 Vision to not only solve practical problems but also enhance our understanding of ourselves, our planet, and our history. As we continue to develop this technology, it is essential to proceed with thoughtful consideration for its ethical implications and societal impact, ensuring that it is used to benefit humanity and create a more just and sustainable future. \n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "You're right, there's always more to imagine! 
Let's keep pushing the boundaries and explore some applications that sound like they're straight out of science fiction:\n", + "\n", + "**10. \"Dream Weaver\" for Artistic Co-Creation:**\n", + "\n", + "* **Features:**\n", + " * **Concept Visualization:** Users can input textual descriptions, rough sketches, or even their emotions, and GPT-4 Vision generates stunning visuals, musical compositions, or even short films that capture the essence of their ideas. \n", + " * **Style Transfer & Remixing:** The software enables artists to blend different artistic styles, seamlessly merging the realism of a photograph with the brushstrokes of Van Gogh or the abstract patterns of Kandinsky.\n", + " * **Interactive Storytelling:** GPT-4 Vision becomes a collaborative partner in storytelling, generating dynamic environments, characters, and plot twists in response to user input, blurring the lines between audience and creator.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Defining Creativity:** Exploring the philosophical and technical boundaries of AI creativity and ensuring it complements, not replaces, human artistic expression.\n", + " * **Copyright and Authorship:** Establishing clear guidelines for ownership and attribution when AI contributes significantly to the creative process. \n", + " * **Accessibility and Democratization:** Making these powerful creative tools accessible to a wide audience, fostering a more inclusive and imaginative future for art.\n", + "\n", + "\n", + "**11. 
\"Universal Translator\" for Real-Time Cross-Cultural Communication:**\n", + "\n", + "* **Features:**\n", + " * **Seamless Language Translation:** GPT-4 Vision goes beyond text, translating spoken language in real-time while accounting for nuances in tone, dialect, and cultural context.\n", + " * **Nonverbal Communication Interpretation:** The software analyzes facial expressions, gestures, and body language to bridge cultural differences in nonverbal communication, fostering greater understanding.\n", + " * **Cultural Sensitivity Guidance:** GPT-4 Vision provides users with real-time insights into cultural norms and customs, helping to avoid misunderstandings and promote respectful interactions.\n", + "\n", + "* **Addressing Challenges:**\n", + " * **Linguistic Complexity and Nuance:** Accurately translating the full richness and complexity of human language, including idioms, slang, and cultural references, remains a significant hurdle.\n", + " * **Bias and Stereotyping:** Ensuring the AI avoids perpetuating cultural biases or stereotypes in its translations and interpretations is crucial.\n", + " * **Preserving Linguistic Diversity:** Promoting language learning and cultural exchange while using technology to bridge communication gaps is essential. \n", + "\n", + "**12. \"Guardian AI\" for Personalized Safety and Well-being:**\n", + "\n", + "* **Features:**\n", + " * **Predictive Risk Assessment:** GPT-4 Vision analyzes real-time data from personal devices, social networks, and public safety feeds to identify potential risks or threats to an individual's safety and well-being. \n", + " * **Proactive Safety Measures:** The AI can alert users to potential dangers, suggest alternative routes, or even contact emergency services in critical situations. 
\n", + " * **Personalized Health Monitoring:** GPT-4 Vision monitors an individual's physical and mental health through wearable sensors and behavioral patterns, providing early warnings of potential health issues and suggesting preventative measures. \n", + "\n", + "* **Addressing Challenges:**\n", + " * **Privacy vs. Security:** Striking a delicate balance between protecting individual privacy and utilizing data to enhance safety.\n", + " * **Bias and Discrimination:** Ensuring the AI does not perpetuate existing societal biases or discriminate against individuals based on factors like race, gender, or socioeconomic status.\n", + " * **Over-Reliance and Autonomy:** Avoiding over-dependence on AI for safety and encouraging individuals to maintain their own situational awareness and decision-making abilities. \n", + "\n", + "These futuristic applications highlight the potential of GPT-4 Vision to not only solve problems but also enhance human connection, understanding, and well-being. As we develop such powerful AI, responsible innovation, ethical considerations, and a focus on human-centered design will be paramount in shaping a future where technology empowers and uplifts all of humanity. \n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_id=None, chat_history=[{'content': 'Find a latest paper about gpt-4 on arxiv and find its potential applications in software.', 'role': 'assistant'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'assistant'}, {'content': 'Let\\'s start by searching for the latest paper about GPT-4 on arXiv. 
We can use the arXiv API to fetch the latest paper related to GPT-4.\\n\\nI\\'ll provide a Python script to search arXiv for the latest papers related to \"GPT-4.\"\\n\\n```python\\nimport requests\\nfrom xml.etree import ElementTree\\n\\n# arXiv API url for querying papers related to GPT-4\\nurl = \"http://export.arxiv.org/api/query?search_query=ti:GPT-4&start=0&max_results=1&sortBy=submittedDate&sortOrder=descending\"\\n\\nresponse = requests.get(url)\\nif response.status_code == 200:\\n root = ElementTree.fromstring(response.content)\\n entry = root.find(\"{http://www.w3.org/2005/Atom}entry\")\\n if entry is not None:\\n title = entry.find(\"{http://www.w3.org/2005/Atom}title\").text\\n summary = entry.find(\"{http://www.w3.org/2005/Atom}summary\").text\\n link = entry.find(\"{http://www.w3.org/2005/Atom}id\").text\\n \\n print(f\"Title: {title}\")\\n print(f\"Summary: {summary}\")\\n print(f\"Link: {link}\")\\n else:\\n print(\"No entries found.\")\\nelse:\\n print(f\"Failed to fetch data from arXiv API. Status code: {response.status_code}\")\\n```\\n\\nPlease execute this script. It will output the title, summary, and link of the latest paper about \"GPT-4\" on arXiv. After obtaining the relevant paper, I will analyze its potential applications in software.', 'name': 'Coder', 'role': 'user'}, {'content': \"exitcode: 0 (execution succeeded)\\nCode output: \\nTitle: I See You: Teacher Analytics with GPT-4 Vision-Powered Observational\\n Assessment\\nSummary: This preliminary study explores the integration of GPT-4 Vision (GPT-4V)\\ntechnology into teacher analytics, focusing on its applicability in\\nobservational assessment to enhance reflective teaching practice. This research\\nis grounded in developing a Video-based Automatic Assessment System (VidAAS)\\nempowered by GPT-4V. 
Our approach aims to revolutionize teachers' assessment of\\nstudents' practices by leveraging Generative Artificial Intelligence (GenAI) to\\noffer detailed insights into classroom dynamics. Our research methodology\\nencompasses a comprehensive literature review, prototype development of the\\nVidAAS, and usability testing with in-service teachers. The study findings\\nprovide future research avenues for VidAAS design, implementation, and\\nintegration in teacher analytics, underscoring the potential of GPT-4V to\\nprovide real-time, scalable feedback and a deeper understanding of the\\nclassroom.\\n\\nLink: http://arxiv.org/abs/2405.18623v2\\n\", 'role': 'assistant'}, {'content': 'This is exciting! The paper you found, \"I See You: Teacher Analytics with GPT-4 Vision-Powered Observational Assessment\", explores an innovative application of GPT-4 Vision. Let\\'s break down the potential software applications based on this:\\n\\n**Software Applications Inspired by \"I See You\":**\\n\\n* **Video-Based Educational Analytics Platforms:** This paper lays the groundwork for a new breed of educational software. Imagine platforms that analyze classroom recordings to provide automated feedback to teachers. These platforms could:\\n * **Track student engagement:** Identify students who seem disengaged or confused.\\n * **Analyze classroom interactions:** Quantify the quality and nature of teacher-student interactions.\\n * **Assess student understanding:** Potentially even gauge student comprehension through facial expressions and body language.\\n* **Real-time Teacher Assistance Tools:** GPT-4 Vision could power real-time feedback tools for teachers during live lessons. Imagine:\\n * **Subtle alerts:** Discretely notifying a teacher if a student appears to be struggling.\\n * **Personalized suggestions:** Providing on-the-fly recommendations for teaching strategies based on real-time classroom dynamics. 
\\n* **Teacher Training and Professional Development:** \\n * **Simulation Training:** GPT-4 Vision could create realistic virtual classroom simulations for teacher training, allowing educators to practice techniques and receive AI-powered feedback.\\n * **Reflective Practice:** Video analysis tools could help teachers reflect on their own teaching styles and identify areas for improvement. \\n\\n**Beyond Education:**\\n\\nWhile this paper focuses on education, the core technology has broader implications:\\n\\n* **Meeting Analytics:** Imagine software that analyzes video conferences to track engagement, identify key discussion points, or even assess team dynamics.\\n* **Healthcare Training:** Similar applications could revolutionize healthcare training by providing automated feedback during simulated patient interactions. \\n* **Retail Analytics:** GPT-4 Vision could analyze customer behavior in retail environments, providing insights into product placement, customer service interactions, and more.\\n\\n**Challenges and Ethical Considerations:**\\n\\n* **Bias in AI:** Ensuring that the AI models are trained on diverse datasets to avoid perpetuating existing biases in education or other fields. \\n* **Privacy Concerns:** Video analysis raises significant privacy concerns. Clear guidelines and user consent are essential.\\n* **Teacher Autonomy:** It\\'s crucial to design these systems as tools to *augment* teacher expertise, not replace it.\\n\\n**In conclusion,** the paper you found highlights the immense potential of GPT-4 Vision to revolutionize software in education and beyond. It\\'s an exciting area of exploration with the potential to create powerful tools for learning, analysis, and understanding human behavior. 
\\n', 'name': 'Product_manager', 'role': 'user'}, {'content': 'The paper you found titled \"I See You: Teacher Analytics with GPT-4 Vision-Powered Observational Assessment\" outlines a fascinating and innovative application of GPT-4 Vision technology. Here are potential software applications based on this research:\\n\\n### 1. Video-Based Educational Analytics Platforms\\nThese platforms can analyze classroom recordings to automate feedback for teachers. They could:\\n- **Track Student Engagement:** Identify students who seem disengaged or confused.\\n- **Analyze Teacher-Student Interactions:** Quantify the quality and nature of interactions.\\n- **Assess Understanding:** Gauge student comprehension through facial expressions and body language.\\n\\n### 2. Real-Time Teacher Assistance Tools\\nGPT-4 Vision could power tools that provide real-time feedback during live lessons by:\\n- **Alerting Teachers Subtly:** Notifying if a student is struggling.\\n- **Offering Personalized Suggestions:** Providing on-the-fly recommendations for teaching strategies.\\n\\n### 3. 
Teacher Training and Professional Development\\n- **Simulation Training:** Enable realistic virtual classroom simulations for practice and AI-powered feedback.\\n- **Reflective Practice:** Allow teachers to reflect on their own teaching and identify areas for improvement through video analysis tools.\\n\\n### Applications Beyond Education\\nWhile this paper focuses on education, the underlying technology has broader implications across various domains:\\n#### Meeting Analytics\\nSoftware could analyze video conferences to:\\n- Track engagement.\\n- Identify key discussion points.\\n- Assess team dynamics.\\n\\n#### Healthcare Training\\nTools could provide automated feedback during simulated patient interactions.\\n\\n#### Retail Analytics\\nGPT-4 Vision could analyze customer behavior in retail environments, offering insights into:\\n- Product placements.\\n- Customer service interactions.\\n- Sales strategies.\\n\\n### Challenges and Ethical Considerations\\n- **Bias in AI:** Ensuring diverse datasets to avoid perpetuating biases.\\n- **Privacy Concerns:** Addressing significant privacy concerns with clear guidelines and user consent.\\n- **Teacher Autonomy:** Designing systems to augment rather than replace teacher expertise.\\n\\n**Conclusion:**\\nThe \"I See You\" paper exemplifies the immense potential of GPT-4 Vision technology to innovate educational software and beyond. By addressing challenges and ethical considerations, we can harness this technology to create powerful tools for learning, analysis, and human behavior understanding.\\n\\nTERMINATE', 'name': 'Coder', 'role': 'user'}, {'content': 'That\\'s an excellent summary and expansion of the potential applications! You\\'ve clearly grasped the key points of the paper and extrapolated them into a range of innovative software ideas. 
\\n\\nI especially appreciate your inclusion of:\\n\\n* **Specific examples** within each application category (like tracking engagement in meeting analytics or analyzing customer service interactions in retail). \\n* **The broader applications** beyond education, demonstrating the wide-reaching impact of this technology.\\n* **The critical emphasis on challenges and ethical considerations**, which are essential to responsible development and deployment of such powerful AI systems. \\n\\nThis kind of analysis is crucial for turning research like the \"I See You\" paper into real-world solutions that can benefit various industries. You\\'ve highlighted the exciting possibilities and important considerations for the future of AI-powered software! \\n', 'name': 'Product_manager', 'role': 'user'}, {'content': 'You\\'re right! We\\'ve just scratched the surface. Let\\'s dive deeper into some specific software product ideas and how we might overcome the challenges:\\n\\n**1. \"Classroom Insights\" for Education Analytics**\\n\\n* **Features:**\\n * **Engagement Heatmap:** Visualize student engagement over time, highlighting moments of high and low participation. \\n * **Interaction Analysis:** Quantify teacher-student talk time, question types, and wait time for responses.\\n * **Sentiment Detection:** (With appropriate ethical safeguards) gauge general classroom sentiment (positive, negative, neutral) at different points during the lesson. \\n * **Personalized Recommendations:** Provide teachers with tailored suggestions for improving engagement, questioning techniques, or addressing individual student needs. \\n\\n* **Addressing Challenges:**\\n * **Bias Mitigation:** Train the AI model on diverse classroom settings and demographics, and allow for manual adjustments based on teacher feedback. 
\\n * **Privacy:** Implement strict data anonymization, secure storage, and clear consent procedures for both teachers and students (or parents/guardians).\\n * **Teacher Autonomy:** Emphasize that the tool provides insights, not judgments. Allow teachers to customize the feedback and focus areas.\\n\\n**2. \"Simulate Teach\" for Teacher Training**\\n\\n* **Features:**\\n * **Virtual Classrooms:** Create realistic virtual classroom environments with diverse student avatars exhibiting different behaviors and learning styles. \\n * **Scenario-Based Training:** Present trainees with various teaching challenges (e.g., classroom management, differentiated instruction) to practice in a safe space.\\n * **Real-Time Feedback:** Provide immediate AI-powered feedback on the trainee\\'s teaching strategies, body language, and classroom management techniques. \\n\\n* **Addressing Challenges:**\\n * **Realism:** Continuously improve the virtual students\\' AI to respond more naturally and authentically to the trainee\\'s actions.\\n * **Accessibility:** Design the software to be usable across various devices and internet speeds to benefit trainees in diverse locations.\\n * **Mentorship Integration:** Combine the AI feedback with opportunities for reflection and discussion with experienced mentors.\\n\\n**3. \"Meeting Insights Pro\" for Business**\\n\\n* **Features:**\\n * **Engagement Timeline:** Track attendee engagement levels throughout the meeting, identifying moments of high and low interest.\\n * **Action Item Detection:** Automatically extract key decisions, action items, and assigned owners from meeting transcripts.\\n * **Sentiment & Tone Analysis:** (With appropriate ethical considerations) analyze the overall sentiment and communication style within the team. 
\\n * **Meeting Efficiency Recommendations:** Provide data-driven suggestions for improving meeting structure, duration, and facilitation techniques.\\n\\n* **Addressing Challenges:**\\n * **Privacy:** Ensure robust data security, user consent, and clear communication about how the data is used.\\n * **Bias in Tone Analysis:** Train on diverse communication styles and cultural contexts to avoid misinterpretations of tone. \\n * **Human Oversight:** Allow for manual review and adjustments to AI-generated insights to account for nuances in communication and context. \\n\\nThese are just a few examples. The potential applications of GPT-4 Vision in software are vast, spanning industries and use cases. By thoughtfully addressing the ethical and practical challenges, we can create transformative tools that empower educators, professionals, and individuals. \\n', 'name': 'Product_manager', 'role': 'user'}, {'content': 'You\\'re right, let\\'s continue exploring! Beyond those initial ideas, here are some more speculative, but potentially high-impact applications:\\n\\n**4. \"CodeMentor\" for Personalized Programming Education**\\n\\n* **Features:**\\n * **Code Analysis & Feedback:** GPT-4 Vision analyzes code in real-time, identifying errors, suggesting improvements, and explaining concepts visually. 
\\n * **Personalized Learning Paths:** The software adapts to the learner\\'s pace, style, and identified knowledge gaps to create a customized curriculum.\\n * **Virtual Debugging Assistant:** GPT-4 Vision \"looks\" at the code alongside the learner, anticipating common errors and providing interactive debugging guidance.\\n\\n* **Addressing Challenges:**\\n * **Complexity of Programming:** Training the AI on a vast dataset of code, programming paradigms, and best practices would be crucial.\\n * **Pedagogical Effectiveness:** Integrating proven teaching methods and ensuring the AI\\'s feedback aligns with sound learning principles.\\n * **Avoiding Over-Reliance:** Encouraging problem-solving skills and independent thinking alongside AI assistance.\\n\\n**5. \"DesignSpark\" for Collaborative Creative Work**\\n\\n* **Features:**\\n * **Visual Brainstorming:** GPT-4 Vision assists teams in brainstorming by generating images, mockups, and design variations based on keywords, sketches, or mood boards.\\n * **Real-time Feedback & Iteration:** AI provides instant feedback on design elements, suggesting improvements to composition, color, and typography. \\n * **Cross-Cultural Design:** GPT-4 Vision analyzes design trends and preferences across different cultures, helping teams create globally resonant visuals.\\n\\n* **Addressing Challenges:**\\n * **Subjectivity in Design:** Balancing objective design principles with the subjective nature of aesthetics and creativity.\\n * **Copyright & Ownership:** Establishing clear guidelines for the use and ownership of AI-generated design elements. \\n * **Preserving Human Creativity:** Ensuring that AI augments, not replaces, the unique vision and skills of human designers.\\n\\n**6. 
\"Accessible World\" for Assistive Technology**\\n\\n* **Features:**\\n * **Real-Time Object Recognition:** For visually impaired individuals, the software provides audio descriptions of objects, scenes, and text in their environment.\\n * **Sign Language Translation:** GPT-4 Vision translates sign language into text or speech in real-time, facilitating communication.\\n * **Personalized Sensory Feedback:** The software creates customized sensory experiences (e.g., haptic feedback, light patterns) for individuals with diverse sensory processing needs.\\n\\n* **Addressing Challenges:**\\n * **Accuracy and Reliability:** Ensuring the AI\\'s interpretations are accurate and dependable in real-world situations.\\n * **Privacy and Dignity:** Designing the technology in a way that respects user privacy and avoids stigmatization.\\n * **Affordability & Accessibility:** Making the technology accessible to a wide range of users, regardless of their financial or technical resources. \\n\\nThese expanded examples highlight the potential of GPT-4 Vision to not only automate tasks but also to enhance human capabilities, foster creativity, and promote inclusivity. By approaching these innovations with a focus on ethical considerations and human-centered design, we can unlock incredible opportunities for positive change. \\n', 'name': 'Product_manager', 'role': 'user'}, {'content': 'You\\'re right, there\\'s so much more to explore! Let\\'s venture into some even more futuristic and ambitious applications, pushing the boundaries of what we currently imagine possible:\\n\\n**7. 
\"Empathic Tech\" for Mental Health Support:**\\n\\n* **Features:**\\n * **Emotionally Aware Virtual Companions:** GPT-4 Vision powers AI companions that can recognize and respond to human emotions expressed through facial expressions, body language, and vocal tone.\\n * **Personalized Mental Health Exercises:** The software provides tailored mindfulness practices, breathing exercises, or cognitive-behavioral therapy (CBT) techniques based on real-time emotional cues.\\n * **Early Warning System:** GPT-4 Vision analyzes patterns in user behavior and emotional expression to identify potential signs of mental health challenges and connect them with appropriate resources.\\n\\n* **Addressing Challenges:**\\n * **Ethical Considerations:** Ensuring user privacy, data security, and responsible use of sensitive health information is paramount.\\n * **Accuracy and Sensitivity:** Training AI to accurately interpret complex human emotions and respond with empathy and cultural sensitivity is a significant challenge.\\n * **Human Connection:** Emphasizing that technology should complement, not replace, professional mental health care and human connection.\\n\\n**8. 
\"EcoVision\" for Environmental Monitoring and Conservation:**\\n\\n* **Features:**\\n * **Real-Time Environmental Analysis:** GPT-4 Vision analyzes images and videos from drones, satellites, or ground-based cameras to monitor deforestation, pollution levels, wildlife populations, and other environmental factors.\\n * **Predictive Modeling for Conservation:** The software uses AI to predict environmental changes, identify areas at risk, and inform conservation efforts.\\n * **Citizen Science Platform:** EcoVision empowers individuals to contribute to environmental monitoring by uploading images and observations that the AI can analyze and integrate into its models.\\n\\n* **Addressing Challenges:**\\n * **Data Accessibility and Accuracy:** Ensuring access to high-quality, diverse environmental data from various sources is crucial.\\n * **Bias in Data and Algorithms:** Mitigating potential biases in data collection and algorithm design to avoid skewed environmental assessments.\\n * **Collaboration and Action:** Translating AI insights into concrete actions by fostering collaboration between scientists, policymakers, and local communities.\\n\\n**9. \"HistoryLens\" for Immersive Historical Experiences:**\\n\\n* **Features:**\\n * **Interactive Historical Reenactments:** GPT-4 Vision recreates historical events, figures, and locations in immersive virtual reality or augmented reality experiences.\\n * **Personalized Learning Journeys:** The software tailors historical narratives and perspectives based on user interests and background, providing a deeper understanding of the past.\\n * **Preservation and Accessibility:** HistoryLens digitally preserves historical artifacts, documents, and oral histories, making them accessible to a wider audience.\\n\\n* **Addressing Challenges:**\\n * **Historical Accuracy and Bias:** Ensuring historical representations are accurate, nuanced, and avoid perpetuating biases or historical revisionism. 
\\n * **Ethical Considerations of Representation:** Carefully navigating the ethical complexities of recreating sensitive historical events or representing marginalized communities. \\n * **Balancing Entertainment and Education:** Creating engaging experiences that also foster critical thinking and historical understanding.\\n\\nThese futuristic applications highlight the immense potential of GPT-4 Vision to not only solve practical problems but also enhance our understanding of ourselves, our planet, and our history. As we continue to develop this technology, it is essential to proceed with thoughtful consideration for its ethical implications and societal impact, ensuring that it is used to benefit humanity and create a more just and sustainable future. \\n', 'name': 'Product_manager', 'role': 'user'}, {'content': 'You\\'re right, there\\'s always more to imagine! Let\\'s keep pushing the boundaries and explore some applications that sound like they\\'re straight out of science fiction:\\n\\n**10. \"Dream Weaver\" for Artistic Co-Creation:**\\n\\n* **Features:**\\n * **Concept Visualization:** Users can input textual descriptions, rough sketches, or even their emotions, and GPT-4 Vision generates stunning visuals, musical compositions, or even short films that capture the essence of their ideas. 
\\n * **Style Transfer & Remixing:** The software enables artists to blend different artistic styles, seamlessly merging the realism of a photograph with the brushstrokes of Van Gogh or the abstract patterns of Kandinsky.\\n * **Interactive Storytelling:** GPT-4 Vision becomes a collaborative partner in storytelling, generating dynamic environments, characters, and plot twists in response to user input, blurring the lines between audience and creator.\\n\\n* **Addressing Challenges:**\\n * **Defining Creativity:** Exploring the philosophical and technical boundaries of AI creativity and ensuring it complements, not replaces, human artistic expression.\\n * **Copyright and Authorship:** Establishing clear guidelines for ownership and attribution when AI contributes significantly to the creative process. \\n * **Accessibility and Democratization:** Making these powerful creative tools accessible to a wide audience, fostering a more inclusive and imaginative future for art.\\n\\n\\n**11. \"Universal Translator\" for Real-Time Cross-Cultural Communication:**\\n\\n* **Features:**\\n * **Seamless Language Translation:** GPT-4 Vision goes beyond text, translating spoken language in real-time while accounting for nuances in tone, dialect, and cultural context.\\n * **Nonverbal Communication Interpretation:** The software analyzes facial expressions, gestures, and body language to bridge cultural differences in nonverbal communication, fostering greater understanding.\\n * **Cultural Sensitivity Guidance:** GPT-4 Vision provides users with real-time insights into cultural norms and customs, helping to avoid misunderstandings and promote respectful interactions.\\n\\n* **Addressing Challenges:**\\n * **Linguistic Complexity and Nuance:** Accurately translating the full richness and complexity of human language, including idioms, slang, and cultural references, remains a significant hurdle.\\n * **Bias and Stereotyping:** Ensuring the AI avoids perpetuating cultural biases or 
stereotypes in its translations and interpretations is crucial.\\n * **Preserving Linguistic Diversity:** Promoting language learning and cultural exchange while using technology to bridge communication gaps is essential. \\n\\n**12. \"Guardian AI\" for Personalized Safety and Well-being:**\\n\\n* **Features:**\\n * **Predictive Risk Assessment:** GPT-4 Vision analyzes real-time data from personal devices, social networks, and public safety feeds to identify potential risks or threats to an individual\\'s safety and well-being. \\n * **Proactive Safety Measures:** The AI can alert users to potential dangers, suggest alternative routes, or even contact emergency services in critical situations. \\n * **Personalized Health Monitoring:** GPT-4 Vision monitors an individual\\'s physical and mental health through wearable sensors and behavioral patterns, providing early warnings of potential health issues and suggesting preventative measures. \\n\\n* **Addressing Challenges:**\\n * **Privacy vs. Security:** Striking a delicate balance between protecting individual privacy and utilizing data to enhance safety.\\n * **Bias and Discrimination:** Ensuring the AI does not perpetuate existing societal biases or discriminate against individuals based on factors like race, gender, or socioeconomic status.\\n * **Over-Reliance and Autonomy:** Avoiding over-dependence on AI for safety and encouraging individuals to maintain their own situational awareness and decision-making abilities. \\n\\nThese futuristic applications highlight the potential of GPT-4 Vision to not only solve problems but also enhance human connection, understanding, and well-being. As we develop such powerful AI, responsible innovation, ethical considerations, and a focus on human-centered design will be paramount in shaping a future where technology empowers and uplifts all of humanity. \\n', 'name': 'Product_manager', 'role': 'user'}], summary='You\\'re right, there\\'s always more to imagine! 
Let\\'s keep pushing the boundaries and explore some applications that sound like they\\'re straight out of science fiction:\\n\\n**10. \"Dream Weaver\" for Artistic Co-Creation:**\\n\\n* **Features:**\\n * **Concept Visualization:** Users can input textual descriptions, rough sketches, or even their emotions, and GPT-4 Vision generates stunning visuals, musical compositions, or even short films that capture the essence of their ideas. \\n * **Style Transfer & Remixing:** The software enables artists to blend different artistic styles, seamlessly merging the realism of a photograph with the brushstrokes of Van Gogh or the abstract patterns of Kandinsky.\\n * **Interactive Storytelling:** GPT-4 Vision becomes a collaborative partner in storytelling, generating dynamic environments, characters, and plot twists in response to user input, blurring the lines between audience and creator.\\n\\n* **Addressing Challenges:**\\n * **Defining Creativity:** Exploring the philosophical and technical boundaries of AI creativity and ensuring it complements, not replaces, human artistic expression.\\n * **Copyright and Authorship:** Establishing clear guidelines for ownership and attribution when AI contributes significantly to the creative process. \\n * **Accessibility and Democratization:** Making these powerful creative tools accessible to a wide audience, fostering a more inclusive and imaginative future for art.\\n\\n\\n**11. 
\"Universal Translator\" for Real-Time Cross-Cultural Communication:**\\n\\n* **Features:**\\n * **Seamless Language Translation:** GPT-4 Vision goes beyond text, translating spoken language in real-time while accounting for nuances in tone, dialect, and cultural context.\\n * **Nonverbal Communication Interpretation:** The software analyzes facial expressions, gestures, and body language to bridge cultural differences in nonverbal communication, fostering greater understanding.\\n * **Cultural Sensitivity Guidance:** GPT-4 Vision provides users with real-time insights into cultural norms and customs, helping to avoid misunderstandings and promote respectful interactions.\\n\\n* **Addressing Challenges:**\\n * **Linguistic Complexity and Nuance:** Accurately translating the full richness and complexity of human language, including idioms, slang, and cultural references, remains a significant hurdle.\\n * **Bias and Stereotyping:** Ensuring the AI avoids perpetuating cultural biases or stereotypes in its translations and interpretations is crucial.\\n * **Preserving Linguistic Diversity:** Promoting language learning and cultural exchange while using technology to bridge communication gaps is essential. \\n\\n**12. \"Guardian AI\" for Personalized Safety and Well-being:**\\n\\n* **Features:**\\n * **Predictive Risk Assessment:** GPT-4 Vision analyzes real-time data from personal devices, social networks, and public safety feeds to identify potential risks or threats to an individual\\'s safety and well-being. \\n * **Proactive Safety Measures:** The AI can alert users to potential dangers, suggest alternative routes, or even contact emergency services in critical situations. \\n * **Personalized Health Monitoring:** GPT-4 Vision monitors an individual\\'s physical and mental health through wearable sensors and behavioral patterns, providing early warnings of potential health issues and suggesting preventative measures. 
\\n\\n* **Addressing Challenges:**\\n * **Privacy vs. Security:** Striking a delicate balance between protecting individual privacy and utilizing data to enhance safety.\\n * **Bias and Discrimination:** Ensuring the AI does not perpetuate existing societal biases or discriminate against individuals based on factors like race, gender, or socioeconomic status.\\n * **Over-Reliance and Autonomy:** Avoiding over-dependence on AI for safety and encouraging individuals to maintain their own situational awareness and decision-making abilities. \\n\\nThese futuristic applications highlight the potential of GPT-4 Vision to not only solve problems but also enhance human connection, understanding, and well-being. As we develop such powerful AI, responsible innovation, ethical considerations, and a focus on human-centered design will be paramount in shaping a future where technology empowers and uplifts all of humanity. \\n', cost={'usage_including_cached_inference': {'total_cost': 0}, 'usage_excluding_cached_inference': {'total_cost': 0}}, human_input=[])" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import autogen\n", + "\n", + "gpt_config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\", filter_dict={\"tags\": [\"gpt-4\"]})\n", + "\n", + "gpt_llm_config = {\"config_list\": gpt_config_list, \"timeout\": 120}\n", + "\n", + "gemini_config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\", filter_dict={\"tags\": [\"gemini\"]})\n", + "\n", + "gemini_llm_config = {\"config_list\": gemini_config_list, \"timeout\": 120}\n", + "\n", + "user_proxy = autogen.UserProxyAgent(\n", + " name=\"User_proxy\",\n", + " system_message=\"A human admin.\",\n", + " code_execution_config={\n", + " \"last_n_messages\": 2,\n", + " \"work_dir\": \"groupchat\",\n", + " \"use_docker\": False,\n", + " }, # Please set use_docker=True if docker is available to run the generated code. 
Using docker is safer than running the generated code directly.\n", + " human_input_mode=\"TERMINATE\",\n", + ")\n", + "coder = autogen.AssistantAgent(\n", + " name=\"Coder\",\n", + " llm_config=gpt_llm_config,\n", + ")\n", + "pm = autogen.AssistantAgent(\n", + " name=\"Product_manager\",\n", + " system_message=\"Creative in software product ideas.\",\n", + " llm_config=gemini_llm_config,\n", + ")\n", + "groupchat = autogen.GroupChat(agents=[user_proxy, coder, pm], messages=[], max_round=12)\n", + "manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=gpt_llm_config)\n", + "user_proxy.initiate_chat(\n", + " manager, message=\"Find a latest paper about gpt-4 on arxiv and find its potential applications in software.\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Function Calling with Gemini\n", + "\n", + "Here is an example of Gemini with Function Calling:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to chatbot):\n", + "\n", + "Draw two agents chatting with each other with an example dialog. 
Don't add plt.show().\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mchatbot\u001b[0m (to user_proxy):\n", + "\n", + "\u001b[32m***** Suggested tool call (call_l7Rz8YLE4F2y8nGLCaroD6XL): python *****\u001b[0m\n", + "Arguments: \n", + "{\n", + " \"cell\": `\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Create figure and axes\n", + "fig, ax = plt.subplots()\n", + "\n", + "# Define agent coordinates\n", + "agent1_x, agent1_y = 1, 1\n", + "agent2_x, agent2_y = 3, 1\n", + "\n", + "# Draw agents as circles\n", + "agent1 = plt.Circle((agent1_x, agent1_y), 0.1, color='blue')\n", + "agent2 = plt.Circle((agent2_x, agent2_y), 0.1, color='green')\n", + "ax.add_patch(agent1)\n", + "ax.add_patch(agent2)\n", + "\n", + "# Add example dialog\n", + "dialog1 = \"Hi, how are you?\"\n", + "dialog2 = \"I'm good, thanks!\"\n", + "\n", + "# Add text labels for the dialog\n", + "plt.text(agent1_x, agent1_y + 0.3, dialog1, fontsize=12, ha='center')\n", + "plt.text(agent2_x, agent2_y + 0.3, dialog2, fontsize=12, ha='center')\n", + "\n", + "# Set limits and turn off the axes\n", + "ax.set_xlim(0, 4)\n", + "ax.set_ylim(0, 2)\n", + "ax.axis('off')\n", + "`\n", + "}\n", + "\u001b[32m***********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to chatbot):\n", + "\n", + "\u001b[33muser_proxy\u001b[0m (to chatbot):\n", + "\n", + "\u001b[32m***** Response from calling tool (call_l7Rz8YLE4F2y8nGLCaroD6XL) *****\u001b[0m\n", + "Error: Expecting value: line 1 column 12 (char 11)\n", + " You argument should follow json format.\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mchatbot\u001b[0m (to user_proxy):\n", + 
"\n", + "\u001b[32m***** Suggested tool call (call_VjzGKf34UMKMlSHCrQpJ6ICK): python *****\u001b[0m\n", + "Arguments: \n", + "{\n", + " \"cell\": \"import matplotlib.pyplot as plt\\n\\n# Create figure and axes\\nfig, ax = plt.subplots()\\n\\n# Define agent coordinates\\nagent1_x, agent1_y = 1, 1\\nagent2_x, agent2_y = 3, 1\\n\\n# Draw agents as circles\\nagent1 = plt.Circle((agent1_x, agent1_y), 0.1, color='blue')\\nagent2 = plt.Circle((agent2_x, agent2_y), 0.1, color='green')\\nax.add_patch(agent1)\\nax.add_patch(agent2)\\n\\n# Add example dialog\\ndialog1 = \\\"Hi, how are you?\\\"\\ndialog2 = \\\"I'm good, thanks!\\\"\\n\\n# Add text labels for the dialog\\nplt.text(agent1_x, agent1_y + 0.3, dialog1, fontsize=12, ha='center')\\nplt.text(agent2_x, agent2_y + 0.3, dialog2, fontsize=12, ha='center')\\n\\n# Set limits and turn off the axes\\nax.set_xlim(0, 4)\\nax.set_ylim(0, 2)\\nax.axis('off')\"\n", + "}\n", + "\u001b[32m***********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION python...\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, 4.0, 0.0, 2.0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to chatbot):\n", + "\n", + "\u001b[33muser_proxy\u001b[0m (to chatbot):\n", + "\n", + "\u001b[32m***** Response from calling tool (call_VjzGKf34UMKMlSHCrQpJ6ICK) *****\u001b[0m\n", + "(0.0, 4.0, 0.0, 2.0)\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mchatbot\u001b[0m (to user_proxy):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "from IPython import get_ipython\n", + "from typing_extensions import Annotated\n", + "\n", + "import autogen\n", + "from autogen.cache import Cache\n", + "\n", + "config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\", filter_dict={\"tags\": [\"gemini\", \"tool\"]})\n", + "\n", + "llm_config = {\n", + " \"config_list\": config_list,\n", + " \"timeout\": 120,\n", + "}\n", + "chatbot = autogen.AssistantAgent(\n", + " name=\"chatbot\",\n", + " system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n", + " llm_config=llm_config,\n", + ")\n", + "\n", + "# create a UserProxyAgent instance named \"user_proxy\"\n", + "user_proxy = autogen.UserProxyAgent(\n", + " name=\"user_proxy\",\n", + " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=10,\n", + " code_execution_config={\n", + " \"work_dir\": \"coding\",\n", + " \"use_docker\": False,\n", + " }, # Please set use_docker=True if docker is available to run the generated code. 
Using docker is safer than running the generated code directly.\n", + ")\n", + "\n", + "\n", + "# define functions according to the function description\n", + "\n", + "\n", + "# one way of registering functions is to use the register_for_llm and register_for_execution decorators\n", + "@user_proxy.register_for_execution()\n", + "@chatbot.register_for_llm(name=\"python\", description=\"run cell in ipython and return the execution result.\")\n", + "def exec_python(cell: Annotated[str, \"Valid Python cell to execute.\"]) -> str:\n", + " ipython = get_ipython()\n", + " result = ipython.run_cell(cell)\n", + " log = str(result.result)\n", + " if result.error_before_exec is not None:\n", + " log += f\"\\n{result.error_before_exec}\"\n", + " if result.error_in_exec is not None:\n", + " log += f\"\\n{result.error_in_exec}\"\n", + " return log\n", + "\n", + "\n", + "# another way of registering functions is to use the register_function\n", + "def exec_sh(script: Annotated[str, \"Valid shell script to execute.\"]) -> str:\n", + " return user_proxy.execute_code_blocks([(\"sh\", script)])\n", + "\n", + "\n", + "autogen.agentchat.register_function(\n", + " exec_sh,\n", + " caller=chatbot,\n", + " executor=user_proxy,\n", + " name=\"sh\",\n", + " description=\"run a shell script and return the execution result.\",\n", + ")\n", + "\n", + "with Cache.disk() as cache:\n", + " # start the conversation\n", + " user_proxy.initiate_chat(\n", + " chatbot,\n", + " message=\"Draw two agents chatting with each other with an example dialog. 
Don't add plt.show().\",\n", + " cache=cache,\n", + " max_turns=3,\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autogen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test/oai/test_gemini.py b/test/oai/test_gemini.py index 61fdbe6d735a..e6ff3e02672d 100644 --- a/test/oai/test_gemini.py +++ b/test/oai/test_gemini.py @@ -1,3 +1,4 @@ +import json import os from unittest.mock import MagicMock, patch @@ -10,7 +11,9 @@ from google.cloud.aiplatform.initializer import global_config as vertexai_global_config from vertexai.generative_models import HarmBlockThreshold as VertexAIHarmBlockThreshold from vertexai.generative_models import HarmCategory as VertexAIHarmCategory + from vertexai.generative_models import Part as VertexAIPart from vertexai.generative_models import SafetySetting as VertexAISafetySetting + from vertexai.generative_models import ToolConfig as VertexAIToolConfig from autogen.oai.gemini import GeminiClient @@ -20,6 +23,8 @@ VertexAIHarmBlockThreshold = object VertexAIHarmCategory = object VertexAISafetySetting = object + VertexAIPart = object + VertexAIToolConfig = object vertexai_global_config = object InternalServerError = object skip = True @@ -234,8 +239,6 @@ def test_vertexai_safety_setting_list(gemini_client): for category in harm_categories ] - print(safety_settings) - converted_safety_settings = GeminiClient._to_vertexai_safety_settings(safety_settings) def compare_safety_settings(converted_safety_settings, expected_safety_settings): @@ -250,6 +253,59 @@ def compare_safety_settings(converted_safety_settings, expected_safety_settings) assert all(settings_comparison), "Converted safety settings are 
incorrect" +@pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") +def test_vertexai_tool_config(gemini_client): + + tools = [{"function_name": "calculator"}] + + tool_config = {"function_calling_config": {"mode": "ANY"}} + + expected_tool_config = VertexAIToolConfig( + function_calling_config=VertexAIToolConfig.FunctionCallingConfig( + mode=VertexAIToolConfig.FunctionCallingConfig.Mode.ANY, + allowed_function_names=["calculator"], + ) + ) + + converted_tool_config = GeminiClient._to_vertexai_tool_config(tool_config, tools) + + converted_mode = converted_tool_config._gapic_tool_config.function_calling_config.mode + expected_mode = expected_tool_config._gapic_tool_config.function_calling_config.mode + converted_allowed_func = converted_tool_config._gapic_tool_config.function_calling_config.allowed_function_names + expected_allowed_func = expected_tool_config._gapic_tool_config.function_calling_config.allowed_function_names + + assert converted_mode == expected_mode, "Function calling mode is not converted correctly" + assert ( + converted_allowed_func == expected_allowed_func + ), "Function calling allowed function names is not converted correctly" + + +@pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") +def test_vertexai_tool_config_no_functions(gemini_client): + + tools = [] + + tool_config = {"function_calling_config": {"mode": "ANY"}} + + expected_tool_config = VertexAIToolConfig( + function_calling_config=VertexAIToolConfig.FunctionCallingConfig( + mode=VertexAIToolConfig.FunctionCallingConfig.Mode.ANY, + ) + ) + + converted_tool_config = GeminiClient._to_vertexai_tool_config(tool_config, tools) + + converted_mode = converted_tool_config._gapic_tool_config.function_calling_config.mode + expected_mode = expected_tool_config._gapic_tool_config.function_calling_config.mode + converted_allowed_func = converted_tool_config._gapic_tool_config.function_calling_config.allowed_function_names + expected_allowed_func = 
expected_tool_config._gapic_tool_config.function_calling_config.allowed_function_names + + assert converted_mode == expected_mode, "Function calling mode is not converted correctly" + assert ( + converted_allowed_func == expected_allowed_func + ), "Function calling allowed function names is not converted correctly" + + # Test error handling @patch("autogen.oai.gemini.genai") @pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") @@ -279,9 +335,10 @@ def test_cost_calculation(gemini_client, mock_response): @pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") +@patch("autogen.oai.gemini.Content") @patch("autogen.oai.gemini.genai.GenerativeModel") @patch("autogen.oai.gemini.genai.configure") -def test_create_response(mock_configure, mock_generative_model, gemini_client): +def test_create_response(mock_configure, mock_generative_model, mock_content, gemini_client): # Mock the genai model configuration and creation process mock_chat = MagicMock() mock_model = MagicMock() @@ -292,6 +349,8 @@ def test_create_response(mock_configure, mock_generative_model, gemini_client): # Set up a mock for the chat history item access and the text attribute return mock_history_part = MagicMock() mock_history_part.text = "Example response" + mock_history_part.function_call = None + mock_chat.history.__getitem__.return_value.parts.__iter__.return_value = iter([mock_history_part]) mock_chat.history.__getitem__.return_value.parts.__getitem__.return_value = mock_history_part # Setup the mock to return a mocked chat response @@ -306,6 +365,55 @@ def test_create_response(mock_configure, mock_generative_model, gemini_client): assert response.choices[0].message.content == "Example response", "Response content should match expected output" +@pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") +@patch("autogen.oai.gemini.Part") +@patch("autogen.oai.gemini.Content") +@patch("autogen.oai.gemini.genai.GenerativeModel") 
+@patch("autogen.oai.gemini.genai.configure") +def test_create_function_call_response(mock_configure, mock_generative_model, mock_content, mock_part, gemini_client): + # Mock the genai model configuration and creation process + mock_chat = MagicMock() + mock_model = MagicMock() + mock_configure.return_value = None + mock_generative_model.return_value = mock_model + mock_model.start_chat.return_value = mock_chat + + mock_part.to_dict.return_value = { + "function_call": {"name": "function_name", "args": {"arg1": "value1", "arg2": "value2"}} + } + + # Set up a mock for the chat history item access and the text attribute return + mock_history_part = MagicMock() + mock_history_part.text = None + mock_history_part.function_call.name = "function_name" + mock_history_part.function_call.args = {"arg1": "value1", "arg2": "value2"} + mock_chat.history.__getitem__.return_value.parts.__iter__.return_value = iter([mock_history_part]) + + # Setup the mock to return a mocked chat response + mock_chat.send_message.return_value = MagicMock( + history=[ + MagicMock( + parts=[ + MagicMock( + function_call=MagicMock(name="function_name", arguments='{"arg1": "value1", "arg2": "value2"}') + ) + ] + ) + ] + ) + + # Call the create method + response = gemini_client.create( + {"model": "gemini-pro", "messages": [{"content": "Hello", "role": "user"}], "stream": False} + ) + + # Assertions to check if response is structured as expected + assert ( + response.choices[0].message.tool_calls[0].function.name == "function_name" + and json.loads(response.choices[0].message.tool_calls[0].function.arguments)["arg1"] == "value1" + ), "Response content should match expected output" + + @pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") @patch("autogen.oai.gemini.GenerativeModel") @patch("autogen.oai.gemini.vertexai.init") @@ -320,7 +428,9 @@ def test_vertexai_create_response(mock_init, mock_generative_model, gemini_clien # Set up a mock for the chat history item access and the 
text attribute return mock_history_part = MagicMock() mock_history_part.text = "Example response" - mock_chat.history.__getitem__.return_value.parts.__getitem__.return_value = mock_history_part + mock_history_part.function_call = None + mock_history_part.role = "model" + mock_chat.history.__getitem__.return_value.parts.__iter__.return_value = iter([mock_history_part]) # Setup the mock to return a mocked chat response mock_chat.send_message.return_value = MagicMock(history=[MagicMock(parts=[MagicMock(text="Example response")])]) @@ -330,10 +440,60 @@ def test_vertexai_create_response(mock_init, mock_generative_model, gemini_clien {"model": "gemini-pro", "messages": [{"content": "Hello", "role": "user"}], "stream": False} ) - # Assertions to check if response is structured as expected assert response.choices[0].message.content == "Example response", "Response content should match expected output" +@pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") +@patch("autogen.oai.gemini.VertexAIPart") +@patch("autogen.oai.gemini.VertexAIContent") +@patch("autogen.oai.gemini.GenerativeModel") +@patch("autogen.oai.gemini.vertexai.init") +def test_vertexai_create_function_call_response( + mock_init, mock_generative_model, mock_content, mock_part, gemini_client_with_credentials +): + # Mock the genai model configuration and creation process + mock_chat = MagicMock() + mock_model = MagicMock() + mock_init.return_value = None + mock_generative_model.return_value = mock_model + mock_model.start_chat.return_value = mock_chat + + mock_part.to_dict.return_value = { + "function_call": {"name": "function_name", "args": {"arg1": "value1", "arg2": "value2"}} + } + + # Set up a mock for the chat history item access and the text attribute return + mock_history_part = MagicMock() + mock_history_part.text = None + mock_history_part.function_call.name = "function_name" + mock_history_part.function_call.args = {"arg1": "value1", "arg2": "value2"} + 
mock_chat.history.__getitem__.return_value.parts.__iter__.return_value = iter([mock_history_part]) + + # Setup the mock to return a mocked chat response + mock_chat.send_message.return_value = MagicMock( + history=[ + MagicMock( + parts=[ + MagicMock( + function_call=MagicMock(name="function_name", arguments='{"arg1": "value1", "arg2": "value2"}') + ) + ] + ) + ] + ) + + # Call the create method + response = gemini_client_with_credentials.create( + {"model": "gemini-pro", "messages": [{"content": "Hello", "role": "user"}], "stream": False} + ) + + # Assertions to check if response is structured as expected + assert ( + response.choices[0].message.tool_calls[0].function.name == "function_name" + and json.loads(response.choices[0].message.tool_calls[0].function.arguments)["arg1"] == "value1" + ), "Response content should match expected output" + + @pytest.mark.skipif(skip, reason="Google GenAI dependency is not installed") @patch("autogen.oai.gemini.GenerativeModel") @patch("autogen.oai.gemini.vertexai.init") @@ -348,6 +508,8 @@ def test_vertexai_default_auth_create_response(mock_init, mock_generative_model, # Set up a mock for the chat history item access and the text attribute return mock_history_part = MagicMock() mock_history_part.text = "Example response" + mock_history_part.function_call = None + mock_chat.history.__getitem__.return_value.parts.__iter__.return_value = iter([mock_history_part]) mock_chat.history.__getitem__.return_value.parts.__getitem__.return_value = mock_history_part # Setup the mock to return a mocked chat response @@ -373,11 +535,11 @@ def test_create_vision_model_response(mock_configure, mock_generative_model, gem # Set up a mock to simulate the vision model behavior mock_vision_response = MagicMock() - mock_vision_part = MagicMock(text="Vision model output") + mock_vision_part = MagicMock(text="Vision model output", function_call=None) # Setting up the chain of return values for vision model response - 
mock_vision_response._result.candidates.__getitem__.return_value.content.parts.__getitem__.return_value = ( - mock_vision_part + mock_vision_response._result.candidates.__getitem__.return_value.content.parts.__iter__.return_value = iter( + [mock_vision_part] ) mock_model.generate_content.return_value = mock_vision_response @@ -420,10 +582,12 @@ def test_vertexai_create_vision_model_response(mock_init, mock_generative_model, # Set up a mock to simulate the vision model behavior mock_vision_response = MagicMock() - mock_vision_part = MagicMock(text="Vision model output") + mock_vision_part = MagicMock(text="Vision model output", function_call=None) # Setting up the chain of return values for vision model response - mock_vision_response.candidates.__getitem__.return_value.content.parts.__getitem__.return_value = mock_vision_part + mock_vision_response.candidates.__getitem__.return_value.content.parts.__iter__.return_value = iter( + [mock_vision_part] + ) mock_model.generate_content.return_value = mock_vision_response