diff --git a/autogen/agentchat/contrib/agent_builder.py b/autogen/agentchat/contrib/agent_builder.py
index c21684fb371c..d16e04307c3c 100644
--- a/autogen/agentchat/contrib/agent_builder.py
+++ b/autogen/agentchat/contrib/agent_builder.py
@@ -2,12 +2,25 @@
import time
import subprocess as sp
import socket
-import os
import json
import hashlib
from typing import Optional, List, Dict, Tuple, Union
+def _config_check(config: Dict):
+ # check config loading
+ assert config.get("coding", None) is not None, 'Missing "coding" in your config.'
+ assert config.get("default_llm_config", None) is not None, 'Missing "default_llm_config" in your config.'
+ assert config.get("code_execution_config", None) is not None, 'Missing "code_execution_config" in your config.'
+
+ for agent_config in config["agent_configs"]:
+ assert agent_config.get("name", None) is not None, 'Missing agent "name" in your agent_configs.'
+ assert agent_config.get("model", None) is not None, 'Missing agent "model" in your agent_configs.'
+ assert (
+ agent_config.get("system_message", None) is not None
+ ), 'Missing agent "system_message" in your agent_configs.'
+
+
class AgentBuilder:
"""
AgentBuilder can help user build an automatic task solving process powered by multi-agent system.
@@ -37,7 +50,8 @@ class AgentBuilder:
Hint:
# Considering the effort, the position in this task should be no more then {max_agents}, less is better.
- # Answer the name of those positions/jobs, separated by comma and use "_" instead of space. For example: Product_manager,Programmer
+ # Answer the name of those positions/jobs.
+ # Separate names by comma and use "_" instead of space. For example: Product_manager,Programmer
# Only return the list of positions.
"""
@@ -69,6 +83,7 @@ def __init__(
Args:
config_path: path of the OpenAI api configs.
builder_model: specify a model as the backbone of build manager.
+ agent_model: specify a model as the backbone of participant agents.
host: endpoint host.
endpoint_building_timeout: timeout for building up an endpoint server.
"""
@@ -89,6 +104,12 @@ def __init__(
if self._is_port_open(host, port):
self.open_ports.append(str(port))
+ def set_builder_model(self, model: str):
+ self.builder_model = model
+
+ def set_agent_model(self, model: str):
+ self.agent_model = model
+
@staticmethod
def _is_port_open(host, port):
"""Check if a tcp port is open."""
@@ -128,6 +149,11 @@ def _create_agent(
agent: a set-up agent.
"""
config_list = autogen.config_list_from_json(self.config_path, filter_dict={"model": [model_name_or_hf_repo]})
+ if len(config_list) == 0:
+ raise RuntimeError(
+ f"Failed to initialize agent {agent_name}: {model_name_or_hf_repo} does not exist in {self.config_path}. "
+ f'If you would like to change this model, please specify the "agent_model" in the constructor.'
+ )
if "gpt-" in model_name_or_hf_repo:
server_id = self.openai_server_name
else:
@@ -259,14 +285,6 @@ def build(
"""
use_api = False
- if code_execution_config is None:
- code_execution_config = {
- "last_n_messages": 2,
- "work_dir": "groupchat",
- "use_docker": False,
- "timeout": 60,
- }
-
if cached_configs is None:
use_api = True
agent_configs = []
@@ -276,9 +294,23 @@ def build(
default_llm_config = cached_configs["default_llm_config"]
coding = cached_configs["coding"]
agent_configs = cached_configs["agent_configs"]
+ code_execution_config = cached_configs["code_execution_config"]
+
+ if code_execution_config is None:
+ code_execution_config = {
+ "last_n_messages": 2,
+ "work_dir": "groupchat",
+ "use_docker": False,
+ "timeout": 60,
+ }
if use_api:
config_list = autogen.config_list_from_json(self.config_path, filter_dict={"model": [self.builder_model]})
+ if len(config_list) == 0:
+ raise RuntimeError(
+ f"Failed to initialize build manager: {self.builder_model} does not exist in {self.config_path}. "
+ f'If you want to change this model, please specify the "builder_model" in the constructor.'
+ )
build_manager = autogen.OpenAIWrapper(config_list=config_list)
print("Generating agents...")
@@ -294,8 +326,8 @@ def build(
.choices[0]
.message.content
)
- agent_name_list = resp_agent_name.split(",")
- print(f"{resp_agent_name} are generated.")
+ agent_name_list = [agent_name.strip().replace(" ", "_") for agent_name in resp_agent_name.split(",")]
+ print(f"{agent_name_list} are generated.")
agent_sys_msg_list = []
for name in agent_name_list:
@@ -390,19 +422,31 @@ def save(self, filepath: Optional[str] = None) -> str:
def load(
self,
- filepath: str,
+ filepath: Optional[str] = None,
+ config_json: Optional[str] = None,
**kwargs,
):
"""
Load building configs and call the build function to complete building without calling online LLMs' api.
Args:
- filepath: filepath for the save config.
+ filepath: filepath for the save config.
+ config_json: JSON string for the save config.
"""
- try:
- print(f"Loding config from {filepath}")
- cached_configs = json.load(open(filepath))
- except FileNotFoundError:
- raise FileNotFoundError(f"Config file {filepath} does not exist.")
-
- return self.build(cached_configs=cached_configs, **kwargs)
+ # load json string.
+ if config_json is not None:
+ cached_configs = json.loads(config_json)
+ print("Loading config from JSON...")
+ _config_check(cached_configs)
+ return self.build(cached_configs=cached_configs, **kwargs)
+
+ # load from path.
+ if filepath is not None:
+ print(f"Loading config from {filepath}")
+ try:
+ with open(filepath) as f:
+ cached_configs = json.load(f)
+ except FileNotFoundError as e:
+ raise FileNotFoundError(f"{filepath} does not exist.") from e
+ _config_check(cached_configs)
+ return self.build(cached_configs=cached_configs, **kwargs)
diff --git a/autogen/agentchat/groupchat.py b/autogen/agentchat/groupchat.py
index c420d7b22044..5b12a97e6b17 100644
--- a/autogen/agentchat/groupchat.py
+++ b/autogen/agentchat/groupchat.py
@@ -30,16 +30,16 @@ class GroupChat:
- "manual": the next speaker is selected manually by user input.
- "random": the next speaker is selected randomly.
- "round_robin": the next speaker is selected in a round robin fashion, i.e., iterating in the same order as provided in `agents`.
- - allow_repeat_speaker: whether to allow the same speaker to speak consecutively. Default is True.
+ - allow_repeat_speaker: whether to allow the same speaker to speak consecutively. Default is True, in which case all speakers are allowed to speak consecutively. If allow_repeat_speaker is a list of Agents, then only those listed agents are allowed to repeat. If set to False, then no speakers are allowed to repeat.
"""
agents: List[Agent]
messages: List[Dict]
- max_round: int = 10
- admin_name: str = "Admin"
- func_call_filter: bool = True
- speaker_selection_method: str = "auto"
- allow_repeat_speaker: bool = True
+ max_round: Optional[int] = 10
+ admin_name: Optional[str] = "Admin"
+ func_call_filter: Optional[bool] = True
+ speaker_selection_method: Optional[str] = "auto"
+ allow_repeat_speaker: Optional[Union[bool, List[Agent]]] = True
_VALID_SPEAKER_SELECTION_METHODS = ["auto", "manual", "random", "round_robin"]
@@ -125,6 +125,13 @@ def _prepare_and_select_agents(self, last_speaker: Agent) -> Tuple[Optional[Agen
f"It should be one of {self._VALID_SPEAKER_SELECTION_METHODS} (case insensitive). "
)
+ # If provided a list, make sure the agent is in the list
+ allow_repeat_speaker = (
+ self.allow_repeat_speaker
+ if isinstance(self.allow_repeat_speaker, bool)
+ else last_speaker in self.allow_repeat_speaker
+ )
+
agents = self.agents
n_agents = len(agents)
# Warn if GroupChat is underpopulated
@@ -133,7 +140,7 @@ def _prepare_and_select_agents(self, last_speaker: Agent) -> Tuple[Optional[Agen
f"GroupChat is underpopulated with {n_agents} agents. "
"Please add more agents to the GroupChat or use direct communication instead."
)
- elif n_agents == 2 and self.speaker_selection_method.lower() != "round_robin" and self.allow_repeat_speaker:
+ elif n_agents == 2 and self.speaker_selection_method.lower() != "round_robin" and allow_repeat_speaker:
logger.warning(
f"GroupChat is underpopulated with {n_agents} agents. "
"It is recommended to set speaker_selection_method to 'round_robin' or allow_repeat_speaker to False."
@@ -159,7 +166,7 @@ def _prepare_and_select_agents(self, last_speaker: Agent) -> Tuple[Optional[Agen
"Please check the function_map of the agents."
)
# remove the last speaker from the list to avoid selecting the same speaker if allow_repeat_speaker is False
- agents = agents if self.allow_repeat_speaker else [agent for agent in agents if agent != last_speaker]
+ agents = agents if allow_repeat_speaker else [agent for agent in agents if agent != last_speaker]
if self.speaker_selection_method.lower() == "manual":
selected_agent = self.manual_select_speaker(agents)
diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index a4714075b0f9..14abb63ad6c8 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -191,7 +191,7 @@ def _construct_create_params(self, create_config: Dict, extra_kwargs: Dict) -> D
def create(self, **config):
"""Make a completion for a given config using openai's clients.
Besides the kwargs allowed in openai's client, we allow the following additional kwargs.
- The config in each client will be overriden by the config.
+ The config in each client will be overridden by the config.
Args:
- context (Dict | None): The context to instantiate the prompt or messages. Default to None.
diff --git a/autogen/token_count_utils.py b/autogen/token_count_utils.py
index 9e254932faff..18e4d9e4e821 100644
--- a/autogen/token_count_utils.py
+++ b/autogen/token_count_utils.py
@@ -2,27 +2,33 @@
import logging
import json
import tiktoken
+import re
logger = logging.getLogger(__name__)
def get_max_token_limit(model="gpt-3.5-turbo-0613"):
+ # Handle common azure model names/aliases
+ model = re.sub(r"^gpt\-?35", "gpt-3.5", model)
+ model = re.sub(r"^gpt4", "gpt-4", model)
+
max_token_limit = {
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"gpt-3.5-turbo-0613": 4096,
"gpt-3.5-turbo-instruct": 4096,
- "gpt-3.5-turbo-16k": 16384,
- "gpt-35-turbo": 4096,
- "gpt-35-turbo-16k": 16384,
- "gpt-35-turbo-instruct": 4096,
+ "gpt-3.5-turbo-16k": 16385,
+ "gpt-3.5-turbo-16k-0613": 16385,
+ "gpt-3.5-turbo-1106": 16385,
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0314": 32768, # deprecate in Sep
"gpt-4-0314": 8192, # deprecate in Sep
"gpt-4-0613": 8192,
"gpt-4-32k-0613": 32768,
+ "gpt-4-1106-preview": 128000,
+ "gpt-4-vision-preview": 128000,
}
return max_token_limit[model]
diff --git a/notebook/agentchat_function_call_async.ipynb b/notebook/agentchat_function_call_async.ipynb
new file mode 100644
index 000000000000..49f61afec266
--- /dev/null
+++ b/notebook/agentchat_function_call_async.ipynb
@@ -0,0 +1,450 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "ae1f50ec",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "9a71fa36",
+ "metadata": {},
+ "source": [
+ "# Auto Generated Agent Chat: Task Solving with Provided Tools as Functions\n",
+ "\n",
+ "AutoGen offers conversable agents powered by LLM, tool, or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation. Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n",
+ "\n",
+ "In this notebook, we demonstrate how to use `AssistantAgent` and `UserProxyAgent` to make function calls with the new feature of OpenAI models (in model version 0613). A specified prompt and function configs must be passed to `AssistantAgent` to initialize the agent. The corresponding functions must be passed to `UserProxyAgent`, which will execute any function calls made by `AssistantAgent`. Besides this requirement of matching descriptions with functions, we recommend checking the system message in the `AssistantAgent` to ensure the instructions align with the function call descriptions.\n",
+ "\n",
+ "## Requirements\n",
+ "\n",
+ "AutoGen requires `Python>=3.8`. To run this notebook example, please install the [mathchat] option since we will import functions from `MathUserProxyAgent`:\n",
+ "```bash\n",
+ "pip install \"pyautogen[mathchat]\"\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "2b803c17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %pip install \"pyautogen[mathchat]~=0.1.0\""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "5ebd2397",
+ "metadata": {},
+ "source": [
+ "## Set your API Endpoint\n",
+ "\n",
+ "The [`config_list_from_models`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_models) function tries to create a list of configurations using Azure OpenAI endpoints and OpenAI endpoints for the provided list of models. It assumes the api keys and api bases are stored in the corresponding environment variables or local txt files:\n",
+ "\n",
+ "- OpenAI API key: os.environ[\"OPENAI_API_KEY\"] or `openai_api_key_file=\"key_openai.txt\"`.\n",
+ "- Azure OpenAI API key: os.environ[\"AZURE_OPENAI_API_KEY\"] or `aoai_api_key_file=\"key_aoai.txt\"`. Multiple keys can be stored, one per line.\n",
+ "- Azure OpenAI API base: os.environ[\"AZURE_OPENAI_API_BASE\"] or `aoai_api_base_file=\"base_aoai.txt\"`. Multiple bases can be stored, one per line.\n",
+ "\n",
+ "It's OK to have only the OpenAI API key, or only the Azure OpenAI API key + base.\n",
+ "If you open this notebook in google colab, you can upload your files by clicking the file icon on the left panel and then choosing \"upload file\" icon.\n",
+ "\n",
+ "The following code excludes Azure OpenAI endpoints from the config list because some endpoints don't support functions yet. Remove the `exclude` argument if they do."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "dca301a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import autogen\n",
+ "\n",
+ "config_list = autogen.config_list_from_json(\n",
+ " \"OAI_CONFIG_LIST\",\n",
+ " file_location=\".\",\n",
+ " filter_dict={\n",
+ " \"model\": [\"gpt-4\"],\n",
+ " },\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "92fde41f",
+ "metadata": {},
+ "source": [
+ "The config list looks like the following:\n",
+ "```python\n",
+ "config_list = [\n",
+ " {\n",
+ " 'model': 'gpt-4',\n",
+ " 'api_key': '',\n",
+ " }, # OpenAI API endpoint for gpt-4\n",
+ " {\n",
+ " 'model': 'gpt-3.5-turbo',\n",
+ " 'api_key': '',\n",
+ " }, # OpenAI API endpoint for gpt-3.5-turbo\n",
+ " {\n",
+ " 'model': 'gpt-3.5-turbo-16k',\n",
+ " 'api_key': '',\n",
+ " }, # OpenAI API endpoint for gpt-3.5-turbo-16k\n",
+ "]\n",
+ "```\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2b9526e7",
+ "metadata": {},
+ "source": [
+ "## Making Async and Sync Function Calls\n",
+ "\n",
+ "In this example, we demonstrate function call execution with `AssistantAgent` and `UserProxyAgent`. With the default system prompt of `AssistantAgent`, we allow the LLM assistant to perform tasks with code, and the `UserProxyAgent` would extract code blocks from the LLM response and execute them. With the new \"function_call\" feature, we define functions and specify the description of the function in the OpenAI config for the `AssistantAgent`. Then we register the functions in `UserProxyAgent`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "9fb85afb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser_proxy\u001b[0m (to chatbot):\n",
+ "\n",
+ "Create a timer for 5 seconds and then a stopwatch for 5 seconds.\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mchatbot\u001b[0m (to user_proxy):\n",
+ "\n",
+ "\u001b[32m***** Suggested function Call: timer *****\u001b[0m\n",
+ "Arguments: \n",
+ "{\n",
+ " \"num_seconds\": \"5\"\n",
+ "}\n",
+ "\u001b[32m******************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[35m\n",
+ ">>>>>>>> EXECUTING ASYNC FUNCTION timer...\u001b[0m\n",
+ "\u001b[33muser_proxy\u001b[0m (to chatbot):\n",
+ "\n",
+ "\u001b[32m***** Response from calling function \"timer\" *****\u001b[0m\n",
+ "Timer is done!\n",
+ "\u001b[32m**************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mchatbot\u001b[0m (to user_proxy):\n",
+ "\n",
+ "\u001b[32m***** Suggested function Call: stopwatch *****\u001b[0m\n",
+ "Arguments: \n",
+ "{\n",
+ " \"num_seconds\": \"5\"\n",
+ "}\n",
+ "\u001b[32m**********************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[35m\n",
+ ">>>>>>>> EXECUTING FUNCTION stopwatch...\u001b[0m\n",
+ "\u001b[33muser_proxy\u001b[0m (to chatbot):\n",
+ "\n",
+ "\u001b[32m***** Response from calling function \"stopwatch\" *****\u001b[0m\n",
+ "Stopwatch is done!\n",
+ "\u001b[32m******************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mchatbot\u001b[0m (to user_proxy):\n",
+ "\n",
+ "TERMINATE\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "# define functions according to the function description\n",
+ "import time\n",
+ "\n",
+ "# An example async function\n",
+ "async def timer(num_seconds):\n",
+ " for i in range(int(num_seconds)):\n",
+ " time.sleep(1)\n",
+ " # should print to stdout\n",
+ " return \"Timer is done!\"\n",
+ "\n",
+ "# An example sync function \n",
+ "def stopwatch(num_seconds):\n",
+ " for i in range(int(num_seconds)):\n",
+ " time.sleep(1)\n",
+ " return \"Stopwatch is done!\"\n",
+ "\n",
+ "llm_config = {\n",
+ " \"functions\": [\n",
+ " {\n",
+ " \"name\": \"timer\",\n",
+ " \"description\": \"create a timer for N seconds\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"num_seconds\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"Number of seconds in the timer.\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"num_seconds\"],\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"stopwatch\",\n",
+ " \"description\": \"create a stopwatch for N seconds\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"num_seconds\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"Number of seconds in the stopwatch.\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"num_seconds\"],\n",
+ " },\n",
+ " },\n",
+ " ],\n",
+ " \"config_list\": config_list,\n",
+ "}\n",
+ "coder = autogen.AssistantAgent(\n",
+ " name=\"chatbot\",\n",
+ " system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n",
+ " llm_config=llm_config,\n",
+ ")\n",
+ "\n",
+ "# create a UserProxyAgent instance named \"user_proxy\"\n",
+ "user_proxy = autogen.UserProxyAgent(\n",
+ " name=\"user_proxy\",\n",
+ " system_message=\"A proxy for the user for executing code.\",\n",
+ " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n",
+ " human_input_mode=\"NEVER\",\n",
+ " max_consecutive_auto_reply=10,\n",
+ " code_execution_config={\"work_dir\": \"coding\"},\n",
+ ")\n",
+ "\n",
+ "# register the functions\n",
+ "user_proxy.register_function(\n",
+ " function_map={\n",
+ " \"timer\": timer,\n",
+ " \"stopwatch\": stopwatch,\n",
+ " }\n",
+ ")\n",
+ "# start the conversation\n",
+ "# 'await' is used to pause and resume code execution for async IO operations. \n",
+ "# Without 'await', an async function returns a coroutine object but doesn't execute the function.\n",
+ "# With 'await', the async function is executed and the current function is paused until the awaited function returns a result.\n",
+ "await user_proxy.a_initiate_chat(\n",
+ " coder,\n",
+ " message=\"Create a timer for 5 seconds and then a stopwatch for 5 seconds.\",\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "950f3de7",
+ "metadata": {},
+ "source": [
+ "# Async Function Call with Group Chat\n",
+ "Sync and async can be used in topologies beyond two agents. Below, we show this feature for a group chat."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "2472f95c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "# Add a function for robust group chat termination\n",
+ "def terminate_group_chat(message):\n",
+ " return f\"[GROUPCHAT_TERMINATE] {message}\"\n",
+ "\n",
+ "# update LLM config\n",
+ "llm_config[\"functions\"].append(\n",
+ " {\n",
+ " \"name\": \"terminate_group_chat\",\n",
+ " \"description\": \"terminate the group chat\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"message\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"Message to be sent to the group chat.\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"message\"],\n",
+ " },\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# redefine the coder agent so that it uses the new llm_config\n",
+ "coder = autogen.AssistantAgent(\n",
+ " name=\"chatbot\",\n",
+ " system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n",
+ " llm_config=llm_config,\n",
+ ")\n",
+ "\n",
+ "# register the new function with user proxy agent\n",
+ "user_proxy.register_function(\n",
+ " function_map={\n",
+ " \"terminate_group_chat\": terminate_group_chat,\n",
+ " }\n",
+ ")\n",
+ "markdownagent = autogen.AssistantAgent(\n",
+ " name=\"Markdown_agent\",\n",
+ " system_message=\"Respond in markdown only\",\n",
+ " llm_config=llm_config,\n",
+ ")\n",
+ "groupchat = autogen.GroupChat(agents=[user_proxy, coder, markdownagent], messages=[], max_round=12)\n",
+ "manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config,\n",
+ " is_termination_msg=lambda x: \"GROUPCHAT_TERMINATE\" in x.get(\"content\", \"\"),\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "e2c9267a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser_proxy\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\n",
+ "1) Create a timer for 5 seconds.\n",
+ "2) a stopwatch for 5 seconds.\n",
+ "3) Pretty print the result as md.\n",
+ "4) when 1-3 are done, terminate the group chat\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mchatbot\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\u001b[32m***** Suggested function Call: timer *****\u001b[0m\n",
+ "Arguments: \n",
+ "\n",
+ "{\n",
+ " \"num_seconds\": \"5\"\n",
+ "}\n",
+ "\u001b[32m******************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[35m\n",
+ ">>>>>>>> EXECUTING ASYNC FUNCTION timer...\u001b[0m\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser_proxy\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\u001b[32m***** Response from calling function \"timer\" *****\u001b[0m\n",
+ "Timer is done!\n",
+ "\u001b[32m**************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mchatbot\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\u001b[32m***** Suggested function Call: stopwatch *****\u001b[0m\n",
+ "Arguments: \n",
+ "\n",
+ "{\n",
+ " \"num_seconds\": \"5\"\n",
+ "}\n",
+ "\u001b[32m**********************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[35m\n",
+ ">>>>>>>> EXECUTING FUNCTION stopwatch...\u001b[0m\n",
+ "\u001b[33muser_proxy\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\u001b[32m***** Response from calling function \"stopwatch\" *****\u001b[0m\n",
+ "Stopwatch is done!\n",
+ "\u001b[32m******************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mMarkdown_agent\u001b[0m (to chat_manager):\n",
+ "\n",
+ "```markdown\n",
+ "# Results \n",
+ "\n",
+ "1. Timer: The timer for 5 seconds has completed.\n",
+ "2. Stopwatch: The stopwatch for 5 seconds has completed.\n",
+ "```\n",
+ "By the way, step 3 is done now. Moving on to step 4.\n",
+ "\u001b[32m***** Suggested function Call: terminate_group_chat *****\u001b[0m\n",
+ "Arguments: \n",
+ "\n",
+ "{\n",
+ " \"message\": \"The tasks have been completed. Terminating the group chat now.\"\n",
+ "}\n",
+ "\u001b[32m*********************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[35m\n",
+ ">>>>>>>> EXECUTING FUNCTION terminate_group_chat...\u001b[0m\n",
+ "\u001b[33muser_proxy\u001b[0m (to chat_manager):\n",
+ "\n",
+ "\u001b[32m***** Response from calling function \"terminate_group_chat\" *****\u001b[0m\n",
+ "[GROUPCHAT_TERMINATE] The tasks have been completed. Terminating the group chat now.\n",
+ "\u001b[32m*****************************************************************\u001b[0m\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "await user_proxy.a_initiate_chat(manager,\n",
+ " message=\"\"\"\n",
+ "1) Create a timer for 5 seconds.\n",
+ "2) a stopwatch for 5 seconds.\n",
+ "3) Pretty print the result as md.\n",
+ "4) when 1-3 are done, terminate the group chat\"\"\")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "flaml_dev",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebook/agentchat_guidance.ipynb b/notebook/agentchat_guidance.ipynb
new file mode 100644
index 000000000000..edbed9df100d
--- /dev/null
+++ b/notebook/agentchat_guidance.ipynb
@@ -0,0 +1,312 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Using Guidance with AutoGen\n",
+ "\n",
+ "This notebook shows how Guidance can be used to enable structured responses from AutoGen agents. In particular, this notebook focuses on creating agents that always output a valid code block or valid json object.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from guidance import models, gen, user, assistant, system\n",
+ "from autogen import AssistantAgent, UserProxyAgent, Agent\n",
+ "from autogen import config_list_from_json\n",
+ "\n",
+ "llm_config = config_list_from_json(\"OAI_CONFIG_LIST\")[0] # use the first config\n",
+ "gpt = models.OpenAI(\"gpt-4\", api_key=llm_config.get(\"api_key\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The example below uses guidance to create a `guidance_coder` agent that only responds with valid code blocks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser\u001b[0m (to guidance_coder):\n",
+ "\n",
+ "Plot and save a chart of nvidia and tsla stock price change YTD.\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mguidance_coder\u001b[0m (to user):\n",
+ "\n",
+ "```python\n",
+ "# filename: stock_price_change.py\n",
+ "\n",
+ "import pandas as pd\n",
+ "import yfinance as yf\n",
+ "import matplotlib.pyplot as plt\n",
+ "from datetime import datetime\n",
+ "\n",
+ "# Get today's date\n",
+ "today = datetime.today().strftime('%Y-%m-%d')\n",
+ "\n",
+ "# Download stock data\n",
+ "nvda = yf.download('NVDA', start='2022-01-01', end=today)\n",
+ "tsla = yf.download('TSLA', start='2022-01-01', end=today)\n",
+ "\n",
+ "# Calculate percentage change in closing price\n",
+ "nvda['Pct Change'] = nvda['Close'].pct_change()\n",
+ "tsla['Pct Change'] = tsla['Close'].pct_change()\n",
+ "\n",
+ "# Plot percentage change\n",
+ "plt.figure(figsize=(14,7))\n",
+ "plt.plot(nvda['Pct Change'], label='NVDA')\n",
+ "plt.plot(tsla['Pct Change'], label='TSLA')\n",
+ "plt.title('Nvidia and Tesla Stock Price Change YTD')\n",
+ "plt.xlabel('Date')\n",
+ "plt.ylabel('Percentage Change')\n",
+ "plt.legend()\n",
+ "plt.grid(True)\n",
+ "\n",
+ "# Save the plot as a PNG file\n",
+ "plt.savefig('stock_price_change.png')\n",
+ "```\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[31m\n",
+ ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
+ "\u001b[31m\n",
+ ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "execute_code was called without specifying a value for use_docker. Since the python docker package is not available, code will be run natively. Note: this fallback behavior is subject to change\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser\u001b[0m (to guidance_coder):\n",
+ "\n",
+ "exitcode: 0 (execution succeeded)\n",
+ "Code output: \n",
+ "\n",
+ "[*********************100%%**********************] 1 of 1 completed\n",
+ "\n",
+ "[*********************100%%**********************] 1 of 1 completed\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mguidance_coder\u001b[0m (to user):\n",
+ "\n",
+ "Great! The code executed successfully and the chart of Nvidia and Tesla stock price change Year-To-Date (YTD) has been saved as 'stock_price_change.png' in the current directory. You can open this file to view the chart.\n",
+ "\n",
+ "TERMINATE\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "def is_valid_code_block(code):\n",
+ " pattern = r\"```[\\w\\s]*\\n([\\s\\S]*?)\\n```\"\n",
+ " match = re.search(pattern, code)\n",
+ " if match:\n",
+ " return True\n",
+ " else:\n",
+ " return False\n",
+ "\n",
+ "\n",
+ "def generate_structured_response(recipient, messages, sender, config):\n",
+ " gpt = models.OpenAI(\"gpt-4\", api_key=llm_config.get(\"api_key\"), echo=False)\n",
+ " \n",
+ " # populate the recipient with the messages from the history\n",
+ " with system():\n",
+ " lm = gpt + recipient.system_message\n",
+ " \n",
+ " for message in messages:\n",
+ " if message.get(\"role\") == \"user\":\n",
+ " with user():\n",
+ " lm += message.get(\"content\")\n",
+ " else:\n",
+ " with assistant():\n",
+ " lm += message.get(\"content\")\n",
+ "\n",
+ " # generate a new response and store it\n",
+ " with assistant():\n",
+ " lm += gen(name=\"initial_response\")\n",
+ " # ask the agent to reflect on the nature of the response and store it\n",
+ " with user():\n",
+ " lm += \"Does the very last response from you contain code? Respond with yes or no.\"\n",
+ " with assistant():\n",
+ " lm += gen(name=\"contains_code\")\n",
+ " # if the response contains code, ask the agent to generate a proper code block\n",
+ " if \"yes\" in lm[\"contains_code\"].lower():\n",
+ " with user():\n",
+ " lm += \"Respond with a single block containing the valid code. Valid code blocks start with ```\"\n",
+ " with assistant():\n",
+ " lm += \"```\" + gen(name=\"code\")\n",
+ " response = \"```\" + lm[\"code\"]\n",
+ " \n",
+ " is_valid = is_valid_code_block(response)\n",
+ " if not is_valid:\n",
+ " raise ValueError(f\"Failed to generate a valid code block\\n {response}\")\n",
+ " \n",
+ " # otherwise, just use the initial response\n",
+ " else:\n",
+ " response = lm[\"initial_response\"]\n",
+ " \n",
+ " return True, response\n",
+ "\n",
+ "\n",
+ "guidance_agent = AssistantAgent(\"guidance_coder\", llm_config=llm_config)\n",
+ "guidance_agent.register_reply(Agent, generate_structured_response, 1)\n",
+ "user_proxy = UserProxyAgent(\"user\", human_input_mode=\"TERMINATE\", code_execution_config={\"work_dir\": \"coding\"},\n",
+ " is_termination_msg=lambda msg: \"TERMINATE\" in msg.get(\"content\"))\n",
+ "user_proxy.initiate_chat(guidance_agent, message=\"Plot and save a chart of nvidia and tsla stock price change YTD.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The example below uses Guidance to enable a `guidance_labeler` agent that only responds with valid JSON labeling a given comment/joke."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33muser\u001b[0m (to guidance_labeler):\n",
+ "\n",
+ "\n",
+ "Label the TEXT via the following instructions:\n",
+ " \n",
+ "The label must be a JSON of the format:\n",
+ "{\n",
+ " \"label\": str,\n",
+ " \"explanation\": str\n",
+ "}\n",
+ " \n",
+ "TEXT: what did the fish say when it bumped into a wall? Dam!\n",
+ "\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33mguidance_labeler\u001b[0m (to user):\n",
+ "\n",
+ "{\"label\":\"Joke\",\"explanation\":\"The text is a joke, using a play on words where the fish says 'Dam!' after bumping into a wall, which is a pun on the word 'damn' and a 'dam' which is a barrier that stops or restricts the flow of water, often creating a reservoir, and is something a fish might encounter.\"}\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "from pydantic import BaseModel\n",
+ "\n",
+ "class Response(BaseModel):\n",
+ " label: str\n",
+ " explanation: str\n",
+ "\n",
+ "response_prompt_instructions = \"\"\"The label must be a JSON of the format:\n",
+ "{\n",
+ " \"label\": str,\n",
+ " \"explanation\": str\n",
+ "}\"\"\"\n",
+ "\n",
+ "def generate_structured_response(recipient, messages, sender, config):\n",
+ " gpt = models.OpenAI(\"gpt-4\", api_key=llm_config.get(\"api_key\"), echo=False)\n",
+ " \n",
+ " # populate the recipient with the messages from the history\n",
+ " with system():\n",
+ " lm = gpt + recipient.system_message\n",
+ " \n",
+ " for message in messages:\n",
+ " if message.get(\"role\") == \"user\":\n",
+ " with user():\n",
+ " lm += message.get(\"content\")\n",
+ " else:\n",
+ " with assistant():\n",
+ " lm += message.get(\"content\")\n",
+ "\n",
+ " # generate a new response and store it\n",
+ " with assistant():\n",
+ " lm += gen(name=\"initial_response\")\n",
+ " # ask the agent to reflect on the nature of the response and store it\n",
+ " with user():\n",
+ " lm += \"Does the very last response from you contain JSON object? Respond with yes or no.\"\n",
+ " with assistant():\n",
+ " lm += gen(name=\"contains_json\")\n",
+ "    # if the response contains a JSON object, ask the agent to restate it as valid JSON\n",
+ " if \"yes\" in lm[\"contains_json\"].lower():\n",
+ " with user():\n",
+ " lm += \"What was that JSON object? Only respond with that valid JSON string. A valid JSON string starts with {\"\n",
+ " with assistant():\n",
+ " lm += \"{\" + gen(name=\"json\")\n",
+ " response = \"{\" + lm[\"json\"]\n",
+ " # verify that the response is valid json\n",
+ " try:\n",
+ " response_obj = Response.model_validate_json(response)\n",
+ " response = response_obj.model_dump_json()\n",
+ " except Exception as e:\n",
+ " response = str(e)\n",
+ " # otherwise, just use the initial response\n",
+ " else:\n",
+ " response = lm[\"initial_response\"]\n",
+ "\n",
+ " return True, response\n",
+ "\n",
+ "guidance_agent = AssistantAgent(\"guidance_labeler\", llm_config=llm_config, system_message=\"You are a helpful assistant\")\n",
+ "guidance_agent.register_reply(Agent, generate_structured_response, 1)\n",
+ "user_proxy = UserProxyAgent(\"user\", human_input_mode=\"ALWAYS\", code_execution_config=False)\n",
+ "user_proxy.initiate_chat(guidance_agent, message=f\"\"\"\n",
+ "Label the TEXT via the following instructions:\n",
+ " \n",
+ "{response_prompt_instructions}\n",
+ " \n",
+ "TEXT: what did the fish say when it bumped into a wall? Dam!\n",
+ "\n",
+ "\"\"\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/samples/apps/autogen-assistant/README.md b/samples/apps/autogen-assistant/README.md
index 35d1f9932cd1..ab2dbbfbb1be 100644
--- a/samples/apps/autogen-assistant/README.md
+++ b/samples/apps/autogen-assistant/README.md
@@ -8,7 +8,7 @@ AutoGen Assistant is an Autogen-powered AI app (user interface) that can convers
Some of the capabilities supported by the app frontend include the following:
-- [x] Select fron a list of agents (current support for two agent workflows - `UserProxyAgent` and `AssistantAgent`)
+- [x] Select from a list of agents (current support for two agent workflows - `UserProxyAgent` and `AssistantAgent`)
- [x] Modify agent configuration (e.g. temperature, model, agent system message, model etc) and chat with updated agent configurations.
- [x] View agent messages and output files in the UI from agent runs.
- [ ] Support for more complex agent workflows (e.g. `GroupChat` workflows)
@@ -46,18 +46,20 @@ Project Structure:
```bash
npm install -g gatsby-cli
npm install --global yarn
- cd frontend
yarn install
yarn build
```
- For Windows users, to build the frontend, you may need alternative commands to build the frontend.
+ - For Windows users, you may need alternative commands to build the frontend.
- ```bash
-
- gatsby clean && rmdir /s /q ..\\autogenra\\web\\ui && (set \"PREFIX_PATH_VALUE=\" || ver>nul) && gatsby build --prefix-paths && xcopy /E /I /Y public ..\\autogenra\\web\\ui
+ ```bash
+ gatsby clean && rmdir /s /q ..\\autogenra\\web\\ui && (set \"PREFIX_PATH_VALUE=\" || ver>nul) && gatsby build --prefix-paths && xcopy /E /I /Y public ..\\autogenra\\web\\ui
+ ```
+ - Navigate to the `samples/apps/autogen-assistant` directory and install the `autogenra` library in your current Python environment:
- ````
+ ```bash
+ pip install -e .
+ ```
### Running the Application
diff --git a/samples/tools/testbed/Dockerfile b/samples/tools/testbed/Dockerfile
new file mode 100644
index 000000000000..6ce06f93a621
--- /dev/null
+++ b/samples/tools/testbed/Dockerfile
@@ -0,0 +1,16 @@
+# Host a jsPsych experiment in Azure
+FROM python:3.11
+MAINTAINER AutoGen
+
+# Upgrade pip
+RUN pip install --upgrade pip
+
+# Set the image to the Pacific Timezone
+RUN ln -snf /usr/share/zoneinfo/US/Pacific /etc/localtime && echo "US/Pacific" > /etc/timezone
+
+# Pre-load autogen dependencies, but not autogen itself since we'll often want to install the latest from source
+RUN pip install pyautogen[teachable,lmm,graphs]
+RUN pip uninstall --yes pyautogen
+
+# Pre-load popular packages as per https://learnpython.com/blog/most-popular-python-packages/
+RUN pip install numpy pandas matplotlib seaborn scikit-learn requests urllib3 nltk pillow pytest
diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md
index 506c8b0835f0..76560b1c6bb5 100644
--- a/samples/tools/testbed/README.md
+++ b/samples/tools/testbed/README.md
@@ -46,6 +46,10 @@ options:
The requirements file to pip install before running the scenario. This file must be found in
the 'includes' directory. (default: requirements.txt)
+ -d DOCKER_IMAGE, --docker-image DOCKER_IMAGE
+ The Docker image to use when running scenarios. Can not be used together with --native.
+ (default: 'autogen/testbed:default', which will be created if not present)
+
--native Run the scenarios natively rather than in docker.
NOTE: This is not advisable, and should be done with great caution.
```
@@ -216,3 +220,20 @@ python ./run_scenarios.py ./scenarios/GAIA/gaia_validation_level_1__two_agents_g
# Compute Metrics
python utils/collate_gaia_csv.py ./results/gaia_validation_level_1__two_agents_gpt4 | python utils/metrics_gaia.py
```
+
+## (Example) Running tasks from AutoGPT
+
+The Testbed supports running tasks proposed in the [AutoGPT benchmark](https://github.com/Significant-Gravitas/AutoGPT/tree/master/benchmark/agbenchmark/challenges). In this scenario, the agents are prompted to handle a diverse range of tasks, including coding, question answering over given files, and web scraping. As with the HumanEval scenarios, the agents can run the unit-test script to check whether the task was completed successfully.
+
+Accessing this scenario-type requires converting tasks, running the Testbed, collating the results, and finally computing the metrics. The following commands will run each test instance with GPT-4:
+
+```
+# Convert tasks
+python utils/prepare_autogpt.py
+
+# Run all the scenarios with GPT-4
+python run_scenarios.py scenarios/AutoGPT/autogpt_twoagent_gpt4.jsonl
+
+# Compute metrics (the AutoGPT scenario shares its metrics script with HumanEval)
+python utils/collate_autogpt.py ./results/autogpt_twoagent_gpt4 | python utils/metrics_human_eval.py
+```
diff --git a/samples/tools/testbed/includes/math_requirements.txt b/samples/tools/testbed/includes/math_requirements.txt
new file mode 100644
index 000000000000..0600c8ce047a
--- /dev/null
+++ b/samples/tools/testbed/includes/math_requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/microsoft/autogen.git
+sympy
+matplotlib
+numpy
diff --git a/samples/tools/testbed/includes/requirements.txt b/samples/tools/testbed/includes/requirements.txt
index 8f88664e8bba..33070268d1f7 100644
--- a/samples/tools/testbed/includes/requirements.txt
+++ b/samples/tools/testbed/includes/requirements.txt
@@ -1 +1,5 @@
git+https://github.com/microsoft/autogen.git
+pandas
+beautifulsoup4
+requests
+pytest
diff --git a/samples/tools/testbed/run_scenarios.py b/samples/tools/testbed/run_scenarios.py
index 6020a7591a4b..059d97345d67 100644
--- a/samples/tools/testbed/run_scenarios.py
+++ b/samples/tools/testbed/run_scenarios.py
@@ -15,8 +15,12 @@
# Location of the global includes dir. The contents of this directory will be copied to the Docker environment.
GLOBAL_INCLUDES_DIR = "includes"
+# This is the tag given to the image that is *built* when no other image is provided.
+# Do not use this field to specify the name of an existing image (e.g., on Dockerhub)
+DEFAULT_DOCKER_IMAGE_TAG = "autogen/testbed:default"
-def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, results_dir="results"):
+
+def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, docker_image=None, results_dir="results"):
"""
Run a set testbed scenarios a given number of times.
@@ -103,7 +107,7 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
if is_native:
run_scenario_natively(results_repetition)
else:
- run_scenario_in_docker(results_repetition, requirements)
+ run_scenario_in_docker(results_repetition, requirements, docker_image=docker_image)
def expand_scenario(scenario_dir, scenario, output_dir):
@@ -244,7 +248,7 @@ def run_scenario_natively(work_dir):
return
-def run_scenario_in_docker(work_dir, requirements, timeout=600):
+def run_scenario_in_docker(work_dir, requirements, timeout=600, docker_image=None):
"""
Run a scenario in a Docker environment.
@@ -253,20 +257,34 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
timeout (Optional, int): the number of seconds to allow a Docker container to run before timing out
"""
- # Create a docker client
client = docker.from_env()
- image_name = "python:3.11"
-
- # Pull a suitable image
- try:
- image = client.images.get(image_name)
- except docker.errors.ImageNotFound:
- # pull the image
- print("Pulling image", image_name)
+ image = None
+
+ # If the docker_image is None, then we will fetch DEFAULT_DOCKER_IMAGE_TAG, if present,
+ # or build it if missing.
+ if docker_image is None:
+ # Pull a suitable image
+ try:
+ image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
+ except docker.errors.ImageNotFound:
+ print(f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes...")
+ try:
+ build_default_docker_image(client, DEFAULT_DOCKER_IMAGE_TAG)
+ image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
+ except docker.errors.DockerException:
+ print(f"Failed to build image '{DEFAULT_DOCKER_IMAGE_TAG}'")
+
+ # Otherwise get the requested image
+ else:
try:
- image = client.images.pull(image_name)
- except docker.errors.DockerException:
- print("Failed to pull image", image_name)
+ image = client.images.get(docker_image)
+ except docker.errors.ImageNotFound:
+ # pull the image
+ print(f"Pulling image '{docker_image}'")
+ try:
+ image = client.images.pull(docker_image)
+ except docker.errors.DockerException:
+ print(f"Failed to pull image '{docker_image}'")
# Prepare the run script
with open(os.path.join(work_dir, "run.sh"), "wt", newline="\n") as f:
@@ -351,6 +369,12 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
f.write(logs)
+def build_default_docker_image(docker_client, image_tag):
+ for segment in docker_client.api.build(path=".", dockerfile="Dockerfile", rm=True, tag=image_tag, decode=True):
+ if "stream" in segment:
+ sys.stdout.write(segment["stream"])
+
+
###############################################################################
if __name__ == "__main__":
script_name = os.path.basename(__file__)
@@ -382,6 +406,15 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
+ "' directory. (default: requirements.txt)",
default=None,
)
+ parser.add_argument(
+ "-d",
+ "--docker-image",
+ type=str,
+ help="The Docker image to use when running scenarios. Can not be used together with --native. (default: '"
+ + DEFAULT_DOCKER_IMAGE_TAG
+ + "', which will be created if not present)",
+ default=None,
+ )
parser.add_argument(
"--native",
action="store_true",
@@ -395,6 +428,10 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
if len(config_list) == 0:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), args.config)
+ # Don't allow both --docker-image and --native on the same command
+ if args.docker_image is not None and args.native:
+ sys.exit("The options --native and --docker-image can not be used together. Exiting.")
+
# Warn if running natively
if args.native:
if IS_WIN32:
@@ -434,4 +471,4 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
f"The environment file '{env_file}' does not exist (perhaps this is your first time setting up the testbed). A default environment file has been provided, but you may want to edit it to include your API keys and configurations.\n"
)
- run_scenarios(args.scenario, args.repeat, is_native, config_list, requirements)
+ run_scenarios(args.scenario, args.repeat, is_native, config_list, requirements, docker_image=args.docker_image)
diff --git a/samples/tools/testbed/scenarios/AutoGPT/README.md b/samples/tools/testbed/scenarios/AutoGPT/README.md
index 5e737a087cf8..db08a0af4844 100644
--- a/samples/tools/testbed/scenarios/AutoGPT/README.md
+++ b/samples/tools/testbed/scenarios/AutoGPT/README.md
@@ -1,3 +1,3 @@
The AutoGPT style tasks are contained in folder `challenges`.
-Run `python utils/prepare_data.py` to convert the tasks to jsonl format compatible for evaluation.
+Run `python ../../utils/prepare_autogpt.py` to convert the tasks to the jsonl format used for evaluation.
diff --git a/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/check.py b/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/check.py
index cc015278c21c..57043d5695a8 100644
--- a/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/check.py
+++ b/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/check.py
@@ -3,6 +3,7 @@
import os
import subprocess
import sys
+import shutil
def scoring(content: str, should_contain: list, should_not_contain: list):
@@ -28,7 +29,6 @@ def scoring(content: str, should_contain: list, should_not_contain: list):
def check():
- workspace = "coding"
files_contents = []
scores = []
@@ -54,9 +54,11 @@ def check():
for file_path in matching_files:
if eval_type == "python":
+ # copy the test file to working directory
+ shutil.copy(f"../custom_python/{file_path}", "./")
result = subprocess.run(
[sys.executable, file_path],
- cwd=os.path.abspath(workspace),
+ cwd=os.path.abspath("./"),
capture_output=True,
text=True,
)
diff --git a/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/scenario.py b/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/scenario.py
index 4add9d00cc8f..e43b8f05df1e 100644
--- a/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/scenario.py
+++ b/samples/tools/testbed/scenarios/AutoGPT/Templates/TwoAgents/scenario.py
@@ -24,15 +24,15 @@
"work_dir": work_dir,
"use_docker": False,
},
- max_consecutive_auto_reply=10,
+ max_consecutive_auto_reply=5,
# default_auto_reply="TERMINATE",
)
if target_folder:
# The tasks involves reading from a file then do sth to it.
message = """
- Your task is to: __TASK__ The file you needed is located in this directory: '__TARGET_FOLDER__'. You should save the output files in this directory: './'
- Use the following command to check if all the unit tests have passed:
+ Here is the task description: __TASK__ The file you needed is located in this directory: '__TARGET_FOLDER__'. You should save the output files in the current directory: './'
+ Run the following command to check if all the unit tests have passed:
```bash
python ../check.py
```
@@ -40,8 +40,8 @@
"""
else:
message = """
- Your task is to: __TASK__
- Use the following command to check if all the unit tests have passed:
+ Here is the task description: __TASK__
+ Run the following command to check if all the unit tests have passed:
```bash
python ../check.py
```
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/custom_python/test_pwd.py b/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/custom_python/test_pwd.py
new file mode 100644
index 000000000000..86ce911ab8b5
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/custom_python/test_pwd.py
@@ -0,0 +1,25 @@
+import unittest
+
+import password_generator
+
+
+class TestPasswordGenerator(unittest.TestCase):
+ def test_password_length(self):
+ for i in range(8, 17):
+ password = password_generator.generate_password(i)
+ self.assertEqual(len(password), i)
+
+ def test_value_error(self):
+ with self.assertRaises(ValueError):
+ password_generator.generate_password(7)
+ with self.assertRaises(ValueError):
+ password_generator.generate_password(17)
+
+ def test_password_content(self):
+ password = password_generator.generate_password()
+ self.assertTrue(any(c.isdigit() for c in password))
+ self.assertTrue(any(c in password_generator.string.punctuation for c in password))
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/data.json
new file mode 100644
index 000000000000..40af0e3c160e
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/10_password_generator/data.json
@@ -0,0 +1,28 @@
+{
+ "category": [
+ "coding"
+ ],
+ "cutoff": 90,
+ "dependencies": [
+ "TestThreeSum"
+ ],
+ "eval_id": "ac75c471-e0ce-400c-ba9a-fb72aaab444f",
+ "ground": {
+ "answer": "password_generator.py is created and satisfies the requirements.",
+ "eval": {
+ "type": "python"
+ },
+ "files": [
+ "test_pwd.py"
+ ],
+ "should_contain": [],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can create a random password generator.",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "PasswordGenerator",
+ "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain at least one letter, number and symbol. The password should be printed to the console. If no length is specified, the password should be 8 characters long. The password_generator should be imported as a module and called as password = password_generator.generate_password(length=x). Any invalid input should raise a ValueError."
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/custom_python/test_file_organize.py b/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/custom_python/test_file_organize.py
new file mode 100644
index 000000000000..fa39119eda00
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/custom_python/test_file_organize.py
@@ -0,0 +1,41 @@
+import os
+import subprocess
+import tempfile
+import unittest
+
+
+class TestOrganizeFiles(unittest.TestCase):
+ def setUp(self):
+ # Create temporary directory
+ self.test_dir = tempfile.mkdtemp()
+
+ # File types and their corresponding directory
+ self.file_types = {
+ "test_image.png": "images",
+ "test_doc.txt": "documents",
+ "test_audio.mp3": "audio",
+ }
+
+ # Create test files
+ for file_name in self.file_types.keys():
+ open(os.path.join(self.test_dir, file_name), "a").close()
+
+ def test_organize_files(self):
+ # Call the organize_files.py script using subprocess
+ subprocess.call(["python", "organize_files.py", "--directory_path=" + self.test_dir])
+
+ # Check if the files have been moved to the correct directories
+ for file_name, directory in self.file_types.items():
+ self.assertTrue(os.path.isfile(os.path.join(self.test_dir, directory, file_name)))
+
+ def tearDown(self):
+ # Delete test directory and its contents
+ for file_name, directory in self.file_types.items():
+ os.remove(os.path.join(self.test_dir, directory, file_name))
+ for directory in set(self.file_types.values()):
+ os.rmdir(os.path.join(self.test_dir, directory))
+ os.rmdir(self.test_dir)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/data.json
new file mode 100644
index 000000000000..031a789c2fad
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/11_file_organizer/data.json
@@ -0,0 +1,29 @@
+{
+ "category": [
+ "coding",
+ "general"
+ ],
+ "cutoff": 90,
+ "dependencies": [
+ "TestPasswordGenerator"
+ ],
+ "eval_id": "029c1e6f-2b36-451e-bca6-60063b827d2e",
+ "ground": {
+ "answer": "The correct python file is written and organizes the files accordingly",
+ "eval": {
+ "type": "python"
+ },
+ "files": [
+ "test_file_organize.py"
+ ],
+ "should_contain": [],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can create a file organizer.",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "FileOrganizer",
+ "task": "Create a file organizer CLI tool in Python that sorts files in a directory based on their file types (e.g., images, documents, audio) and moves them into these corresponding folders: 'images', 'documents', 'audio'. The entry point will be a python file that can be run this way: python organize_files.py --directory_path=YOUR_DIRECTORY_PATH"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/custom_python/test_url_shorten.py b/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/custom_python/test_url_shorten.py
new file mode 100644
index 000000000000..c3daffa80b0a
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/custom_python/test_url_shorten.py
@@ -0,0 +1,22 @@
+import unittest
+
+from url_shortener import retrieve_url, shorten_url
+
+
+class TestURLShortener(unittest.TestCase):
+ def test_url_retrieval(self):
+ # Shorten the URL to get its shortened form
+ shortened_url = shorten_url("https://www.example.com")
+
+ # Retrieve the original URL using the shortened URL directly
+ retrieved_url = retrieve_url(shortened_url)
+
+ self.assertEqual(
+ retrieved_url,
+ "https://www.example.com",
+ "Retrieved URL does not match the original!",
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/data.json
new file mode 100644
index 000000000000..8fb2b6fc6ec0
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/12_url_shortener/data.json
@@ -0,0 +1,28 @@
+{
+ "category": [
+ "coding"
+ ],
+ "cutoff": 150,
+ "dependencies": [
+ "TestFileOrganizer"
+ ],
+ "eval_id": "8106fd7f-83fd-496e-9513-280f4a3f012c",
+ "ground": {
+ "answer": "The correct python file for a basic url shortener CLI",
+ "eval": {
+ "type": "python"
+ },
+ "files": [
+ "test_url_shorten.py"
+ ],
+ "should_contain": [],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can create a URL shortener.",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "UrlShortener",
+ "task": "Build a basic URL shortener using a python CLI. Here are the specifications.\n\nFunctionality: The program should have two primary functionalities.\n\nShorten a given URL.\nRetrieve the original URL from a shortened URL.\n\nCLI: The command-line interface should accept a URL as its first input. It should be able to determine if the url is a shortened url or not. If the url is not shortened, it will display ONLY the shortened url, otherwise, it will display ONLY the original unshortened URL. Afterwards, it should prompt the user for another URL to process.\n\nTechnical specifications:\nBuild a file called url_shortener.py. This file will be called through command lines. Do not write your own test cases or any unit test code.\n\nEdge cases:\nFor the sake of simplicity, there will be no edge cases, you can assume the input is always correct and the user immediately passes the shortened version of the url he just shortened.\n\nYou will be expected to create a python file called url_shortener.py that will function through imported as a module.\n\nThe url_shortener.py will be tested this way:\n```\nimport unittest\nfrom url_shortener import shorten_url, retrieve_url\n\nclass TestURLShortener(unittest.TestCase):\n def test_url_retrieval(self):\n # Shorten the URL to get its shortened form\n shortened_url = shorten_url('https://www.example.com')\n\n # Retrieve the original URL using the shortened URL directly\n retrieved_url = retrieve_url(shortened_url)\n\n self.assertEqual(retrieved_url, 'https://www.example.com', \"Retrieved URL does not match the original!\")\n\nif __name__ == \"__main__\":\n unittest.main()\n```"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/custom_python/test_tictactoe.py b/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/custom_python/test_tictactoe.py
new file mode 100644
index 000000000000..94b77820894b
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/custom_python/test_tictactoe.py
@@ -0,0 +1,41 @@
+import subprocess
+
+import pytest
+
+
+def run_game_with_inputs(inputs):
+ # Start the game process
+ process = subprocess.Popen(
+ ["python", "tic_tac_toe.py"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ )
+
+ # Send the input moves one by one
+ output, errors = process.communicate("\n".join(inputs))
+
+ # Print the inputs and outputs
+ print("Inputs:\n", "\n".join(inputs))
+ print("Output:\n", output)
+ print("Errors:\n", errors)
+
+ return output
+
+
+@pytest.mark.parametrize(
+ "inputs, expected_output",
+ [
+ (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
+ (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
+ (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"),
+ ],
+)
+def test_game(inputs, expected_output):
+ output = run_game_with_inputs(inputs)
+ assert expected_output in output
+
+
+if __name__ == "__main__":
+ pytest.main()
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/data.json
new file mode 100644
index 000000000000..288f02742bf6
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/13_tic_tac_toe/data.json
@@ -0,0 +1,29 @@
+{
+ "category": [
+ "coding",
+ "general"
+ ],
+ "cutoff": 150,
+ "dependencies": [
+ "TestUrlShortener"
+ ],
+ "eval_id": "504b1648-e14a-4982-8b27-074598eb4fd0",
+ "ground": {
+ "answer": "The correct python file for a TicTacToe game is written",
+ "eval": {
+ "type": "python"
+ },
+ "files": [
+ "test_tictactoe.py"
+ ],
+ "should_contain": [],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can create Tic-Tac-Toe game",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "TicTacToe",
+ "task": "Build a Tic-Tac-Toe game using a python CLI. Here are the specifications.\n\nThe Grid: The game board is a 3x3 grid, consisting of 3 rows and 3 columns, creating a total of 9 squares.\n\nPlayers: There are two players. One player uses the number \"1\", and the other player uses the number \"2\".\n\nTaking Turns: Players take turns to put their respective numbers (\"1\" or \"2\") in an empty square of the grid. Once a player has placed their number in a square, it cannot be changed or removed.\n\nObjective: The goal is to get three of your numbers in a row, either horizontally, vertically, or diagonally.\n\nEnd of the Game: The game concludes in one of two ways: One player gets three of their numbers in a row (horizontally, vertically, or diagonally) and is declared the winner.\nAll squares on the grid are filled, and no player has three in a row. This situation is a \"draw\" or a \"tie\".\n\nTechnical specifications:\nBuild a file called tic_tac_toe.py. This file will be called through command lines. You will have to prompt users for their move. Player 1 will always start.\nPlayers will input their move in the following format: \"x,y\" where x and y represent the location in the grid (0,0 is top left, 2,2 is bottom right).\n\nYour primary requirement is to halt the game when appropriate and to print only one of these three exact sentences:\n\n\"Player 1 won!\"\n\"Player 2 won!\"\n\"Draw\"\n\nEdge cases: A player can send an incorrect location. Either the location is incorrect or the square is already filled. 
In this case, this counts as doing nothing, and the player gets prompted for new locations again.\n\n\nYou will be expected to create a python file called tic_tac_toe.py that will run through command lines by using ```python tic_tac_toe.py```.\n\nHere is an example of how your tic_tac_toe.py game will be tested.\n```\nprocess = subprocess.Popen(\n ['python', 'tic_tac_toe.py'],\n stdout=subprocess.PIPE,\n text=True\n)\n\noutput, _ = process.communicate('\\n'.join([\"0,0\", \"1,0\", \"0,1\", \"1,1\", \"0,2\"]))\n\nassert \"Player 1 won!\" in output\n```"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file1.csv b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file1.csv
new file mode 100644
index 000000000000..fe552d6774ef
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file1.csv
@@ -0,0 +1,4 @@
+ID,Name,Age
+101,John,28
+102,Alice,34
+103,Bob,45
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file2.csv b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file2.csv
new file mode 100644
index 000000000000..685e24f4b9f0
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/artifacts_in/file2.csv
@@ -0,0 +1,4 @@
+ID,Occupation,Salary
+101,Engineer,80000
+102,Doctor,120000
+103,Lawyer,95000
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/data.json
new file mode 100644
index 000000000000..1943803c7856
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/2_combine_csv/data.json
@@ -0,0 +1,32 @@
+{
+ "category": [
+ "data",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestSortCsv"
+ ],
+ "eval_id": "52467beb-b951-4356-9776-9a0ae46bb33b",
+ "ground": {
+ "answer": "The csv data is combined",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.csv"
+ ],
+ "should_contain": [
+ "Age,ID,Name,Occupation,Salary\n28,101,John,Engineer,80000\n34,102,Alice,Doctor,120000\n45,103,Bob,Lawyer,95000"
+ ]
+ },
+ "info": {
+ "description": "Tests if the agent can combine data from a csv",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "CombineCsv",
+ "task": "The csvs 'file1.csv' and 'file2.csv' both have a column 'ID'. Combine these 2 csvs using the 'ID' column. Then sort the rows by 'ID' in ascending order, sort the columns alphabetically. Write the output in 'output.csv'."
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/artifacts_in/file1.csv b/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/artifacts_in/file1.csv
new file mode 100644
index 000000000000..8f5826e9f775
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/artifacts_in/file1.csv
@@ -0,0 +1,12 @@
+Date,Description,Amount,Category
+2023-01-01,Grocery Store,52.3,Groceries
+2023-01-02,Pharmacy,12.5,Healthcare
+2023-01-03,Gas Station,29.1,Transportation
+2023-01-04,Water,19,Utilities
+2023-01-05,Grocery Store,60.25,Groceries
+2023-01-06,Coffee Shop,4.5,Dining
+2023-01-07,Cinema Tickets,20,Entertainment
+2023-01-08,Book Store,30.4,Shopping
+2023-01-09,Restaurant Dinner,55.8,Dining
+2023-01-10,Electric Bill,65.35,Utilities
+2023-01-11,Grocery Store,45.1,Groceries
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/data.json
new file mode 100644
index 000000000000..639f7946adf2
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/3_qa_small_csv/data.json
@@ -0,0 +1,32 @@
+{
+ "category": [
+ "data",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestReadFile"
+ ],
+ "eval_id": "9df3f07a-5047-488f-b788-1e1f57eba970",
+ "ground": {
+ "answer": "The correct amount spent on utilities.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "84"
+ ]
+ },
+ "info": {
+ "description": "Tests if the agent can answer a question from a small csv",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "AnswerQuestionSmallCsv",
+  "task": "Read and analyze 'file1.csv', then answer the question: How much was spent on utilities in total? Write the answer in file 'output.txt'."
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/artifacts_in/file1.csv b/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/artifacts_in/file1.csv
new file mode 100644
index 000000000000..cb22e61c466d
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/artifacts_in/file1.csv
@@ -0,0 +1,305 @@
+Date,Description,Amount,Category
+2023-01-01,Grocery Store,52.3,Groceries
+2023-01-02,Pharmacy,12.5,Healthcare
+2023-01-03,Gas Station,29.1,Transportation
+2023-01-04,Cinema Tickets,19,Entertainment
+2023-01-05,Grocery Store,60.25,Groceries
+2023-01-06,Coffee Shop,4.5,Dining
+2023-01-07,Cinema Tickets,20,Entertainment
+2023-01-08,Book Store,30.4,Shopping
+2023-01-09,Restaurant Dinner,55.8,Dining
+2023-01-10,Electric Bill,65.35,Utilities
+2023-01-11,Grocery Store,45.1,Groceries
+2023-01-12,Clothing Store,100.2,Shopping
+2023-01-13,Pharmacy,20.3,Healthcare
+2023-01-14,Coffee Shop,4.5,Dining
+2023-01-15,Restaurant Dinner,50,Dining
+2023-01-16,Gas Station,32.1,Transportation
+2023-01-17,Online Shopping,80,Shopping
+2023-01-18,Water Bill,20.35,Utilities
+2023-01-19,Grocery Store,55.6,Groceries
+2023-01-20,Gas Station,28,Transportation
+2023-01-21,Pharmacy,15.4,Healthcare
+2023-01-22,Phone Bill,40,Utilities
+2023-01-23,Cinema Tickets,20,Entertainment
+2023-01-24,Coffee Shop,5.5,Dining
+2023-01-25,Book Purchase,14,Shopping
+2023-01-26,Restaurant Lunch,30,Dining
+2023-01-27,Public Transport,20,Transportation
+2023-01-28,Grocery Store,58.25,Groceries
+2023-01-29,Online Shopping,70,Shopping
+2023-01-30,Grocery Store,62.1,Groceries
+2023-01-31,Medical Prescription,10.4,Healthcare
+2023-02-01,Gas Station,33,Transportation
+2023-02-02,Coffee Shop,6,Dining
+2023-02-03,Cinema Tickets,22,Entertainment
+2023-02-04,Book Store,28.4,Shopping
+2023-02-05,Internet Bill,50,Utilities
+2023-02-06,Grocery Store,60.1,Groceries
+2023-02-07,Clothing Store,120,Shopping
+2023-02-08,Grocery Store,58.25,Groceries
+2023-02-09,Coffee Shop,4.5,Dining
+2023-02-10,Electric Bill,70,Utilities
+2023-02-11,Grocery Store,50.1,Groceries
+2023-02-12,Public Transport,18,Transportation
+2023-02-13,Pharmacy,24,Healthcare
+2023-02-14,Restaurant Dinner,60,Dining
+2023-02-15,Medical Prescription,11.4,Healthcare
+2023-02-16,Gas Station,30,Transportation
+2023-02-17,Online Shopping,85,Shopping
+2023-02-18,Water Bill,18,Utilities
+2023-02-19,Grocery Store,53.6,Groceries
+2023-02-20,Public Transport,22,Transportation
+2023-02-21,Pharmacy,10,Healthcare
+2023-02-22,Phone Bill,42,Utilities
+2023-02-23,Cinema Tickets,24,Entertainment
+2023-02-24,Coffee Shop,6,Dining
+2023-02-25,Book Purchase,16,Shopping
+2023-02-26,Restaurant Lunch,28,Dining
+2023-02-27,Gas Station,34,Transportation
+2023-02-28,Grocery Store,56,Groceries
+2023-03-01,Online Shopping,90,Groceries
+2023-03-02,Dentist Appointment,130,Healthcare
+2023-03-03,Grocery Store,63.45,Groceries
+2023-03-04,Cinema Tickets,21,Entertainment
+2023-03-05,Coffee Shop,5.8,Dining
+2023-03-06,Electric Bill,67.5,Utilities
+2023-03-07,Gas Station,31.2,Transportation
+2023-03-08,Restaurant Dinner,58,Dining
+2023-03-09,Pharmacy,18.3,Healthcare
+2023-03-10,Grocery Store,64.7,Groceries
+2023-03-11,Book Store,25.4,Shopping
+2023-03-12,Online Shopping,78,Shopping
+2023-03-13,Coffee Shop,6.5,Dining
+2023-03-14,Museum Tickets,15,Entertainment
+2023-03-15,Internet Bill,52,Utilities
+2023-03-16,Public Transport,19.5,Transportation
+2023-03-17,Clothing Store,105.6,Shopping
+2023-03-18,Phone Bill,41,Utilities
+2023-03-19,Coffee Shop,5,Dining
+2023-03-20,Grocery Store,59.2,Groceries
+2023-03-21,Gas Station,29.8,Transportation
+2023-03-22,Restaurant Lunch,32,Dining
+2023-03-23,Pharmacy,16.5,Healthcare
+2023-03-24,Concert Tickets,50,Entertainment
+2023-03-25,Coffee Shop,5.5,Dining
+2023-03-26,Grocery Store,61.8,Groceries
+2023-03-27,Online Shopping,82,Shopping
+2023-03-28,Water Bill,19.35,Utilities
+2023-03-29,Public Transport,21,Transportation
+2023-03-30,Book Purchase,17,Shopping
+2023-03-31,Grocery Store,60,Groceries
+2023-04-01,Cinema Tickets,23,Entertainment
+2023-04-02,Pharmacy,17.4,Healthcare
+2023-04-03,Gas Station,33.5,Transportation
+2023-04-04,Restaurant Dinner,56.7,Dining
+2023-04-05,Grocery Store,65.3,Groceries
+2023-04-06,Coffee Shop,5.9,Dining
+2023-04-07,Online Shopping,87,Shopping
+2023-04-08,Electric Bill,69,Utilities
+2023-04-09,Clothing Store,112.5,Shopping
+2023-04-10,Grocery Store,57.4,Groceries
+2023-04-11,Book Store,26.3,Shopping
+2023-04-12,Gas Station,30.9,Transportation
+2023-04-13,Coffee Shop,6.8,Dining
+2023-04-14,Zoo Tickets,24,Entertainment
+2023-04-15,Internet Bill,53,Utilities
+2023-04-16,Public Transport,20.5,Transportation
+2023-04-17,Restaurant Lunch,34,Dining
+2023-04-18,Phone Bill,43,Utilities
+2023-04-19,Coffee Shop,5.2,Dining
+2023-04-20,Grocery Store,58.9,Groceries
+2023-04-21,Pharmacy,14.7,Healthcare
+2023-04-22,Cinema Tickets,25,Entertainment
+2023-04-23,Online Shopping,90,Shopping
+2023-04-24,Gas Station,31.4,Transportation
+2023-04-25,Water Bill,21,Utilities
+2023-04-26,Grocery Store,62.5,Groceries
+2023-04-27,Coffee Shop,5.7,Dining
+2023-04-28,Book Purchase,18.5,Shopping
+2023-04-29,Public Transport,22,Transportation
+2023-04-30,Grocery Store,63,Groceries
+2023-05-01,Theater Tickets,45,Entertainment
+2023-05-02,Dentist Appointment,135,Healthcare
+2023-05-03,Gas Station,32.2,Transportation
+2023-05-04,Restaurant Dinner,59,Dining
+2023-05-05,Grocery Store,66.1,Groceries
+2023-05-06,Coffee Shop,6,Dining
+2023-05-07,Online Shopping,89,Shopping
+2023-05-08,Electric Bill,70.5,Utilities
+2023-05-09,Clothing Store,110,Shopping
+2023-05-10,Grocery Store,59.7,Groceries
+2023-05-11,Coffee Shop,6.1,Dining
+2023-05-12,Book Store,29.2,Shopping
+2023-05-13,Gas Station,29.9,Transportation
+2023-05-14,Museum Tickets,16,Entertainment
+2023-05-15,Internet Bill,52.5,Utilities
+2023-05-16,Public Transport,21.3,Transportation
+2023-05-17,Restaurant Lunch,35.4,Dining
+2023-05-18,Phone Bill,43.5,Utilities
+2023-05-19,Grocery Store,64.8,Groceries
+2023-05-20,Pharmacy,15.2,Healthcare
+2023-05-21,Cinema Tickets,26,Entertainment
+2023-05-22,Coffee Shop,6.3,Dining
+2023-05-23,Gas Station,30.8,Transportation
+2023-05-24,Online Shopping,92.5,Shopping
+2023-05-25,Water Bill,20.5,Utilities
+2023-05-26,Grocery Store,61.9,Groceries
+2023-05-27,Public Transport,23,Transportation
+2023-05-28,Book Purchase,19,Shopping
+2023-05-29,Coffee Shop,5.9,Dining
+2023-05-30,Restaurant Dinner,57.8,Dining
+2023-05-31,Grocery Store,66.7,Groceries
+2023-06-01,Theater Tickets,47,Entertainment
+2023-06-02,Dentist Appointment,140,Healthcare
+2023-06-03,Gas Station,31.6,Transportation
+2023-06-04,Coffee Shop,6.4,Dining
+2023-06-05,Online Shopping,94,Shopping
+2023-06-06,Electric Bill,72,Utilities
+2023-06-07,Restaurant Lunch,36,Dining
+2023-06-08,Grocery Store,65.3,Groceries
+2023-06-09,Pharmacy,17,Healthcare
+2023-06-10,Cinema Tickets,27.5,Entertainment
+2023-06-11,Public Transport,21.5,Transportation
+2023-06-12,Book Store,30,Shopping
+2023-06-13,Gas Station,28.7,Transportation
+2023-06-14,Coffee Shop,6.6,Dining
+2023-06-15,Internet Bill,53.5,Utilities
+2023-06-16,Zoo Tickets,28,Entertainment
+2023-06-17,Grocery Store,67.4,Groceries
+2023-06-18,Phone Bill,44,Utilities
+2023-06-19,Restaurant Dinner,60,Dining
+2023-06-20,Coffee Shop,6.7,Dining
+2023-06-21,Public Transport,22.5,Transportation
+2023-06-22,Online Shopping,96,Shopping
+2023-06-23,Gas Station,32.4,Transportation
+2023-06-24,Cinema Tickets,29,Entertainment
+2023-06-25,Book Purchase,20,Shopping
+2023-06-26,Grocery Store,68.3,Groceries
+2023-06-27,Water Bill,22,Utilities
+2023-06-28,Pharmacy,18.5,Healthcare
+2023-06-29,Restaurant Lunch,37,Dining
+2023-06-30,Coffee Shop,7,Dining
+2023-07-01,Grocery Store,69.5,Groceries
+2023-07-02,Theater Tickets,49,Entertainment
+2023-07-03,Gas Station,33.2,Transportation
+2023-07-04,Park Picnic,40,Dining
+2023-07-05,Electric Bill,73.5,Utilities
+2023-07-06,Clothing Store,120,Shopping
+2023-07-07,Online Shopping,98,Shopping
+2023-07-08,Grocery Store,70.6,Groceries
+2023-07-09,Coffee Shop,7.1,Dining
+2023-07-10,Internet Bill,54,Utilities
+2023-07-11,Public Transport,23.5,Transportation
+2023-07-12,Museum Tickets,18,Entertainment
+2023-07-13,Book Store,31,Shopping
+2023-07-14,Gas Station,29.9,Transportation
+2023-07-15,Coffee Shop,7.2,Dining
+2023-07-16,Restaurant Dinner,62,Dining
+2023-07-17,Grocery Store,71.8,Groceries
+2023-07-18,Phone Bill,45,Utilities
+2023-07-19,Zoo Tickets,30,Entertainment
+2023-07-20,Coffee Shop,7.3,Dining
+2023-07-21,Public Transport,24,Transportation
+2023-07-22,Online Shopping,99.5,Shopping
+2023-07-23,Gas Station,34,Transportation
+2023-07-24,Cinema Tickets,31,Entertainment
+2023-07-25,Book Purchase,21.5,Shopping
+2023-07-26,Grocery Store,72.9,Groceries
+2023-07-27,Water Bill,23.5,Utilities
+2023-07-28,Pharmacy,19.5,Healthcare
+2023-07-29,Restaurant Lunch,38.5,Dining
+2023-07-30,Coffee Shop,7.4,Dining
+2023-07-31,Grocery Store,73.7,Groceries
+2023-08-01,Theater Tickets,50,Entertainment
+2023-08-02,Gas Station,34.5,Transportation
+2023-08-03,Restaurant Dinner,63.5,Dining
+2023-08-04,Online Shopping,101,Shopping
+2023-08-05,Electric Bill,75,Utilities
+2023-08-06,Grocery Store,74.6,Groceries
+2023-08-07,Coffee Shop,7.5,Dining
+2023-08-08,Phone Bill,46,Utilities
+2023-08-09,Public Transport,24.5,Transportation
+2023-08-10,Cinema Tickets,32.5,Entertainment
+2023-08-11,Book Store,32,Shopping
+2023-08-12,Gas Station,35,Transportation
+2023-08-13,Coffee Shop,7.6,Dining
+2023-08-14,Park Picnic,42,Dining
+2023-08-15,Internet Bill,55,Utilities
+2023-08-16,Grocery Store,76.3,Groceries
+2023-08-17,Clothing Store,125,Shopping
+2023-08-18,Pharmacy,20.5,Healthcare
+2023-08-19,Restaurant Lunch,40,Dining
+2023-08-20,Coffee Shop,7.7,Dining
+2023-08-21,Museum Tickets,19,Entertainment
+2023-08-22,Public Transport,25,Transportation
+2023-08-23,Online Shopping,103,Shopping
+2023-08-24,Grocery Store,77.8,Groceries
+2023-08-25,Water Bill,24.5,Utilities
+2023-08-26,Zoo Tickets,32,Entertainment
+2023-08-27,Coffee Shop,7.8,Dining
+2023-08-28,Gas Station,35.5,Transportation
+2023-08-29,Book Purchase,23,Shopping
+2023-08-30,Grocery Store,78.9,Groceries
+2023-08-31,Cinema Tickets,34,Entertainment
+2023-09-01,Theater Tickets,52,Entertainment
+2023-09-02,Gas Station,36,Transportation
+2023-09-03,Restaurant Dinner,65,Dining
+2023-09-04,Online Shopping,105,Shopping
+2023-09-05,Electric Bill,76.5,Utilities
+2023-09-06,Grocery Store,79.6,Groceries
+2023-09-07,Coffee Shop,8,Dining
+2023-09-08,Phone Bill,47,Utilities
+2023-09-09,Public Transport,26,Transportation
+2023-09-10,Cinema Tickets,35.5,Entertainment
+2023-09-11,Book Store,33,Shopping
+2023-09-12,Gas Station,36.5,Transportation
+2023-09-13,Coffee Shop,8.2,Dining
+2023-09-14,Park Picnic,44,Dining
+2023-09-15,Internet Bill,56,Utilities
+2023-09-16,Grocery Store,80.4,Groceries
+2023-09-17,Clothing Store,130,Shopping
+2023-09-18,Pharmacy,21.5,Healthcare
+2023-09-19,Restaurant Lunch,41.5,Dining
+2023-09-20,Coffee Shop,8.4,Dining
+2023-09-21,Museum Tickets,20,Entertainment
+2023-09-22,Public Transport,26.5,Transportation
+2023-09-23,Online Shopping,107,Shopping
+2023-09-24,Grocery Store,81.3,Groceries
+2023-09-25,Water Bill,25.5,Utilities
+2023-09-26,Zoo Tickets,33.5,Entertainment
+2023-09-27,Coffee Shop,8.6,Dining
+2023-09-28,Gas Station,37.5,Transportation
+2023-09-29,Book Purchase,24.5,Shopping
+2023-09-30,Grocery Store,82.7,Groceries
+2023-10-01,Cinema Tickets,36,Entertainment
+2023-10-02,Theater Tickets,54,Entertainment
+2023-10-03,Gas Station,38,Transportation
+2023-10-04,Restaurant Dinner,66.5,Dining
+2023-10-05,Online Shopping,109,Shopping
+2023-10-06,Electric Bill,78,Utilities
+2023-10-07,Grocery Store,83.9,Groceries
+2023-10-08,Coffee Shop,8.8,Dining
+2023-10-09,Phone Bill,48,Utilities
+2023-10-10,Public Transport,27.5,Transportation
+2023-10-11,Cinema Tickets,37.5,Entertainment
+2023-10-12,Book Store,34.5,Shopping
+2023-10-13,Gas Station,39.5,Transportation
+2023-10-14,Coffee Shop,9,Dining
+2023-10-15,Park Picnic,46,Dining
+2023-10-16,Internet Bill,57.5,Utilities
+2023-10-17,Grocery Store,85.2,Groceries
+2023-10-18,Clothing Store,135,Shopping
+2023-10-19,Pharmacy,22.5,Healthcare
+2023-10-20,Restaurant Lunch,43,Dining
+2023-10-21,Coffee Shop,9.2,Dining
+2023-10-22,Museum Tickets,21.5,Entertainment
+2023-10-23,Public Transport,28,Transportation
+2023-10-24,Online Shopping,111,Shopping
+2023-10-25,Grocery Store,86.5,Groceries
+2023-10-26,Water Bill,26.5,Utilities
+2023-10-27,Zoo Tickets,35,Entertainment
+2023-10-28,Coffee Shop,9.4,Dining
+2023-10-29,Gas Station,40.5,Transportation
+2023-10-30,Book Purchase,26,Shopping
+2023-10-31,Grocery Store,88,Groceries
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/data.json
new file mode 100644
index 000000000000..6647a4397779
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/4_qa_csv/data.json
@@ -0,0 +1,31 @@
+{
+ "category": [
+ "data"
+ ],
+ "cutoff": 90,
+ "dependencies": [
+ "TestAnswerQuestionSmallCsv"
+ ],
+ "eval_id": "bb6e0a4b-7faf-4aa6-a524-548cddbc2732",
+ "ground": {
+ "answer": "The correct amount spent on utilities.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "1861"
+ ]
+ },
+ "info": {
+ "description": "Tests if the agent can answer a question from a csv",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "AnswerQuestionCsv",
+  "task": "Read and analyze 'file1.csv', then answer the question: How much was spent on utilities in total? Write the answer in 'output.txt'."
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/5_search/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/5_search/data.json
new file mode 100644
index 000000000000..3a174bbe277b
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/5_search/data.json
@@ -0,0 +1,37 @@
+{
+ "category": [
+ "general",
+ "scrape_synthesize"
+ ],
+ "cutoff": 120,
+ "dependencies": [
+ "TestWriteFile"
+ ],
+ "eval_id": "0bb23182-b434-402b-a73e-9c226469b959",
+ "ground": {
+ "answer": "This is a Heading\nThis is a paragraph.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ ".txt"
+ ],
+ "should_contain": [
+ "Heading",
+ "paragraph"
+ ],
+ "should_not_contain": [
+ "The",
+ "the"
+ ]
+ },
+ "info": {
+ "description": "Tests if the agent can search.",
+ "difficulty": "interface",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "Search",
+ "task": "Open 'https://silennaihin.com/random/plain.html' and paste all of the text on the page in a .txt file. Save it to the current directory: './' "
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/6_book_price/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/6_book_price/data.json
new file mode 100644
index 000000000000..86328ebd7087
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/6_book_price/data.json
@@ -0,0 +1,31 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestSearch"
+ ],
+ "eval_id": "cd96e6b2-779d-4a4a-8367-d520023e27ae",
+ "ground": {
+ "answer": "\u00a325.89",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ ".txt"
+ ],
+ "should_contain": [
+ "25.89"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can retrieve a specific information from a website.",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "BasicRetrieval",
+ "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file. Save it to the current directory: './'"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/7_revenue/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/7_revenue/data.json
new file mode 100644
index 000000000000..923c1e36a78f
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/7_revenue/data.json
@@ -0,0 +1,31 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestBasicRetrieval"
+ ],
+ "eval_id": "dc2114d7-1597-4c9b-bed0-a97937ad977f",
+ "ground": {
+ "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ ".txt"
+ ],
+ "should_contain": [
+ "81,462"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
+ "difficulty": "intermediate",
+ "side_effects": []
+ },
+ "name": "RevenueRetrieval",
+ "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million). Save the file to the current directory: './'"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/8_get_information/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/8_get_information/data.json
new file mode 100644
index 000000000000..f198d2e4e61b
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/8_get_information/data.json
@@ -0,0 +1,35 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestRevenueRetrieval"
+ ],
+ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
+ "ground": {
+ "answer": "The twitter handles of the two hosts of Latent Space.",
+ "case_sensitive": false,
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "swyx",
+ "FanaHOVA"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can retrieve twitter handles given a vague description.",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+  "name": "GetInformation",
+ "task": "Write the twitter handle of the two hosts of Latent Space to a file called output.txt"
+}
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/custom_python/test_three_sum.py b/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/custom_python/test_three_sum.py
new file mode 100644
index 000000000000..0cc4811f44a2
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/custom_python/test_three_sum.py
@@ -0,0 +1,32 @@
+# mypy: ignore-errors
+from typing import List
+
+from sample_code import three_sum
+
+
+def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None:
+ result = three_sum(nums, target)
+ print(result)
+ assert (
+ result == expected_result
+ ), f"AssertionError: Given input nums: {nums}, target: {target}\nExpected the output to be {expected_result}, got {result} instead."
+
+
+if __name__ == "__main__":
+ # test the trivial case with the first three numbers
+ nums = [2, 7, 11, 15]
+ target = 20
+ expected_result = [0, 1, 2]
+ test_three_sum(nums, target, expected_result)
+
+ # test for ability to use zero and the same number twice
+ nums = [2, 7, 0, 15, 12, 0]
+ target = 2
+ expected_result = [0, 2, 5]
+ test_three_sum(nums, target, expected_result)
+
+ # test for first and last index usage and negative numbers
+ nums = [-6, 7, 11, 4]
+ target = 9
+ expected_result = [0, 2, 3]
+ test_three_sum(nums, target, expected_result)
diff --git a/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/data.json b/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/data.json
new file mode 100644
index 000000000000..bf93a9e6b8df
--- /dev/null
+++ b/samples/tools/testbed/scenarios/AutoGPT/challenges/9_three_sum/data.json
@@ -0,0 +1,33 @@
+{
+ "category": [
+ "coding",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestWriteFile"
+ ],
+ "eval_id": "a1ff38a4-1032-4bf2-960a-3b927f9936f4",
+ "ground": {
+ "answer": "The three_sum function coded properly.",
+ "eval": {
+ "type": "python"
+ },
+ "files": [
+ "test_three_sum.py"
+ ],
+ "should_contain": [
+ "[0, 1, 2]",
+ "[0, 2, 5]",
+ "[0, 2, 3]"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can create the three_sum function.",
+ "difficulty": "basic",
+ "side_effects": []
+ },
+ "name": "ThreeSum",
+ "task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you should not use the same element twice. The input array is NOT sorted. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2]."
+}
diff --git a/samples/tools/testbed/scenarios/MATH/README.md b/samples/tools/testbed/scenarios/MATH/README.md
new file mode 100644
index 000000000000..7fea2cd0f4bd
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/README.md
@@ -0,0 +1,27 @@
+## Get json file to run
+
+This will convert the math problems to json format and put it in the `scenarios/MATH` folder.
+```sh
+cd samples/tools/testbed/
+python scenarios/MATH/problems_to_json.py
+```
+
+## Run the testbed
+
+Note: this will first run autogen on the math problems, and then use a LLM as answer checker to check the answers.
+This means the results are not 100% accurate.
+
+```sh
+python run_scenarios.py scenarios/MATH/problems.jsonl -c --requirements math_requirements.txt
+```
+
+## Get the correct count
+Use `--path` or `-p` to specify the path to the problem directory, the default is `./results/problems/`, which is the default save path of this testbed.
+```sh
+python scenarios/MATH/count_correct_math.py --path ./results/problems/
+```
+
+Example output:
+```
+Trial 0 | Total Correct: 10 | Total Problems: 17
+```
diff --git a/samples/tools/testbed/scenarios/MATH/answer.txt b/samples/tools/testbed/scenarios/MATH/answer.txt
new file mode 100644
index 000000000000..42844f73f66d
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/answer.txt
@@ -0,0 +1 @@
+__ANSWER__
diff --git a/samples/tools/testbed/scenarios/MATH/count_correct_math.py b/samples/tools/testbed/scenarios/MATH/count_correct_math.py
new file mode 100644
index 000000000000..69766dfb0c5d
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/count_correct_math.py
@@ -0,0 +1,56 @@
+import argparse
+import json
+import os
+
+
+def main(args):
+ stars = "*" * 100
+
+ # initiate the correct count for each trial
+ correct_count = [0 for i in range(args.num_trials)]
+
+ for i in range(args.num_trials):
+ for problem_name in os.listdir(args.path):
+ problem_path = os.path.join(args.path, problem_name, str(i))
+ if os.path.isdir(problem_path):
+ checker_file_path = os.path.join(problem_path, "checker_messages.json")
+
+ with open(checker_file_path, "r") as file:
+ checker_messages = json.load(file)
+
+ check_result = checker_messages["checker_proxy"][-1]["content"].lower()
+
+ if (
+ "the answer is correct" in check_result
+ or "the answer is approximated but should be correct" in check_result
+ ):
+ correct_count[i] += 1
+ # print(f"{problem_name} | Correct")
+ # else:
+ # print(f"{problem_name} | Wrong")
+
+ print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(os.listdir(args.path))}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Print Math Problems results.""".strip(),
+ )
+ parser.add_argument(
+ "--path",
+ "-p",
+ type=str,
+ default="./results/problems/",
+ help="Path to the problems directory",
+ )
+ # num trials
+ parser.add_argument(
+ "--num_trials",
+ "-n",
+ type=int,
+ default=1,
+ help="Number of trials to check",
+ )
+
+ args = parser.parse_args()
+ main(args)
diff --git a/samples/tools/testbed/scenarios/MATH/problems_to_json.py b/samples/tools/testbed/scenarios/MATH/problems_to_json.py
new file mode 100644
index 000000000000..436217a8c251
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/problems_to_json.py
@@ -0,0 +1,77 @@
+import json
+
+problems = [
+ "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
+ "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]",
+ "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.",
+ "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}",
+ "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?",
+ "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.",
+ "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.",
+ "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?",
+ "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$",
+ "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.",
+ "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]",
+ "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?",
+ "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
+ "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. When Camera A and Camera B take their last picture together, how many minutes before noon is it?",
+ "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$",
+ "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. How many people are wearing all three items?",
+ "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$",
+]
+answers = [
+ # 6 algebra
+ "(-\\infty, -14)\\cup(-3,\\infty)",
+ "93",
+ "4x-5y=-50",
+ "-5",
+ "2",
+ "(-\\infty,0]\\cup[4,\\infty)",
+ # 11 problems, 2 from each category, (1 algebra is deleted)
+ "\\frac{10}{9}",
+ "\\frac{1}{2}",
+ "-588",
+ " \\frac{1}{13}",
+ "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]",
+ "23",
+ "116",
+ "41",
+ "43",
+ "10",
+ "\\frac{5\\sqrt{42}}{27}",
+]
+
+
+def problem_to_json():
+ with open("problems.jsonl", "w") as f:
+ for i, problem in enumerate(problems):
+ # a = {
+ # 'id': f'problem{i}',
+ # 'template': 'scenario.py',
+ # 'substitutions': {
+ # '__PROMPT__': problem,
+ # '__ANSWER__': answers[i],
+ # },
+ # }
+ a = {
+ "id": f"problem{i}",
+ "template": "./",
+ "substitutions": {"prompt.txt": {"__PROMPT__": problem}, "answer.txt": {"__ANSWER__": answers[i]}},
+ }
+ # Convert the dictionary to a JSON string and write it to the file
+ json_string = json.dumps(a)
+ f.write(json_string + "\n") # Add a newline character after each JSON object
+
+
+problem_to_json()
+
+problems = []
+with open("problems.jsonl", "r") as file:
+ for line in file:
+ # Parse each line as a JSON object
+ problem = json.loads(line)
+ problems.append(problem)
+ print(problem["substitutions"])
+ print()
+
+# Now 'problems' is a list of dictionaries, each representing a problem
diff --git a/samples/tools/testbed/scenarios/MATH/prompt.txt b/samples/tools/testbed/scenarios/MATH/prompt.txt
new file mode 100644
index 000000000000..482f50dca311
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/prompt.txt
@@ -0,0 +1 @@
+__PROMPT__
diff --git a/samples/tools/testbed/scenarios/MATH/scenario.py b/samples/tools/testbed/scenarios/MATH/scenario.py
new file mode 100644
index 000000000000..89cdfad1aee0
--- /dev/null
+++ b/samples/tools/testbed/scenarios/MATH/scenario.py
@@ -0,0 +1,97 @@
+import os
+import json
+import autogen
+
+import testbed_utils
+
+testbed_utils.init()
+
+
+PROMPT = ""
+with open("prompt.txt", "rt") as fh:
+ PROMPT = fh.read()
+
+ANSWER = ""
+with open("answer.txt", "rt") as fh:
+ ANSWER = fh.read()
+
+
+####################
+config_list = autogen.config_list_from_json(
+ "OAI_CONFIG_LIST",
+ filter_dict={"model": ["gpt40613"]},
+)
+llm_config = {
+ "cache_seed": 42,
+ "config_list": config_list,
+ "timeout": 600,
+}
+code_execution_config = {
+ "work_dir": "coding",
+ "use_docker": False, # set to True or image name like "python:3" to use docker
+}
+# ---------between "user" and "assistant"---------
+assistant = autogen.AssistantAgent(name="assistant", llm_config=llm_config)
+user_proxy = autogen.UserProxyAgent(
+ name="user",
+ human_input_mode="NEVER",
+ code_execution_config=code_execution_config,
+ max_consecutive_auto_reply=10,
+ is_termination_msg=lambda x: x.get("content", "")
+ and (x.get("content", "").rstrip().endswith("TERMINATE") or x.get("content", "").rstrip().endswith("TERMINATE.")),
+)
+
+user_proxy.initiate_chat(assistant, message=PROMPT)
+
+
+# --------- extract reply ---------
+response_with_ans = ""
+messages = assistant._oai_messages[user_proxy]
+for j in range(len(messages) - 1, -1, -1):
+ if (
+ messages[j]["role"] == "assistant"
+ and messages[j]["content"].strip() != "TERMINATE"
+ and messages[j]["content"].strip() != "TERMINATE."
+ ):
+ response_with_ans = messages[j]["content"]
+ break
+
+
+# ---------between "answer_checker" and "checker_proxy"---------
+# define answer checker chat
+
+check_sys_msg = """You are a helpful AI assistant. You will use your coding and language skills to verify the answer.
+You are given:
+ 1. A problem.
+ 2. A reply with the answer to the problem.
+ 3. A ground truth answer.
+Please do the following:
+1. Extract the answer in the reply: "The answer is ".
+2. Check whether the answer in the reply matches the ground truth answer. When comparison is not obvious (for example, 3*\\sqrt(6) and 7.348), you may write code to check the answer and wait for the user to execute the code.
+3. After everything is done, please choose a reply from the following options:
+ - "The answer is correct."
+ - "The answer is approximated but should be correct. Correct Answer: | Answer extracted: ."
+ - "The answer is incorrect. Correct Answer: | Answer extracted: ."
+ - "The reply doesn't contain an answer." """
+
+answer_checker = autogen.AssistantAgent(name="checker", llm_config=llm_config, system_message=check_sys_msg)
+checker_proxy = autogen.UserProxyAgent(
+ name="checker_proxy",
+ human_input_mode="NEVER",
+ code_execution_config=code_execution_config,
+ max_consecutive_auto_reply=5,
+ is_termination_msg=lambda x: x.get("content", "").lower()
+ and (
+ "the answer is correct" in x.get("content", "").lower()
+ or "the answer is incorrect" in x.get("content", "").lower()
+ or "the reply doesn't contain an answer" in x.get("content", "").lower()
+ or "the answer is approximated but should be correct" in x.get("content", "").lower()
+ ),
+)
+
+message_to_check = "Problem: " + PROMPT + f"\n\nReply: {response_with_ans}\n\nGround truth answer: " + ANSWER
+checker_proxy.initiate_chat(answer_checker, message=message_to_check)
+
+
+####################
+testbed_utils.finalize(agents=[assistant, user_proxy, answer_checker, checker_proxy])
diff --git a/samples/tools/testbed/utils/collate_autogpt.py b/samples/tools/testbed/utils/collate_autogpt.py
new file mode 100644
index 000000000000..ddde5b793fe8
--- /dev/null
+++ b/samples/tools/testbed/utils/collate_autogpt.py
@@ -0,0 +1,108 @@
+import argparse
+import os
+import re
+import subprocess
+import sys
+
+
+def collate(results_dir="results"):
+ """
+ Collate the results of running AutoGPT test.
+
+ Args:
+ results_dir (str, optional): The folder where results are saved. Defaults to "results".
+ """
+
+ all_results = list()
+ max_instances = 0
+
+ for test_name in os.listdir(results_dir):
+ test_path = os.path.join(results_dir, test_name)
+
+ # Collect the results vector
+ results = [test_name]
+
+ instance = 0
+ instance_dir = os.path.join(test_path, str(instance))
+ while os.path.isdir(instance_dir):
+ console_log = os.path.join(instance_dir, "console_log.txt")
+ if os.path.isfile(console_log):
+ with open(console_log, "rt") as fh:
+ content = fh.read()
+ if "ALL TESTS PASSED!" in content:
+ # Ideally we would have a more distinctive pattern.
+ results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content))))
+ else:
+ # Sometimes the task actually succeeds, but the check.py isn't properly called
+ result = subprocess.run(
+ [sys.executable, "../check.py"],
+ cwd=os.path.join(instance_dir, "coding"),
+ capture_output=True,
+ text=True,
+ )
+ if "error" in result.stderr or result.returncode != 0:
+ results.append("-1")
+ else:
+ # The task actually succeeds.
+ if "ALL TESTS PASSED!" in result.stdout:
+ results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content))))
+ else:
+ results.append("-1")
+ else:
+ # Missing results will appear as blanks
+ results.append("")
+
+ instance += 1
+ instance_dir = os.path.join(test_path, str(instance))
+
+ max_instances = max(max_instances, instance)
+
+ # Buffer the results
+ all_results.append(results)
+
+ # Create a header
+ header = "TestName"
+ for i in range(0, max_instances):
+ header += ",Trial" + str(i)
+ print(header)
+
+ # Print a fully-populated table of results
+ for r in all_results:
+ while len(r) < max_instances + 1:
+ r.append("")
+ print(",".join(r))
+
+
+if __name__ == "__main__":
+ script_path = os.path.realpath(__file__)
+ script_name = os.path.basename(script_path)
+ script_dir = os.path.dirname(script_path)
+
+ # Path to the default results directory
+    # (relative to this script, up one directory, then into the results folder)
+ default_results_dir = os.path.realpath(os.path.join(script_dir, os.path.pardir, "results"))
+
+ parser = argparse.ArgumentParser(
+ description=f"""
+{script_name} will collate the results of the AutoGPT scenarios and output them to a CSV. The CSV format is as follows:
+
+TestName, Trial0, Trial1, ..., TrialN
+Test_1, x_10, x_11, ..., X_1N
+Test_2, x_20, x_21, ..., X_2N
+...
+Test_M, x_M0, x_M1, ..., X_MN
+
+
+Where x_ij is the number of AssistantAgent conversation turns needed to pass all the tests for problem i, in Trial/repetition j. If the agent was not able to pass the tests by the end of the conversation, the value will be -1. If data for the trial is missing, the value will be an empty string "".
+""".strip(),
+ formatter_class=argparse.RawTextHelpFormatter,
+ )
+
+ parser.add_argument(
+ "scenario",
+ nargs="?",
+ help="Path to the scenario results. (default: " + default_results_dir + ")",
+ default=default_results_dir,
+ )
+ args = parser.parse_args()
+ collate(args.scenario)
diff --git a/samples/tools/testbed/utils/collate_human_eval.py b/samples/tools/testbed/utils/collate_human_eval.py
index 19dcaac2774a..594051dc2b63 100644
--- a/samples/tools/testbed/utils/collate_human_eval.py
+++ b/samples/tools/testbed/utils/collate_human_eval.py
@@ -10,7 +10,7 @@ def collate(results_dir):
Collate the results of running human eval.
Args:
- results_dir (path): The folder were results were be saved.
+ results_dir (path): The folder where results are saved.
"""
all_results = list()
@@ -19,7 +19,7 @@ def collate(results_dir):
for test_id in os.listdir(results_dir):
test_path = os.path.join(results_dir, test_id)
- # Collect the reslts vector
+ # Collect the results vector
results = [test_id]
instance = 0
diff --git a/samples/tools/testbed/scenarios/AutoGPT/utils/prepare_data.py b/samples/tools/testbed/utils/prepare_autogpt.py
similarity index 69%
rename from samples/tools/testbed/scenarios/AutoGPT/utils/prepare_data.py
rename to samples/tools/testbed/utils/prepare_autogpt.py
index f298b15e8c5a..9da279735452 100644
--- a/samples/tools/testbed/scenarios/AutoGPT/utils/prepare_data.py
+++ b/samples/tools/testbed/utils/prepare_autogpt.py
@@ -4,7 +4,9 @@
import os
import shutil
-data_paths = glob.glob("challenges/*/data.json")
+current_file_dir = os.path.dirname(os.path.abspath(__file__))
+challenge_path = os.path.join(os.path.dirname(current_file_dir), "scenarios/AutoGPT/challenges")
+data_paths = glob.glob(str(challenge_path) + "/*/data.json")
for data_path in data_paths:
print("Converting data path: ", data_path)
@@ -29,21 +31,32 @@
should_not_contain_base64.append(encoded_word)
# copy all the files needed to 'coding' directory
+ # 1. 'artifacts_in' directory: all the files needed for QA
+ save_path = os.path.join(os.path.dirname(current_file_dir), "scenarios/AutoGPT")
artifacts_in = False
if os.path.exists(os.path.join(workspace, "artifacts_in")):
artifacts_in = True
- target_folder = os.path.join("Templates/TwoAgents/coding/file", data["name"])
+ target_folder = os.path.join(save_path, "Templates/TwoAgents/coding/file", data["name"])
if os.path.exists(target_folder):
shutil.rmtree(target_folder)
shutil.copytree(os.path.join(workspace, "artifacts_in"), target_folder)
# print(f"All the artifacts are copied from {os.path.join(workspace, 'artifacts_in')} to {target_folder}")
+ # 2. 'custom_python' directory: all the files needed for testing python code
+ if os.path.exists(os.path.join(workspace, "custom_python")):
+ target_folder = os.path.join(save_path, "Templates/TwoAgents/custom_python")
+ if not os.path.exists(target_folder):
+ os.makedirs(target_folder)
+ for filename in os.listdir(os.path.join(workspace, "custom_python")):
+ shutil.copy(os.path.join(workspace, "custom_python", filename), os.path.join(target_folder, filename))
+ # print(f"File copied from {os.path.join(workspace, 'custom_python', filename)} to {target_folder}")
+
record = {
- "id": data["eval_id"],
+ "id": data["name"],
"template": "Templates/TwoAgents",
"substitutions": {
"scenario.py": {
- "__MODEL__": "gpt-3.5-turbo-16k",
+ "__MODEL__": "gpt-35-turbo-16k",
"__TASK__": data["task"],
"__TARGET_FOLDER__": f"file/{data['name']}" if artifacts_in else "",
},
@@ -60,11 +73,11 @@
},
},
}
- with open(f"{data['name']}_gpt35.jsonl", "wt") as f:
+ with open(os.path.join(save_path, "autogpt_twoagent_gpt35.jsonl"), "a") as f:
f.write(json.dumps(record).strip() + "\n")
record = {
- "id": data["eval_id"],
+ "id": data["name"],
"template": "Templates/TwoAgents",
"substitutions": {
"scenario.py": {
@@ -85,5 +98,5 @@
},
},
}
- with open(f"{data['name']}_gpt4.jsonl", "wt") as f:
+ with open(os.path.join(save_path, "autogpt_twoagent_gpt4.jsonl"), "a") as f:
f.write(json.dumps(record).strip() + "\n")
diff --git a/test/agentchat/contrib/example_test_agent_builder_config.json b/test/agentchat/contrib/example_test_agent_builder_config.json
index 109ea0f20bed..9251cca3e79a 100644
--- a/test/agentchat/contrib/example_test_agent_builder_config.json
+++ b/test/agentchat/contrib/example_test_agent_builder_config.json
@@ -2,24 +2,29 @@
"building_task": "Find a paper on arxiv by programming, and analyze its application in some domain. For example, find a recent paper about gpt-4 on arxiv and find its potential applications in software.",
"agent_configs": [
{
- "name": "Data_scientist",
- "model": "gpt-4-1106-preview",
- "system_message": "As a Data Scientist, you will:\n\n- Utilize your advanced coding skills specifically in Python to automate information gathering from various sources including web scraping, file downloads, and parsing data. This may include writing Python scripts to retrieve and present the latest research papers from preprint services like arXiv.\n- Apply your analytical acumen to conduct thorough examinations of the technical materials you gather, especially focusing on their practical applications within different domains, such as software development in the case of GPT-4 research papers.\n- Perform data processing tasks that may involve complex algorithmic work, statistical analysis, or machine learning methodologies to extract insights and build models based on the gathered information, executing Python code as necessary to accomplish these tasks.\n- Present findings with clarity, extracting and interpreting results solely from the execution of Python scripts you've crafted. Use 'print' functions adequately in your Python code to ensure all results are clear and interpretable.\n- Be diligent in checking the viability and correctness of your code and analysis. When errors occur, address them promptly and provide corrected Python code for execution.\n- Remain adaptive to the dynamic field of data science, continually seeking additional relevant information when required, and revising your approach to problem-solving as needed.\n- Persistently strive for the successful completion of the task at hand, ready to pursue alternative strategies in case initial methods fall short of fulfilling the task's requirements.\n- Conclude any sequence of task-related interactions with a final confirmation that the user's needs have been met, signifying the end of the process by replying \"TERMINATE\"."
+ "name": "Data_Scientist",
+ "model": "gpt-4",
+ "system_message": "You are a proficient Data Scientist with strong Python skills and the ability to analyze academic papers, particularly from arxiv in the domain of programming. Ideally, your tasks involve identifying significant work in the field, such as recent papers on topics like gpt-4, and evaluating their potential applications in areas like software. You should be confident in providing outputs in the form of recommendations, insights, or analytical summaries based solely on the result of your analysis without any additional user feedback or actions. \n\nDetails of your work should include: \n\n 1. Identifying and obtaining the information needed for your task, such as browsing or searching the web, downloading/reading a file, printing the content of a webpage or a file. You'll use Python code to achieve these and more. The output should be comprehensive enough that your following steps based on data analysis can be conducted without requiring any user intervention.\n 2. Performing your main task, which is executing Python code to extract insights and applying your data science expertise to analyze those insights. You will present these results in a manner that satisfies the user's goals without needing further modification or user input. \n 3. Explaining your work in a step-by-step manner. If a plan is not provided initially, you need to formulate and explain your plan first. Clearly distinguish between steps involving coding and those dependent on your data science skills.\n 4. Indicating any errors in the code execution and proposing immediate fixes. If a fix isn't possible, or if the results don't satisfy the goals even after successful execution, you need to adjust your approach accordingly.\n 5. Verifying your results to ensure accuracy. 
If verifiable evidence can be provided to support your conclusion, make sure to include it in your response.\n \nWhen the task is completed to the satisfaction of the user, you should recognize this and reply with \"TERMINATE\"."
},
{
- "name": "Domain_expert",
- "model": "gpt-4-1106-preview",
- "system_message": "As a Domain Expert, you leverage your deep understanding and analytical abilities to provide insights and applications of new findings in scholarly articles. Your role focuses on identifying, interpreting, and discussing the implications of cutting-edge research in a specific domain. You will:\n\n1. Employ Python programming to autonomously locate and retrieve academic papers from databases such as arXiv. This involves formulating queries, processing search results, and downloading relevant documents using automated scripts.\n\n2. Analyze and synthesize the information contained within the located papers, with a particular emphasis on assessing their applications in the specified domain. Your language skills will be pivotal in understanding complex scientific texts and elucidating their potential impact on real-world problems and industry practices.\n\n3. Clearly communicate your findings and developed applications, providing comprehensive insights into how the content of the research paper can be utilized or integrated into existing systems or processes within your domain of expertise.\n\n4. Your work will be structured and systematic, starting from the initial programming stage to the final analysis and communication. Each phase should be clearly demarcated, with an explanation of your methodology and steps taken.\n\n5. Ensure all coding is provided in Python, and your guidance will be executed directly without the need for user modifications or intervention beyond the execution of provided scripts.\n\n6. You will manage any encountered issues during the process, including correcting errors in code and revising your approach based on the results obtained from script execution.\n\n7. Upon completing your task and providing a thorough analysis, confirm your final output and conclude the interaction with the statement \"TERMINATE,\" signaling the successful satisfaction of the user's need."
+ "name": "Machine_Learning_Engineer",
+ "model": "gpt-4",
+ "system_message": "As a Machine Learning Engineer, your primary tasks involve researching, developing, and applying machine learning and data analysis for complex tasks. In relation to the task at hand, you are expected to find a paper on arxiv using programming techniques, analyze the paper, and discuss its applications in a specific domain, using GPT-4 as an example.\n\nYou will need expertise in Python for implementing your programming skills. If any additional information is required, utilize Python scripts to collect, retrieve, and present the required data by browsing or searching the internet, downloading or reading a file, printing content from a webpage or a file, retrieving the current date/time, or checking the operating system.\n\nUpon collecting the necessary information, use your professional judgment to analyze the data and solve the task at hand. Ensure to perform each task comprehensively and intelligently, presenting each step clearly, specifying when Python code was used and when it was purely your analytical skills. Specify the type of script used in the code block while suggesting a one-time executable Python code to the user, making sure that the code doesn't need modification or addition by the user. If necessary, instruct the user on how to store code into a file prior to execution.\n\nAlways confirm the execution results returned by the user. If there is an error in the execution, you are to correct the error, provide the user with the corrected full script, and prevent suggesting partial or incomplete codes. If an issue persists, revisit your assumptions, gather more data, and consider alternate approaches. Whenever you attain a solution to a task, carefully validate the answer and provide verifiable evidence where possible.\n\nLastly, reply \"TERMINATE\" once the task is complete and all needs have been addressed."
},
{
- "name": "Software_engineer",
- "model": "gpt-4-1106-preview",
- "system_message": "As a skilled Software Engineer, your primary role is to leverage your coding expertise, particularly in Python, to facilitate the discovery and analysis of academic papers on arXiv, and to evaluate their real-world applications. \n\n1. You are expected to craft Python scripts capable of web tasks such as searching for academic papers, downloading and reading files, extracting and presenting content, as well as recognizing the current date/time and operating system details. Your script should output all necessary information for task completion.\n\n2. You should use Python scripts to accomplish specific tasks, ensuring that the script completes the task autonomously and provides the results to the user.\n\nYour responsibilities involve executing tasks in a systematic manner, clarifying your approach when a plan is not provided. Clearly distinguish between steps that involve executing Python code and those that engage your analytical skills. \n\nAlways present your Python code within a code block, ensuring it is ready for immediate execution without requiring modifications from the user. Here is how you should format a code suggestion:\n```python\n# Python code goes here\n```\n\nIf a script is to be saved before execution, indicate the filename at the beginning of the code block. Do not include multiple code blocks in a single interaction or ask users to manually copy results \u2014 use the `print` function within the script to display outputs. After providing a script, review the user's execution result. In case of an error, deliver a corrected script. If the task remains unsolved despite error-free execution, reassess your approach, gather more information if needed, and try a different strategy.\n\nEnsure that your solution is methodically verified and, where possible, supported by verifiable evidence.\n\nConclude your interaction by replying \u201cTERMINATE\u201d once the task is complete and the user\u2019s need has been satisfied. 
\n\nRemember, while your role is to assist with a task, it is also to enable and educate, ultimately fostering a user's understanding and their ability to independently solve similar problems in the future."
+ "name": "Research_Analyst",
+ "model": "gpt-4",
+ "system_message": "You are a proficient Research Analyst with a knack for finding and interpreting cutting-edge research in technical fields. Your ability to use Python programming to search, collect and present relevant information is a substantial part of your role.\n\nCarrying out tasks, such as navigating web platforms and downloading/reading files, requires expert use of Python code for execution. You can create detailed scripts like browsing the internet, printing webpage content or a file, obtaining the current date and time, and confirming the operating system. Once enough information has been amassed, harness your understanding of the subject matter to solve the task without the need for more code.\n\nDemonstrating intelligent problem-solving, as well as precise and efficient code execution, is paramount in this job. Perform tasks smartly and in a planned sequence if required. If a plan isn't given, outline your own first.\n\nBe especially clear about the steps that necessitate code and those that use your language competence. Specify the script type within Python code blocks, and ensure the code does not need to be altered by the user before execution. There should be only one code block per response.\n\nIf you need to save codes in a file, signify this by starting your Python code block with # filename: . Avoid asking the user to copy and paste results. Instead, generate output using the Python 'print' function.\n\nScrutinize the user's execution results and if an error crops up, rectify it immediately. Focus on providing the complete code rather than partial code snippets. If an error persists despite numerous attempts, reassess your assumptions, gather more information if needed, and explore different problem-solving strategies.\n\nPrecision is key when fruitful answers come into view. 
Strive for careful validation of all answers and, if feasible, include verifiable evidence in your post.\n\nOnce all matters have been diligently addressed, calmly respond back with \"TERMINATE\" to indicate the successful completion of the task."
}
],
- "manager_system_message": "Group chat manager.",
"coding": true,
"default_llm_config": {
"temperature": 0
+ },
+ "code_execution_config": {
+ "last_n_messages": 2,
+ "work_dir": "/home/elpis_ubuntu/autogen/test/agentchat/contrib/test_agent_scripts",
+ "timeout": 60,
+ "use_docker": false
}
}
diff --git a/test/agentchat/test_groupchat.py b/test/agentchat/test_groupchat.py
index 27fda1fd5249..6d592ae3fa3d 100644
--- a/test/agentchat/test_groupchat.py
+++ b/test/agentchat/test_groupchat.py
@@ -187,7 +187,7 @@ def _test_n_agents_less_than_3(method):
messages=[],
max_round=6,
speaker_selection_method=method,
- allow_repeat_speaker=True if method == "random" else False,
+ allow_repeat_speaker=[agent1, agent2] if method == "random" else False,
)
group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)
agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
@@ -434,7 +434,7 @@ def test_next_agent():
# test_broadcast()
# test_chat_manager()
# test_plugin()
- # test_speaker_selection_method()
+ test_speaker_selection_method()
# test_n_agents_less_than_3()
# test_agent_mentions()
# test_termination()
diff --git a/test/test_notebook.py b/test/test_notebook.py
index 68f0d9bb1135..54e6c47273ef 100644
--- a/test/test_notebook.py
+++ b/test/test_notebook.py
@@ -68,6 +68,14 @@ def test_agentchat_function_call(save=False):
run_notebook("agentchat_function_call.ipynb", save=save)
+@pytest.mark.skipif(
+ skip or not sys.version.startswith("3.10"),
+ reason="do not run if openai is not installed or py!=3.10",
+)
+def test_agentchat_function_call_async(save=False):
+ run_notebook("agentchat_function_call_async.ipynb", save=save)
+
+
@pytest.mark.skipif(
skip or not sys.version.startswith("3.10"),
reason="do not run if openai is not installed or py!=3.10",
diff --git a/test/test_token_count.py b/test/test_token_count.py
index 1da4ccabeab3..8fe6e36660aa 100644
--- a/test/test_token_count.py
+++ b/test/test_token_count.py
@@ -1,4 +1,10 @@
-from autogen.token_count_utils import count_token, num_tokens_from_functions, token_left, percentile_used
+from autogen.token_count_utils import (
+ count_token,
+ num_tokens_from_functions,
+ token_left,
+ percentile_used,
+ get_max_token_limit,
+)
import pytest
func1 = {
@@ -67,6 +73,14 @@ def test_count_token():
assert percentile_used(text) == 10 / 4096
+def test_model_aliases():
+ assert get_max_token_limit("gpt35-turbo") == get_max_token_limit("gpt-3.5-turbo")
+ assert get_max_token_limit("gpt-35-turbo") == get_max_token_limit("gpt-3.5-turbo")
+ assert get_max_token_limit("gpt4") == get_max_token_limit("gpt-4")
+ assert get_max_token_limit("gpt4-32k") == get_max_token_limit("gpt-4-32k")
+
+
if __name__ == "__main__":
- test_num_tokens_from_functions()
- test_count_token()
+ # test_num_tokens_from_functions()
+ # test_count_token()
+ test_model_aliases()
diff --git a/website/docs/Examples.md b/website/docs/Examples.md
index fb9c2000e4e5..2dbed78b39f3 100644
--- a/website/docs/Examples.md
+++ b/website/docs/Examples.md
@@ -33,10 +33,12 @@ Links to notebook examples:
- **Web Search**: Solve Tasks Requiring Web Info - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_web_info.ipynb)
- Use Provided Tools as Functions - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_function_call.ipynb)
+ - Use Tools via Sync and Async Function Calling - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_function_call_async.ipynb)
- Task Solving with Langchain Provided Tools as Functions - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_langchain.ipynb)
- **RAG**: Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent) - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
- Function Inception: Enable AutoGen agents to update/remove functions during conversations. - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_inception_function.ipynb)
- Agent Chat with Whisper - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_video_transcript_translate_with_whisper.ipynb)
+ - Constrained Responses via Guidance - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_guidance.ipynb)
1. **Human Involvement**
- Simple example in ChatGPT style [View example](https://github.com/microsoft/autogen/blob/main/samples/simple_chat.py)
- Auto Code Generation, Execution, Debugging and **Human Feedback** - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_human_feedback.ipynb)
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index 2de92589db29..390d4def3d71 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -1,77 +1,77 @@
/** @type {import('@docusaurus/types').DocusaurusConfig} */
-const math = require('remark-math');
-const katex = require('rehype-katex');
+const math = require("remark-math");
+const katex = require("rehype-katex");
module.exports = {
- title: 'AutoGen',
- tagline: 'Enable Next-Gen Large Language Model Applications',
- url: 'https://microsoft.github.io',
- baseUrl: '/autogen/',
- onBrokenLinks: 'throw',
- onBrokenMarkdownLinks: 'warn',
- favicon: 'img/ag.ico',
- organizationName: 'Microsoft', // Usually your GitHub org/user name.
- projectName: 'AutoGen', // Usually your repo name.
+ title: "AutoGen",
+ tagline: "Enable Next-Gen Large Language Model Applications",
+ url: "https://microsoft.github.io",
+ baseUrl: "/autogen/",
+ onBrokenLinks: "throw",
+ onBrokenMarkdownLinks: "warn",
+ favicon: "img/ag.ico",
+ organizationName: "Microsoft", // Usually your GitHub org/user name.
+ projectName: "AutoGen", // Usually your repo name.
themeConfig: {
navbar: {
- title: 'AutoGen',
+ title: "AutoGen",
logo: {
- alt: 'AutoGen',
- src: 'img/ag.svg',
+ alt: "AutoGen",
+ src: "img/ag.svg",
},
items: [
{
- type: 'doc',
- docId: 'Getting-Started',
- position: 'left',
- label: 'Docs',
+ type: "doc",
+ docId: "Getting-Started",
+ position: "left",
+ label: "Docs",
},
{
- type: 'doc',
- docId: 'reference/agentchat/conversable_agent',
- position: 'left',
- label: 'SDK',
+ type: "doc",
+ docId: "reference/agentchat/conversable_agent",
+ position: "left",
+ label: "SDK",
},
- {to: 'blog', label: 'Blog', position: 'left'},
+ { to: "blog", label: "Blog", position: "left" },
{
- type: 'doc',
- docId: 'FAQ',
- position: 'left',
- label: 'FAQ',
+ type: "doc",
+ docId: "FAQ",
+ position: "left",
+ label: "FAQ",
},
{
- href: 'https://github.com/microsoft/autogen',
- label: 'GitHub',
- position: 'right',
+ href: "https://github.com/microsoft/autogen",
+ label: "GitHub",
+ position: "right",
},
// {
// to: 'examples',
// label: 'Examples',
// },
{
- type: 'doc',
- docId: 'Examples',
- position: 'left',
- label: 'Examples',
+ type: "doc",
+ docId: "Examples",
+ position: "left",
+ label: "Examples",
},
{
- label: 'Resources',
- type: 'dropdown',
+ label: "Resources",
+ type: "dropdown",
items: [
{
- type: 'doc',
- docId: 'Ecosystem',
+ type: "doc",
+ docId: "Ecosystem",
},
{
- type: 'doc',
- docId: 'Gallery',
+ type: "doc",
+ docId: "Gallery",
},
],
},
],
},
footer: {
- style: 'dark',
+ style: "dark",
links: [
// {
// title: 'Docs',
@@ -83,29 +83,29 @@ module.exports = {
// ],
// },
{
- title: 'Community',
+ title: "Community",
items: [
- // // {
- // // label: 'Stack Overflow',
- // // href: 'https://stackoverflow.com/questions/tagged/pymarlin',
- // // },
+ // // {
+ // // label: 'Stack Overflow',
+ // // href: 'https://stackoverflow.com/questions/tagged/pymarlin',
+ // // },
{
- label: 'Discord',
- href: 'https://discord.gg/pAbnFJrkgZ',
+ label: "Discord",
+ href: "https://discord.gg/pAbnFJrkgZ",
},
{
- label: 'Twitter',
- href: 'https://twitter.com/pyautogen',
+ label: "Twitter",
+ href: "https://twitter.com/pyautogen",
},
],
},
],
- copyright: `Copyright © ${new Date().getFullYear()} AutoGen Authors.`,
+ copyright: `Copyright © ${new Date().getFullYear()} AutoGen Authors | Privacy and Cookies`,
},
},
presets: [
[
- '@docusaurus/preset-classic',
+ "@docusaurus/preset-classic",
{
blog: {
showReadingTime: true,
@@ -113,24 +113,24 @@ module.exports = {
// Adjust any other blog settings as needed
},
docs: {
- sidebarPath: require.resolve('./sidebars.js'),
+ sidebarPath: require.resolve("./sidebars.js"),
// Please change this to your repo.
- editUrl:
- 'https://github.com/microsoft/autogen/edit/main/website/',
+ editUrl: "https://github.com/microsoft/autogen/edit/main/website/",
remarkPlugins: [math],
rehypePlugins: [katex],
},
theme: {
- customCss: require.resolve('./src/css/custom.css'),
+ customCss: require.resolve("./src/css/custom.css"),
},
},
],
],
stylesheets: [
{
- href: "https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css",
- integrity: "sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc",
- crossorigin: "anonymous",
+ href: "https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css",
+ integrity:
+ "sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc",
+ crossorigin: "anonymous",
},
],
@@ -142,7 +142,7 @@ module.exports = {
// ... Your options.
// `hashed` is recommended as long-term-cache of index file is possible.
hashed: true,
- blogDir:"./blog/"
+ blogDir: "./blog/",
// For Docs using Chinese, The `language` is recommended to set to:
// ```
// language: ["en", "zh"],
diff --git a/website/src/data/gallery.json b/website/src/data/gallery.json
index 9e2fe1e877ba..e8094e4769d3 100644
--- a/website/src/data/gallery.json
+++ b/website/src/data/gallery.json
@@ -112,7 +112,7 @@
},
{
"title": "AutoGen + Flowise = Super AI Agents on No-Code Platform",
- "link": "https://github.com/sugarforever/LangChain-Advanced/blob/main/Integrations/AutoGen/AutoGen_flowise_ai_agent.ipynb",
+ "link": "https://github.com/sugarforever/LangChain-Advanced/blob/main/Integrations/AutoGen/autogen_flowise_ai_agent.ipynb",
"description": "Build super AI agents on a no-code platform using AutoGen and Flowise.",
"image": "default.png",
"tags": ["app"]