Version 0.0.2 of AutoGenBench (#1548)

afourney · gagb · web-flow · commit 085bf6cf3d1f · 2024-02-24T18:12:57.000Z
* Prints the version of AutoGenBench from the command line, closing issue #1458

* Added autogenbench version to timestamp.txt

* Attempting to fix formatting.

* Add a gitignore for autogenbench

* Generalize to read all template dirs from Templates

* AutoGenBench logs telemetry when available.

* Remove spaces if present from template names.

* Bump version.

* Fixed formatting.

* Allow native warning to be skipped. Mount autogen repo in Docker if it can be found (experimental).

* Native execution now occurs in a venv.

* Bump version.

* Fixed a prompt escaping bug evident in GAIA task '6f37996b-2ac7-44b0-8e68-6d28256631b4'

* Updated all scenarios to use template discovery.

* Update with main version of runtime_logging.

---------

Co-authored-by: gagb <gagb@users.noreply.github.com>
diff --git a/samples/tools/autogenbench/.gitignore b/samples/tools/autogenbench/.gitignore
@@ -0,0 +1,3 @@
+scenarios/*/Downloads
+scenarios/*/Tasks
+*/Results
diff --git a/samples/tools/autogenbench/autogenbench/cli.py b/samples/tools/autogenbench/autogenbench/cli.py
@@ -1,4 +1,5 @@
 import sys
+from .version import __version__
 from .run_cmd import run_cli
 from .clone_cmd import clone_cli
 from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
         args = sys.argv[:]  # Shallow copy
 
     invocation_cmd = "autogenbench"
+    version_string = f"AutoGenBench version {__version__}"
 
     commands = [
         {
@@ -26,6 +28,11 @@ def main(args=None):
             "description": "tabulate the results of a previous run",
             "function": tabulate_cli,
         },
+        {
+            "command": "--version",
+            "description": f"print the version of {invocation_cmd}",
+            "function": lambda _args: print(f"{version_string}"),
+        },
         {"command": "--help", "description": "print this message", "function": None},
     ]
 
@@ -40,6 +47,8 @@ def main(args=None):
         commands_details += f"    {padded_cmd}: {c['description']}\n"
 
     usage_text = f"""
+{version_string}
+
 usage: {invocation_cmd} COMMAND ARGS
 
 Where, COMMAND is one of: {commands_list}
@@ -49,6 +58,8 @@ def main(args=None):
 """.strip()
 
     help_text = f"""
+{version_string}
+
 usage: {invocation_cmd} COMMAND ARGS
 
 {invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typically session might resemble:
diff --git a/samples/tools/autogenbench/autogenbench/run_cmd.py b/samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -11,6 +11,7 @@
 import random
 from autogen import config_list_from_json
 from autogen.oai.openai_utils import filter_config
+from .version import __version__
 
 # Figure out where everything is
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
     Returns: A dictionary of keys and values that need to be added to the system environment.
     """
     env = dict()
+
+    # Populate with commonly needed keys
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key is not None and len(openai_api_key.strip()) > 0:
+        env["OPENAI_API_KEY"] = openai_api_key
+
+    bing_api_key = os.environ.get("BING_API_KEY")
+    if bing_api_key is not None and len(bing_api_key.strip()) > 0:
+        env["BING_API_KEY"] = bing_api_key
+
+    # Update with any values from the ENV.json file
     if os.path.isfile(env_file):
         with open(env_file, "rt") as fh:
-            env = json.loads(fh.read())
+            env.update(json.loads(fh.read()))
 
+    # Include the config_list that we are using
     config_list_json = json.dumps(config_list)
     env["OAI_CONFIG_LIST"] = config_list_json
 
-    openai_api_key = os.environ.get("OPENAI_API_KEY")
-    if openai_api_key is not None and len(openai_api_key.strip()) > 0:
-        env["OPENAI_API_KEY"] = openai_api_key
-
     return env
 
 
@@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
             f"""#
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Native"
+echo "autogenbench version: {__version__}" > timestamp.txt
+
+# Create and activate the virtual environment
+# This is called in a subprocess, and will not impact the parent
+{sys.executable} -m venv .autogenbench_venv
+. .autogenbench_venv/bin/activate
 
 # Run the global init script if it exists
 if [ -f global_init.sh ] ; then
@@ -298,6 +313,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
 fi
 
 # Run the scenario
+pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
 timeout --preserve-status --kill-after {timeout  + 30}s {timeout}s python scenario.py
 EXIT_CODE=$?
@@ -312,6 +328,10 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
     rm -Rf .cache
 fi
 
+if [ -d __pycache__ ] ; then
+    rm -Rf __pycache__
+fi
+
 # Run the scenario finalize script if it exists
 if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
@@ -322,6 +342,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
     . ./global_finalize.sh
 fi
 
+# We don't need to deactivate the venv because it's
+# contained in the subprocess; but we should clean it up
+if [ -d .autogenbench_venv ] ; then
+    rm -Rf .autogenbench_venv
+fi
+
 echo RUN.SH COMPLETE !#!#
 """
         )
@@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
             f"""#
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Docker"
+
 umask 000
+echo "autogenbench version: {__version__}" > timestamp.txt
 
 # Run the global init script if it exists
 if [ -f global_init.sh ] ; then
@@ -415,6 +443,10 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
     rm -Rf .cache
 fi
 
+if [ -d __pycache__ ] ; then
+    rm -Rf __pycache__
+fi
+
 # Run the scenario finalize script if it exists
 if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
@@ -429,18 +461,31 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 """
         )
 
-    print("\n\n" + work_dir + "\n===================================================================")
+    # Figure out what folders to mount
+    volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
+
+    # Add the autogen repo if we can find it
+    autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
+    if autogen_repo_base is None:
+        autogen_repo_base = find_autogen_repo(os.getcwd())
+    elif not os.path.isdir(autogen_repo_base):
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
+
+    if autogen_repo_base is not None:
+        volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
+
+    print("Mounting:")
+    for k in volumes:
+        bind = volumes[k]["bind"]
+        mode = volumes[k]["mode"].upper()
+        if bind == "/workspace":
+            k = os.path.relpath(k)
+        print(f"[{mode}]\t'{k}' => '{bind}'")
+    print("===================================================================")
 
     # Create and run the container
-    abs_path = str(pathlib.Path(work_dir).absolute())
     container = client.containers.run(
-        image,
-        command=["sh", "run.sh"],
-        working_dir="/workspace",
-        environment=env,
-        detach=True,
-        # get absolute path to the working directory
-        volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
+        image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
     )
 
     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
@@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
             sys.stdout.write(segment["stream"])
 
 
+def find_autogen_repo(path):
+    """
+    Utility for identifying if the path is a subdirectory of the autogen repo.
+
+    Returns: the path to the root of the autogen repo if one is found, otherwise None
+    """
+
+    # Normalize the path (we expect a directory)
+    path = os.path.abspath(path)
+    if os.path.isfile(path):
+        path = os.path.dirname(path)
+
+    while True:
+        test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py")  # We found autogen
+        if os.path.isfile(test_path):
+            return path
+
+        # Stop if we hit the root
+        parent_dir = os.path.abspath(os.path.join(path, os.pardir))
+        if parent_dir == path:
+            break
+
+        # Keep searching
+        path = parent_dir
+
+    return None
+
+
 def run_cli(args):
     invocation_cmd = args[0]
     args = args[1:]
@@ -581,12 +654,23 @@ def run_cli(args):
         if parsed_args.requirements is not None:
             sys.exit("--requirements is not compatible with --native. Exiting.")
 
-        choice = input(
-            'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
+        sys.stderr.write(
+            "WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
         )
 
-        if choice.strip().lower() != "yes":
-            sys.exit("Received '" + choice + "'. Exiting.")
+        # Does an environment variable override the prompt?
+        allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
+        if allow_native is None or allow_native == "":
+            choice = input(
+                'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
+            )
+            if choice.strip().lower() != "yes":
+                sys.exit("Received '" + choice + "'. Exiting.")
+        elif allow_native.strip().lower() != "yes":
+            sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
+        else:
+            sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
+            time.sleep(0.75)  # Pause very briefly so the message isn't lost in the noise
 
     # Parse the subsample
     subsample = None
diff --git a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
@@ -6,6 +6,15 @@
 
 AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
 
+# Try importing the runtime_logging module (only available in some branches)
+LOGGING_ENABLED = False
+try:
+    import autogen.runtime_logging
+
+    LOGGING_ENABLED = True
+except ImportError:
+    pass
+
 
 def default_llm_config(config_list, timeout=180):
     """Return a default config list with a given timeout, and with caching disabled.
@@ -57,6 +66,10 @@ def init():
     if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
         autogen.Completion.start_logging(compact=False)
 
+    # Start logging
+    if LOGGING_ENABLED:
+        autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
+
 
 def finalize(agents):
     """Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def messages_to_json(agent):
         with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
             fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
         autogen.Completion.stop_logging()
+
+    # Stop logging
+    if LOGGING_ENABLED:
+        autogen.runtime_logging.stop()
diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.1"
+__version__ = "0.0.2a4"
diff --git a/samples/tools/autogenbench/pyproject.toml b/samples/tools/autogenbench/pyproject.toml
@@ -47,3 +47,8 @@ exclude = ["*.tests*"]
 
 [project.scripts]
 autogenbench = "autogenbench.cli:main"
+
+[tool.black]
+# https://github.com/psf/black
+line-length = 120
+exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
diff --git a/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py
@@ -8,6 +8,7 @@
 import sys
 import glob
 import base64
+import re
 from huggingface_hub import snapshot_download
 
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):
 
 ###############################################################################
 def main():
-    templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     # Add coding directories if needed (these are usually empty and left out of the repo)
     for template in templates.values():
diff --git a/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json b/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json
@@ -4,9 +4,11 @@
         "Scripts/init_tasks.py": "Scripts/init_tasks.py",
         "Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
         "Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
+        "Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
         "Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
-        "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
         "Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
+        "Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
+        "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
         "Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
     }
 }
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -6,6 +6,7 @@
 import json
 import os
 import sys
+import re
 from huggingface_hub import snapshot_download
 
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
                 "substitutions": {
                     "scenario.py": {
                         "__FILE_NAME__": task["file_name"],
-                        "__PROMPT__": task["Question"],
                     },
                     "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
+                    "prompt.txt": {"__PROMPT__": task["Question"]},
                 },
             }
 
@@ -97,10 +98,12 @@ def main():
 
             gaia_test_tasks[data["Level"] - 1].append(data)
 
-    templates = {
-        "two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
-        "soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
-    }
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     # Add coding directories if needed (these are usually empty and left out of the repo)
     for template in templates.values():
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt
@@ -0,0 +1 @@
+__PROMPT__
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
@@ -7,6 +7,10 @@
 testbed_utils.init()
 ##############################
 
+# Read the prompt
+PROMPT = ""
+with open("prompt.txt", "rt") as fh:
+    PROMPT = fh.read().strip()
 
 GAIA_SYSTEM_MESSAGE = (
     "You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@
 )
 
 filename = "__FILE_NAME__".strip()
-question = """
-__PROMPT__
-""".strip()
+question = PROMPT
 
 if len(filename) > 0:
     question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py
diff --git a/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py
diff --git a/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+scenarios/*/Downloads`
	`2`	`+scenarios/*/Tasks`
	`3`	`+*/Results`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.0.1"`
	`1`	`+__version__ = "0.0.2a4"`
Original file line number	Diff line number	Diff line change
`@@ -4,9 +4,11 @@`
`4`	`4`	`"Scripts/init_tasks.py": "Scripts/init_tasks.py",`
`5`	`5`	`"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",`
`6`	`6`	`"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",`
	`7`	`+ "Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",`
`7`	`8`	`"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",`
`8`		`- "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",`
`9`	`9`	`"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",`
	`10`	`+ "Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",`
	`11`	`+ "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",`
`10`	`12`	`"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"`
`11`	`13`	`}`
`12`	`14`	`}`