From f224886734b1e6c31de5b2fd1a3fb12ef8640eed Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Mon, 29 Jan 2024 08:42:42 -0800
Subject: [PATCH 01/15] Prints the version of AutoGenBench from the command
 line, closing i1458

---
 .../tools/autogenbench/autogenbench/cli.py    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/cli.py b/samples/tools/autogenbench/autogenbench/cli.py
index dd0ebd70ea74..d6ff2bd7a73b 100644
--- a/samples/tools/autogenbench/autogenbench/cli.py
+++ b/samples/tools/autogenbench/autogenbench/cli.py
@@ -1,4 +1,5 @@
 import sys
+from .version import __version__
 from .run_cmd import run_cli
 from .clone_cmd import clone_cli
 from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
         args = sys.argv[:]  # Shallow copy
 
     invocation_cmd = "autogenbench"
+    version_string = f"AutoGenBench version {__version__}"
 
     commands = [
         {
@@ -26,6 +28,11 @@ def main(args=None):
             "description": "tabulate the results of a previous run",
             "function": tabulate_cli,
         },
+        {
+            "command": "--version",
+            "description": f"print the version of {invocation_cmd}",
+            "function": lambda _args: print(f"{version_string}"),
+        },
         {"command": "--help", "description": "print this message", "function": None},
     ]
 
@@ -40,6 +47,8 @@ def main(args=None):
         commands_details += f"    {padded_cmd}: {c['description']}\n"
 
     usage_text = f"""
+{version_string}
+
 usage: {invocation_cmd} COMMAND ARGS
 
 Where, COMMAND is one of: {commands_list}
@@ -49,6 +58,8 @@ def main(args=None):
 """.strip()
 
     help_text = f"""
+{version_string}
+
 usage: {invocation_cmd} COMMAND ARGS
 
 {invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typically session might resemble:
@@ -80,11 +91,15 @@ def main(args=None):
                 sys.stderr.write(help_text + "\n")
                 sys.exit(0)
             else:
-                command["function"]([invocation_cmd + " " + command["command"]] + args[2:])
+                command["function"](
+                    [invocation_cmd + " " + command["command"]] + args[2:]
+                )
                 sys.exit(0)
 
     # Command not found
-    sys.stderr.write(f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n")
+    sys.stderr.write(
+        f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n"
+    )
     sys.exit(2)
 
 

From 37f633f27c8152a5c1fd4e44a81a30645c17a763 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Mon, 29 Jan 2024 20:05:37 -0800
Subject: [PATCH 02/15] Added autogenbench version to timestamp.txt

---
 .../autogenbench/autogenbench/run_cmd.py      | 56 +++++++++++++++----
 .../autogenbench/template/testbed_utils.py    |  4 +-
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/run_cmd.py b/samples/tools/autogenbench/autogenbench/run_cmd.py
index c29f064d56e6..2886fa27d032 100644
--- a/samples/tools/autogenbench/autogenbench/run_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -11,6 +11,7 @@
 import random
 from autogen import config_list_from_json
 from autogen.oai.openai_utils import filter_config
+from .version import __version__
 
 # Figure out where everything is
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -138,7 +139,9 @@ def run_scenarios(
                 print(f"Running scenario {results_repetition}")
 
                 # Expand the scenario
-                expand_scenario(scenario_dir, instance, results_repetition, requirements)
+                expand_scenario(
+                    scenario_dir, instance, results_repetition, requirements
+                )
 
                 # Prepare the environment (keys/values that need to be added)
                 env = get_scenario_env(config_list)
@@ -168,10 +171,14 @@ def expand_scenario(scenario_dir, scenario, output_dir, requirements):
     template = scenario["template"]
 
     # Either key works for finding the substiturions list. "values" may be deprecated in the future
-    substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
+    substitutions = (
+        scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
+    )
 
     # Older versions are only one-level deep. Convert them,
-    if len(substitutions) > 0 and isinstance(substitutions[next(iter(substitutions))], str):
+    if len(substitutions) > 0 and isinstance(
+        substitutions[next(iter(substitutions))], str
+    ):
         substitutions = {"scenario.py": substitutions}
 
     copy_operations = []
@@ -210,14 +217,18 @@ def expand_scenario(scenario_dir, scenario, output_dir, requirements):
         else:
             if os.path.isdir(dest_path):
                 # If the destination is a directory, use the same filename
-                shutil.copyfile(src_path, os.path.join(dest_path, os.path.basename(src_path)))
+                shutil.copyfile(
+                    src_path, os.path.join(dest_path, os.path.basename(src_path))
+                )
             else:
                 # Otherwuse use the filename provided
                 shutil.copyfile(src_path, dest_path)
 
     # Copy the requirements file if specified
     if requirements is not None:
-        shutil.copyfile(requirements, pathlib.Path(os.path.join(output_dir, "requirements.txt")))
+        shutil.copyfile(
+            requirements, pathlib.Path(os.path.join(output_dir, "requirements.txt"))
+        )
 
     # Expand templated files
     for templated_file in substitutions.keys():  # Keys are relative file paths
@@ -278,7 +289,11 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
 
     # Navigate to the scenario
     os.chdir(work_dir)
-    print("\n\n" + os.getcwd() + "\n===================================================================")
+    print(
+        "\n\n"
+        + os.getcwd()
+        + "\n==================================================================="
+    )
 
     # Prepare the run script
     with open(os.path.join("run.sh"), "wt") as f:
@@ -286,6 +301,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
             f"""#
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Native"
+echo "autogenbench version: {__version__}" > timestamp.txt
 
 # Run the global init script if it exists
 if [ -f global_init.sh ] ; then
@@ -362,7 +378,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
         try:
             image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
         except docker.errors.ImageNotFound:
-            print(f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes...")
+            print(
+                f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes..."
+            )
             try:
                 build_default_docker_image(client, DEFAULT_DOCKER_IMAGE_TAG)
                 image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
@@ -387,7 +405,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
             f"""#
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Docker"
+
 umask 000
+echo "autogenbench version: {__version__}" > timestamp.txt
 
 # Run the global init script if it exists
 if [ -f global_init.sh ] ; then
@@ -429,7 +449,11 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 """
         )
 
-    print("\n\n" + work_dir + "\n===================================================================")
+    print(
+        "\n\n"
+        + work_dir
+        + "\n==================================================================="
+    )
 
     # Create and run the container
     abs_path = str(pathlib.Path(work_dir).absolute())
@@ -444,7 +468,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
     )
 
     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
-    docker_timeout = timeout + 60  # One full minute after the bash timeout command should have already triggered
+    docker_timeout = (
+        timeout + 60
+    )  # One full minute after the bash timeout command should have already triggered
     start_time = time.time()
     logs = container.logs(stream=True)
     log_file = open(os.path.join(work_dir, "console_log.txt"), "wt")
@@ -466,7 +492,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
             # but remember how we got here.
             stopping = True
 
-    if stopping:  # By this line we've exited the loop, and the container has actually stopped.
+    if (
+        stopping
+    ):  # By this line we've exited the loop, and the container has actually stopped.
         log_file.write("\nDocker timed out.\n")
         log_file.flush()
         sys.stdout.write("\nDocker timed out.\n")
@@ -571,12 +599,16 @@ def run_cli(args):
 
     # Don't allow both --docker-image and --native on the same command
     if parsed_args.docker_image is not None and parsed_args.native:
-        sys.exit("The options --native and --docker-image can not be used together. Exiting.")
+        sys.exit(
+            "The options --native and --docker-image can not be used together. Exiting."
+        )
 
     # Warn if running natively
     if parsed_args.native:
         if IS_WIN32:
-            sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")
+            sys.exit(
+                "Running scenarios with --native is not supported in Windows. Exiting."
+            )
 
         if parsed_args.requirements is not None:
             sys.exit("--requirements is not compatible with --native. Exiting.")
diff --git a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
index bb435c5536ca..d572c3d25d82 100644
--- a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
+++ b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
@@ -49,9 +49,9 @@ def init():
     """
 
     # Print some information about the run
-    with open("timestamp.txt", "wt") as f:
-        f.write("Timestamp: " + datetime.now().isoformat() + "\n")
+    with open("timestamp.txt", "at") as f:
         f.write("pyautogen version: " + str(autogen.__version__) + "\n")
+        f.write("Timestamp: " + datetime.now().isoformat() + "\n")
 
     # Start logging
     if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):

From faaf883adfec46a75afc8552e1f7bb268eead857 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Mon, 29 Jan 2024 20:08:42 -0800
Subject: [PATCH 03/15] Attempting to fix formatting.

---
 .../tools/autogenbench/autogenbench/cli.py    |  8 +--
 .../autogenbench/autogenbench/run_cmd.py      | 52 +++++--------------
 2 files changed, 14 insertions(+), 46 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/cli.py b/samples/tools/autogenbench/autogenbench/cli.py
index d6ff2bd7a73b..6b27a8aeba4e 100644
--- a/samples/tools/autogenbench/autogenbench/cli.py
+++ b/samples/tools/autogenbench/autogenbench/cli.py
@@ -91,15 +91,11 @@ def main(args=None):
                 sys.stderr.write(help_text + "\n")
                 sys.exit(0)
             else:
-                command["function"](
-                    [invocation_cmd + " " + command["command"]] + args[2:]
-                )
+                command["function"]([invocation_cmd + " " + command["command"]] + args[2:])
                 sys.exit(0)
 
     # Command not found
-    sys.stderr.write(
-        f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n"
-    )
+    sys.stderr.write(f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n")
     sys.exit(2)
 
 
diff --git a/samples/tools/autogenbench/autogenbench/run_cmd.py b/samples/tools/autogenbench/autogenbench/run_cmd.py
index 2886fa27d032..5e5472729840 100644
--- a/samples/tools/autogenbench/autogenbench/run_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -139,9 +139,7 @@ def run_scenarios(
                 print(f"Running scenario {results_repetition}")
 
                 # Expand the scenario
-                expand_scenario(
-                    scenario_dir, instance, results_repetition, requirements
-                )
+                expand_scenario(scenario_dir, instance, results_repetition, requirements)
 
                 # Prepare the environment (keys/values that need to be added)
                 env = get_scenario_env(config_list)
@@ -171,14 +169,10 @@ def expand_scenario(scenario_dir, scenario, output_dir, requirements):
     template = scenario["template"]
 
     # Either key works for finding the substiturions list. "values" may be deprecated in the future
-    substitutions = (
-        scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
-    )
+    substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
 
     # Older versions are only one-level deep. Convert them,
-    if len(substitutions) > 0 and isinstance(
-        substitutions[next(iter(substitutions))], str
-    ):
+    if len(substitutions) > 0 and isinstance(substitutions[next(iter(substitutions))], str):
         substitutions = {"scenario.py": substitutions}
 
     copy_operations = []
@@ -217,18 +211,14 @@ def expand_scenario(scenario_dir, scenario, output_dir, requirements):
         else:
             if os.path.isdir(dest_path):
                 # If the destination is a directory, use the same filename
-                shutil.copyfile(
-                    src_path, os.path.join(dest_path, os.path.basename(src_path))
-                )
+                shutil.copyfile(src_path, os.path.join(dest_path, os.path.basename(src_path)))
             else:
                 # Otherwuse use the filename provided
                 shutil.copyfile(src_path, dest_path)
 
     # Copy the requirements file if specified
     if requirements is not None:
-        shutil.copyfile(
-            requirements, pathlib.Path(os.path.join(output_dir, "requirements.txt"))
-        )
+        shutil.copyfile(requirements, pathlib.Path(os.path.join(output_dir, "requirements.txt")))
 
     # Expand templated files
     for templated_file in substitutions.keys():  # Keys are relative file paths
@@ -289,11 +279,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
 
     # Navigate to the scenario
     os.chdir(work_dir)
-    print(
-        "\n\n"
-        + os.getcwd()
-        + "\n==================================================================="
-    )
+    print("\n\n" + os.getcwd() + "\n===================================================================")
 
     # Prepare the run script
     with open(os.path.join("run.sh"), "wt") as f:
@@ -378,9 +364,7 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
         try:
             image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
         except docker.errors.ImageNotFound:
-            print(
-                f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes..."
-            )
+            print(f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes...")
             try:
                 build_default_docker_image(client, DEFAULT_DOCKER_IMAGE_TAG)
                 image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
@@ -449,11 +433,7 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 """
         )
 
-    print(
-        "\n\n"
-        + work_dir
-        + "\n==================================================================="
-    )
+    print("\n\n" + work_dir + "\n===================================================================")
 
     # Create and run the container
     abs_path = str(pathlib.Path(work_dir).absolute())
@@ -468,9 +448,7 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
     )
 
     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
-    docker_timeout = (
-        timeout + 60
-    )  # One full minute after the bash timeout command should have already triggered
+    docker_timeout = timeout + 60  # One full minute after the bash timeout command should have already triggered
     start_time = time.time()
     logs = container.logs(stream=True)
     log_file = open(os.path.join(work_dir, "console_log.txt"), "wt")
@@ -492,9 +470,7 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
             # but remember how we got here.
             stopping = True
 
-    if (
-        stopping
-    ):  # By this line we've exited the loop, and the container has actually stopped.
+    if stopping:  # By this line we've exited the loop, and the container has actually stopped.
         log_file.write("\nDocker timed out.\n")
         log_file.flush()
         sys.stdout.write("\nDocker timed out.\n")
@@ -599,16 +575,12 @@ def run_cli(args):
 
     # Don't allow both --docker-image and --native on the same command
     if parsed_args.docker_image is not None and parsed_args.native:
-        sys.exit(
-            "The options --native and --docker-image can not be used together. Exiting."
-        )
+        sys.exit("The options --native and --docker-image can not be used together. Exiting.")
 
     # Warn if running natively
     if parsed_args.native:
         if IS_WIN32:
-            sys.exit(
-                "Running scenarios with --native is not supported in Windows. Exiting."
-            )
+            sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")
 
         if parsed_args.requirements is not None:
             sys.exit("--requirements is not compatible with --native. Exiting.")

From 7cf509434ee9eaf34036c664241490f2c90ae75b Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Wed, 31 Jan 2024 02:49:53 +0000
Subject: [PATCH 04/15] Add a gitignore for autogenbench

---
 samples/tools/autogenbench/.gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 samples/tools/autogenbench/.gitignore

diff --git a/samples/tools/autogenbench/.gitignore b/samples/tools/autogenbench/.gitignore
new file mode 100644
index 000000000000..2eccb6f6c69f
--- /dev/null
+++ b/samples/tools/autogenbench/.gitignore
@@ -0,0 +1,3 @@
+scenarios/*/Downloads
+scenarios/*/Tasks
+*/Results

From 870cb5f3f33d5fcb0166f31a982866f1890b6de9 Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Wed, 31 Jan 2024 02:50:49 +0000
Subject: [PATCH 05/15] Generalize to read all template dirs from Templates

---
 .../scenarios/GAIA/Scripts/init_tasks.py       | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
index 3ff483af1817..9fb4dbaf3b75 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -62,7 +62,9 @@ def create_jsonl(name, tasks, files_dir, template):
                         "__FILE_NAME__": task["file_name"],
                         "__PROMPT__": task["Question"],
                     },
-                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
+                    "expected_answer.txt": {
+                        "__EXPECTED_ANSWER__": task["Final answer"]
+                    },
                 },
             }
 
@@ -77,7 +79,9 @@ def main():
     gaia_test_files = os.path.join(REPO_DIR, "2023", "test")
 
     if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
-        sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository.")
+        sys.exit(
+            f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository."
+        )
 
     # Load the GAIA data
     gaia_validation_tasks = [[], [], []]
@@ -97,10 +101,12 @@ def main():
 
             gaia_test_tasks[data["Level"] - 1].append(data)
 
-    templates = {
-        "two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
-        "soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
-    }
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[entry.name] = entry.path
 
     # Add coding directories if needed (these are usually empty and left out of the repo)
     for template in templates.values():

From 66b1f86147992b7e376e204d678fa8e51b9e9650 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Tue, 30 Jan 2024 21:49:54 -0800
Subject: [PATCH 06/15] AutoGenBench logs telemetry when available.

---
 .../autogenbench/template/testbed_utils.py    | 21 +++++++++++++++++--
 .../autogenbench/autogenbench/version.py      |  2 +-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
index d572c3d25d82..ee46bc6af5c8 100644
--- a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
+++ b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
@@ -6,6 +6,15 @@
 
 AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
 
+# Try importing the telemetry module (only available in some branches)
+TELEMETRY_ENABLED = False
+try:
+    import autogen.telemetry
+
+    TELEMETRY_ENABLED = True
+except ImportError:
+    pass
+
 
 def default_llm_config(config_list, timeout=180):
     """Return a default config list with a given timeout, and with caching disabled.
@@ -49,14 +58,18 @@ def init():
     """
 
     # Print some information about the run
-    with open("timestamp.txt", "at") as f:
-        f.write("pyautogen version: " + str(autogen.__version__) + "\n")
+    with open("timestamp.txt", "wt") as f:
         f.write("Timestamp: " + datetime.now().isoformat() + "\n")
+        f.write("pyautogen version: " + str(autogen.__version__) + "\n")
 
     # Start logging
     if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
         autogen.Completion.start_logging(compact=False)
 
+    # Start telemetry
+    if TELEMETRY_ENABLED:
+        autogen.telemetry.start_logging()
+
 
 def finalize(agents):
     """Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def messages_to_json(agent):
         with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
             fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
         autogen.Completion.stop_logging()
+
+    # Stop telemetry
+    if TELEMETRY_ENABLED:
+        autogen.telemetry.stop_logging()
diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
index ecbf4901d90d..ee7a05d76eb9 100644
--- a/samples/tools/autogenbench/autogenbench/version.py
+++ b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.1a12"
+__version__ = "0.0.1a13"

From fa3229e790a9f2296c037f45691715ba5b589db4 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Tue, 30 Jan 2024 22:24:35 -0800
Subject: [PATCH 07/15] Remove spaces if present from template names.

---
 .../tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
index 9fb4dbaf3b75..8eac5be88dbf 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -6,6 +6,7 @@
 import json
 import os
 import sys
+import re
 from huggingface_hub import snapshot_download
 
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -106,7 +107,7 @@ def main():
     templates = {}
     for entry in os.scandir(TEMPLATES_DIR):
         if entry.is_dir():
-            templates[entry.name] = entry.path
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     # Add coding directories if needed (these are usually empty and left out of the repo)
     for template in templates.values():

From d366cadc6f9b647992806c093f6f8c7aee88a12d Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Tue, 30 Jan 2024 22:28:21 -0800
Subject: [PATCH 08/15] Bump version.

---
 samples/tools/autogenbench/autogenbench/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
index ee7a05d76eb9..2700e7d2a00b 100644
--- a/samples/tools/autogenbench/autogenbench/version.py
+++ b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.1a13"
+__version__ = "0.0.2a1"

From 88ee79e1798fb8257a5f1e73d95626f16415cae2 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Tue, 30 Jan 2024 23:38:24 -0800
Subject: [PATCH 09/15] Fixed formatting.

---
 samples/tools/autogenbench/pyproject.toml                 | 5 +++++
 .../autogenbench/scenarios/GAIA/Scripts/init_tasks.py     | 8 ++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/samples/tools/autogenbench/pyproject.toml b/samples/tools/autogenbench/pyproject.toml
index 339217691d97..8cabc4b55e67 100644
--- a/samples/tools/autogenbench/pyproject.toml
+++ b/samples/tools/autogenbench/pyproject.toml
@@ -47,3 +47,8 @@ exclude = ["*.tests*"]
 
 [project.scripts]
 autogenbench = "autogenbench.cli:main"
+
+[tool.black]
+# https://github.com/psf/black
+line-length = 120
+exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
index 8eac5be88dbf..61fef86136d9 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -63,9 +63,7 @@ def create_jsonl(name, tasks, files_dir, template):
                         "__FILE_NAME__": task["file_name"],
                         "__PROMPT__": task["Question"],
                     },
-                    "expected_answer.txt": {
-                        "__EXPECTED_ANSWER__": task["Final answer"]
-                    },
+                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
                 },
             }
 
@@ -80,9 +78,7 @@ def main():
     gaia_test_files = os.path.join(REPO_DIR, "2023", "test")
 
     if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
-        sys.exit(
-            f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository."
-        )
+        sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository.")
 
     # Load the GAIA data
     gaia_validation_tasks = [[], [], []]

From 2c515728a52af59fbeff5596e82be4d3b54ac809 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Fri, 2 Feb 2024 23:41:23 -0800
Subject: [PATCH 10/15] Allow native warning to be skipped. Mount autogen repo
 in Docker if it can be found (experimental).

---
 .../autogenbench/autogenbench/run_cmd.py      | 101 ++++++++++++++----
 .../autogenbench/autogenbench/version.py      |   2 +-
 2 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/run_cmd.py b/samples/tools/autogenbench/autogenbench/run_cmd.py
index 5e5472729840..1e1c78f22fb1 100644
--- a/samples/tools/autogenbench/autogenbench/run_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -248,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
     Returns: A dictionary of keys and values that need to be added to the system environment.
     """
     env = dict()
+
+    # Populate with commonly needed keys
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key is not None and len(openai_api_key.strip()) > 0:
+        env["OPENAI_API_KEY"] = openai_api_key
+
+    bing_api_key = os.environ.get("BING_API_KEY")
+    if bing_api_key is not None and len(bing_api_key.strip()) > 0:
+        env["BING_API_KEY"] = bing_api_key
+
+    # Update with any values from the ENV.json file
     if os.path.isfile(env_file):
         with open(env_file, "rt") as fh:
-            env = json.loads(fh.read())
+            env.update(json.loads(fh.read()))
 
+    # Include the config_list that we are using
     config_list_json = json.dumps(config_list)
     env["OAI_CONFIG_LIST"] = config_list_json
 
-    openai_api_key = os.environ.get("OPENAI_API_KEY")
-    if openai_api_key is not None and len(openai_api_key.strip()) > 0:
-        env["OPENAI_API_KEY"] = openai_api_key
-
     return env
 
 
@@ -390,6 +398,11 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Docker"
 
+# If a read-only copy of the autogen repo is local, copy it to a writeable directory
+if [ -d /autogen.ro ] ; then
+    cp -R /autogen.ro /autogen
+fi
+
 umask 000
 echo "autogenbench version: {__version__}" > timestamp.txt
 
@@ -433,18 +446,31 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 """
         )
 
-    print("\n\n" + work_dir + "\n===================================================================")
+    # Figure out what folders to mount
+    volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
+
+    # Add the autogen repo if we can find it
+    autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
+    if autogen_repo_base is None:
+        autogen_repo_base = find_autogen_repo(os.getcwd())
+    elif not os.path.isdir(autogen_repo_base):
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
+
+    if autogen_repo_base is not None:
+        volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen.ro", "mode": "ro"}
+
+    print("Mounting:")
+    for k in volumes:
+        bind = volumes[k]["bind"]
+        mode = volumes[k]["mode"].upper()
+        if bind == "/workspace":
+            k = os.path.relpath(k)
+        print(f"[{mode}]\t'{k}' => '{bind}'")
+    print("===================================================================")
 
     # Create and run the container
-    abs_path = str(pathlib.Path(work_dir).absolute())
     container = client.containers.run(
-        image,
-        command=["sh", "run.sh"],
-        working_dir="/workspace",
-        environment=env,
-        detach=True,
-        # get absolute path to the working directory
-        volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
+        image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
     )
 
     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
@@ -489,6 +515,34 @@ def build_default_docker_image(docker_client, image_tag):
             sys.stdout.write(segment["stream"])
 
 
+def find_autogen_repo(path):
+    """
+    Search `path` and each of its parent directories for the root of the autogen repo.
+
+    Returns: the first directory containing autogen/agentchat/conversable_agent.py, otherwise None
+    """
+
+    # Normalize to an absolute directory path (a file path is replaced by its containing directory)
+    path = os.path.abspath(path)
+    if os.path.isfile(path):
+        path = os.path.dirname(path)
+
+    while True:
+        test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py")  # File that marks the repo root
+        if os.path.isfile(test_path):
+            return path
+
+        # Stop at the filesystem root, where a directory is its own parent
+        parent_dir = os.path.abspath(os.path.join(path, os.pardir))
+        if parent_dir == path:
+            break
+
+        # Not found at this level; continue the search one directory up
+        path = parent_dir
+
+    return None
+
+
 def run_cli(args):
     invocation_cmd = args[0]
     args = args[1:]
@@ -585,12 +639,23 @@ def run_cli(args):
         if parsed_args.requirements is not None:
             sys.exit("--requirements is not compatible with --native. Exiting.")
 
-        choice = input(
-            'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
+        sys.stderr.write(
+            "WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
         )
 
-        if choice.strip().lower() != "yes":
-            sys.exit("Received '" + choice + "'. Exiting.")
+        # Does an environment variable override the prompt?
+        allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
+        if allow_native is None or allow_native == "":
+            choice = input(
+                'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
+            )
+            if choice.strip().lower() != "yes":
+                sys.exit("Received '" + choice + "'. Exiting.")
+        elif allow_native.strip().lower() != "yes":
+            sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
+        else:
+            sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
+            time.sleep(0.75)  # Pause very briefly so the message isn't lost in the noise
 
     # Parse the subsample
     subsample = None
diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
index 2700e7d2a00b..4e56cb32e050 100644
--- a/samples/tools/autogenbench/autogenbench/version.py
+++ b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.2a1"
+__version__ = "0.0.2a2"

From 2a601274e27718322f866a303936d71b8489f532 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Sat, 3 Feb 2024 21:44:52 -0800
Subject: [PATCH 11/15] Native execution now occurs in a venv.

---
 .../autogenbench/autogenbench/run_cmd.py      | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/run_cmd.py b/samples/tools/autogenbench/autogenbench/run_cmd.py
index 1e1c78f22fb1..d4f6d3face3e 100644
--- a/samples/tools/autogenbench/autogenbench/run_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -297,6 +297,11 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
 export AUTOGEN_TESTBED_SETTING="Native"
 echo "autogenbench version: {__version__}" > timestamp.txt
 
+# Create and activate the virtual environment
+# This is called in a subprocess, and will not impact the parent
+{sys.executable} -m venv .autogenbench_venv
+. .autogenbench_venv/bin/activate
+
 # Run the global init script if it exists
 if [ -f global_init.sh ] ; then
     . ./global_init.sh
@@ -308,6 +313,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
 fi
 
 # Run the scenario
+pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
 timeout --preserve-status --kill-after {timeout  + 30}s {timeout}s python scenario.py
 EXIT_CODE=$?
@@ -322,6 +328,10 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
     rm -Rf .cache
 fi
 
+if [ -d __pycache__ ] ; then
+    rm -Rf __pycache__
+fi
+
 # Run the scenario finalize script if it exists
 if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
@@ -332,6 +342,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
     . ./global_finalize.sh
 fi
 
+# We don't need to deactivate the venv because it's
+# contained in the subprocess; but we should clean it up
+if [ -d .autogenbench_venv ] ; then
+    rm -Rf .autogenbench_venv
+fi
+
 echo RUN.SH COMPLETE !#!#
 """
         )
@@ -398,11 +414,6 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
 echo RUN.SH STARTING !#!#
 export AUTOGEN_TESTBED_SETTING="Docker"
 
-# If a read-only copy of the autogen repo is local, copy it to a writeable directory
-if [ -d /autogen.ro ] ; then
-    cp -R /autogen.ro /autogen
-fi
-
 umask 000
 echo "autogenbench version: {__version__}" > timestamp.txt
 
@@ -432,6 +443,10 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
     rm -Rf .cache
 fi
 
+if [ -d __pycache__ ] ; then
+    rm -Rf __pycache__
+fi
+
 # Run the scenario finalize script if it exists
 if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
@@ -457,7 +472,7 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
 
     if autogen_repo_base is not None:
-        volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen.ro", "mode": "ro"}
+        volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
 
     print("Mounting:")
     for k in volumes:

From 20e5f86a48b9d985e68e23a77c4f1145b651ccbc Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Sat, 3 Feb 2024 21:54:25 -0800
Subject: [PATCH 12/15] Bump version.

---
 samples/tools/autogenbench/autogenbench/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
index 4e56cb32e050..28eaadead633 100644
--- a/samples/tools/autogenbench/autogenbench/version.py
+++ b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.2a2"
+__version__ = "0.0.2a3"

From 98bc041587073986e09bcfcd79fbc6a89a0a6ac0 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Mon, 5 Feb 2024 13:01:41 -0800
Subject: [PATCH 13/15] Fixed a prompt escaping bug evident in GAIA task
 '6f37996b-2ac7-44b0-8e68-6d28256631b4'

---
 .../autogenbench/scenarios/GAIA/MANIFEST.json     |  4 +++-
 .../scenarios/GAIA/Scripts/init_tasks.py          |  2 +-
 .../GAIA/Templates/BasicTwoAgents/prompt.txt      |  1 +
 .../GAIA/Templates/BasicTwoAgents/scenario.py     |  8 +++++---
 .../GAIA/Templates/SocietyOfMind/prompt.txt       |  1 +
 .../GAIA/Templates/SocietyOfMind/scenario.py      | 15 ++++++++++-----
 6 files changed, 21 insertions(+), 10 deletions(-)
 create mode 100644 samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt
 create mode 100644 samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt

diff --git a/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json b/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json
index 807ec57bdc32..02c829f25c4b 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json
+++ b/samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json
@@ -4,9 +4,11 @@
         "Scripts/init_tasks.py": "Scripts/init_tasks.py",
         "Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
         "Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
+        "Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
         "Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
-        "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
         "Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
+        "Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
+        "Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
         "Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
     }
 }
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
index 61fef86136d9..61e2864a2534 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -61,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
                 "substitutions": {
                     "scenario.py": {
                         "__FILE_NAME__": task["file_name"],
-                        "__PROMPT__": task["Question"],
                     },
                     "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
+                    "prompt.txt": {"__PROMPT__": task["Question"]},
                 },
             }
 
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt
new file mode 100644
index 000000000000..482f50dca311
--- /dev/null
+++ b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/prompt.txt
@@ -0,0 +1 @@
+__PROMPT__
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
index 5ca7b0a28146..3f3f53f18f69 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
@@ -7,6 +7,10 @@
 testbed_utils.init()
 ##############################
 
+# Read the prompt
+PROMPT = ""
+with open("prompt.txt", "rt") as fh:
+    PROMPT = fh.read().strip()
 
 GAIA_SYSTEM_MESSAGE = (
     "You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@
 )
 
 filename = "__FILE_NAME__".strip()
-question = """
-__PROMPT__
-""".strip()
+question = PROMPT
 
 if len(filename) > 0:
     question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt
new file mode 100644
index 000000000000..482f50dca311
--- /dev/null
+++ b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/prompt.txt
@@ -0,0 +1 @@
+__PROMPT__
diff --git a/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py
index 129c898e47f7..bacd22e096cf 100644
--- a/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py
+++ b/samples/tools/autogenbench/scenarios/GAIA/Templates/SocietyOfMind/scenario.py
@@ -15,6 +15,11 @@
 testbed_utils.init()
 ##############################
 
+# Read the prompt
+PROMPT = ""
+with open("prompt.txt", "rt") as fh:
+    PROMPT = fh.read().strip()
+
 config_list = autogen.config_list_from_json(
     "OAI_CONFIG_LIST",
     filter_dict={"model": ["gpt-4"]},
@@ -46,9 +51,9 @@ def response_preparer(inner_messages):
     messages = [
         {
             "role": "user",
-            "content": """Earlier you were asked the following:
+            "content": f"""Earlier you were asked the following:
 
-__PROMPT__
+{PROMPT}
 
 Your team then worked diligently to address that request. Here is a transcript of that conversation:""",
         }
@@ -69,10 +74,10 @@ def response_preparer(inner_messages):
     messages.append(
         {
             "role": "user",
-            "content": """
+            "content": f"""
 Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
 
-__PROMPT__
+{PROMPT}
 
 To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -140,7 +145,7 @@ def response_preparer(inner_messages):
 question = f"""
 Below I will pose a question to you that I would like you to answer. You should begin by listing all the relevant facts necessary to derive an answer, then fill in those facts from memory where possible, including specific names, numbers and statistics. You are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from. After listing the facts, begin to solve the question in earnest. Here is the question:
 
-{filename_prompt}__PROMPT__
+{filename_prompt}{PROMPT}
 """.strip()
 
 groupchat = GroupChatModerator(

From 76703d4d2175632dec277a267c0386bbda4d51d4 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Mon, 5 Feb 2024 13:26:49 -0800
Subject: [PATCH 14/15] Updated all scenarios to use template discovery.

---
 .../scenarios/AutoGPT/Scripts/init_tasks.py   |  8 ++++-
 .../scenarios/HumanEval/Scripts/init_tasks.py | 33 +++++++++++--------
 .../scenarios/MATH/Scripts/init_tasks.py      | 12 ++++---
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py
index 00a6d15ef77f..2f5ba5f40e85 100644
--- a/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py
@@ -8,6 +8,7 @@
 import sys
 import glob
 import base64
+import re
 from huggingface_hub import snapshot_download
 
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):
 
 ###############################################################################
 def main():
-    templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     # Add coding directories if needed (these are usually empty and left out of the repo)
     for template in templates.values():
diff --git a/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py
index 799ac7b170ce..04480f5d2a9d 100644
--- a/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/HumanEval/Scripts/init_tasks.py
@@ -8,6 +8,7 @@
 import io
 import json
 import os
+import re
 import base64
 
 URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
@@ -16,7 +17,13 @@
 SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
 SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
 
+SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
+TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
+TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
+
 # A selected subset of HumanEval problems to work with during development
+
+# Deprecated 2/5/2024 -- Use subsample instead
 REDUCED_SET = [
     "HumanEval/2",
     "HumanEval/26",
@@ -73,19 +80,17 @@ def create_jsonl(name, tasks, template):
     """Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
 
     # Create a task directory if it doesn't exist
-    scenario_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
-    task_dir = os.path.join(scenario_dir, "Tasks")
-    if not os.path.isdir(task_dir):
-        os.mkdir(task_dir)
+    if not os.path.isdir(TASKS_DIR):
+        os.mkdir(TASKS_DIR)
 
     # Create the jsonl file
-    with open(os.path.join(task_dir, name + ".jsonl"), "wt") as fh:
+    with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
         for task in tasks:
             print(f"Converting: [{name}] {task['task_id']}")
 
             record = {
                 "id": task["task_id"].replace("/", "_"),
-                "template": os.path.join(os.path.pardir, template),
+                "template": template,
                 "substitutions": {
                     "scenario.py": {
                         "__ENTRY_POINT__": task["entry_point"],
@@ -102,19 +107,19 @@ def create_jsonl(name, tasks, template):
 ###############################################################################
 def main():
     human_eval = download_human_eval()
-    reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
+    # Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
 
-    templates = {
-        "two_agents": "Templates/TwoAgents",
-        # "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor",
-        # "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails",
-        # "gc4": "Templates/GroupChatFourAgents",
-    }
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     # Create the various combinations of [models] x [templates]
     for t in templates.items():
         create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
-        create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
+        # Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
 
 
 if __name__ == "__main__" and __package__ is None:
diff --git a/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py b/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py
index 16545c8e5d04..8b2d07995e2c 100644
--- a/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py
+++ b/samples/tools/autogenbench/scenarios/MATH/Scripts/init_tasks.py
@@ -8,6 +8,7 @@
 import io
 import json
 import os
+import re
 import sys
 
 URL = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
@@ -91,7 +92,7 @@ def create_jsonl(name, problems, template):
 
             record = {
                 "id": task_id,
-                "template": os.path.join(os.path.pardir, template),
+                "template": template,
                 "substitutions": {
                     "prompt.txt": {"__PROMPT__": data["problem"]},
                     "expected_answer.txt": {"__ANSWER__": data["solution"]},
@@ -105,9 +106,12 @@ def create_jsonl(name, problems, template):
 def main():
     problems = download_math()
 
-    templates = {
-        "two_agents": "Templates/TwoAgents",
-    }
+    # list all directories in the Templates directory
+    # and populate a dictionary with the name and path
+    templates = {}
+    for entry in os.scandir(TEMPLATES_DIR):
+        if entry.is_dir():
+            templates[re.sub(r"\s", "", entry.name)] = entry.path
 
     for t in templates.items():
         create_jsonl(f"math_{t[0]}", problems, t[1])

From 09237cc928997069155d695060035156622cb660 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Thu, 15 Feb 2024 22:28:19 -0800
Subject: [PATCH 15/15] Update with main version of runtime_logging.

---
 .../autogenbench/template/testbed_utils.py    | 20 +++++++++----------
 .../autogenbench/autogenbench/version.py      |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
index ee46bc6af5c8..a9a956158310 100644
--- a/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
+++ b/samples/tools/autogenbench/autogenbench/template/testbed_utils.py
@@ -6,12 +6,12 @@
 
 AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
 
-# Try importing the telemetry module (only available in some branches)
-TELEMETRY_ENABLED = False
+# Try importing the runtime_logging module (only available in some branches)
+LOGGING_ENABLED = False
 try:
-    import autogen.telemetry
+    import autogen.runtime_logging
 
-    TELEMETRY_ENABLED = True
+    LOGGING_ENABLED = True
 except ImportError:
     pass
 
@@ -66,9 +66,9 @@ def init():
     if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
         autogen.Completion.start_logging(compact=False)
 
-    # Start telemetry
-    if TELEMETRY_ENABLED:
-        autogen.telemetry.start_logging()
+    # Start logging
+    if LOGGING_ENABLED:
+        autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
 
 
 def finalize(agents):
@@ -103,6 +103,6 @@ def messages_to_json(agent):
             fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
         autogen.Completion.stop_logging()
 
-    # Start telemetry
-    if TELEMETRY_ENABLED:
-        autogen.telemetry.stop_logging()
+    # Stop logging
+    if LOGGING_ENABLED:
+        autogen.runtime_logging.stop()
diff --git a/samples/tools/autogenbench/autogenbench/version.py b/samples/tools/autogenbench/autogenbench/version.py
index 28eaadead633..5f0b332cb550 100644
--- a/samples/tools/autogenbench/autogenbench/version.py
+++ b/samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
-__version__ = "0.0.2a3"
+__version__ = "0.0.2a4"