From 84897b46fcd2c766585910e95ada762f17c66270 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Thu, 16 Nov 2023 15:25:54 -0800 Subject: [PATCH 01/16] Re-added completion logging when using older versions of autogen. --- samples/tools/testbed/includes/testbed_utils.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/samples/tools/testbed/includes/testbed_utils.py b/samples/tools/testbed/includes/testbed_utils.py index de1f41fd6243..bb435c5536ca 100644 --- a/samples/tools/testbed/includes/testbed_utils.py +++ b/samples/tools/testbed/includes/testbed_utils.py @@ -4,6 +4,8 @@ import autogen import json +AUTOGEN_VERSION = packaging.version.parse(autogen.__version__) + def default_llm_config(config_list, timeout=180): """Return a default config list with a given timeout, and with caching disabled. @@ -21,11 +23,10 @@ def default_llm_config(config_list, timeout=180): } # Add options depending on the version - version = packaging.version.parse(autogen.__version__) - if version < packaging.version.parse("0.2.0b1"): + if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"): llm_config["request_timeout"] = timeout llm_config["use_cache"] = False - elif version < packaging.version.parse("0.2.0b4"): + elif AUTOGEN_VERSION < packaging.version.parse("0.2.0b4"): llm_config["timeout"] = timeout llm_config["cache"] = None else: @@ -52,6 +53,10 @@ def init(): f.write("Timestamp: " + datetime.now().isoformat() + "\n") f.write("pyautogen version: " + str(autogen.__version__) + "\n") + # Start logging + if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"): + autogen.Completion.start_logging(compact=False) + def finalize(agents): """Helper function to finalize logging in a testbed scenario. @@ -78,3 +83,9 @@ def messages_to_json(agent): fname = agent.name + "_messages.json" with open(os.path.join(script_dir, fname), "wt") as fh: fh.write(messages_to_json(agent)) + + # Stop logging, and write logs to disk + if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"): + with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh: + fh.write(json.dumps(autogen.Completion.logged_history, indent=4)) + autogen.Completion.stop_logging() From 014063ebb4a850bbce7bb051eed917c5d8e52159 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Sat, 18 Nov 2023 00:27:27 -0800 Subject: [PATCH 02/16] Extended scenario definitions and templating to include folders. 
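
For reference, a scenario definition in the new folder-based format pairs each templated file with its own find-and-replace dictionary (the full schema is described in the README changes below). A representative line, copied from the new default_two_agents_gpt4.jsonl:

    { "id": "two_agent_stocks", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } }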
--- samples/tools/testbed/README.md | 113 ++++++++++----- .../tools/testbed/includes/global_finalize.sh | 1 + samples/tools/testbed/includes/global_init.sh | 1 + samples/tools/testbed/run_scenarios.py | 129 ++++++++++++++---- .../scenarios/default_two_agents.jsonl | 6 - .../scenario.py} | 0 .../default_two_agents/scenario_finalize.sh | 1 + .../default_two_agents/scenario_init.sh | 1 + .../scenarios/default_two_agents_gpt35.jsonl | 3 + .../scenarios/default_two_agents_gpt4.jsonl | 3 + 10 files changed, 190 insertions(+), 68 deletions(-) create mode 100644 samples/tools/testbed/includes/global_finalize.sh create mode 100644 samples/tools/testbed/includes/global_init.sh delete mode 100644 samples/tools/testbed/scenarios/default_two_agents.jsonl rename samples/tools/testbed/scenarios/{default_two_agents.py => default_two_agents/scenario.py} (100%) create mode 100644 samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh create mode 100644 samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh create mode 100644 samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl create mode 100644 samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md index 52bdf21e4457..aa233c9764a8 100644 --- a/samples/tools/testbed/README.md +++ b/samples/tools/testbed/README.md @@ -77,17 +77,18 @@ Within each folder, you will find the following files: ## Scenario Templating -All scenarios are stored in JSONL files in the ``./scenarios'' directory. Each line of a scenario file is a JSON object with the following schema: +All scenarios are stored in JSONL files in the ``./scenarios'' directory. Each line of a scenario file is a JSON object. The schema varies slightly based on if "template" specifies a _file_ or a _directory_. +If "template" points to a _file_, the format is: ``` { "id": string, "template": filename, - "values" { - "field_name1": string, - "field_name2": string, + "substitutions" { + "find_string1": replace_string1, + "find_string2": replace_string2, ... - "field_nameN": string + "find_stringN": replace_stringN } } ``` @@ -98,48 +99,88 @@ For example: { "id": "two_agent_stocks_gpt4", "template": "default_two_agents.py", - "values": { + "substitutions": { "\__MODEL\__": "gpt-4", "\__PROMPT\__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } ``` -Where the ``id`` is the instance id used when saving results, ``template`` points to a python file that contains the scenario logic, and ``values`` contains a set of strings to find and replace when expanding the template. 
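
Illustratively, expansion is a plain, line-by-line find-and-replace over the template's text; the actual logic lives in `expand_file` within the `run_scenarios.py` changes below. A minimal sketch (for illustration only), applying the substitutions from the example above to the `default_two_agents.py` template:

```
# Sketch of template expansion (illustrative only; see expand_file in run_scenarios.py)
values = {"__MODEL__": "gpt-4"}
with open("default_two_agents.py", "rt") as fh:
    text = fh.read()
for k, v in values.items():
    text = text.replace(k, v)  # e.g., ["__MODEL__"] becomes ["gpt-4"]
```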
-An example templated python file is:
+If "template" points to a _directory_, the format is:
 
 ```
-from autogen import AssistantAgent, UserProxyAgent, config_list_from_json
-import os
-import json
-import testbed_utils
-
-testbed_utils.init()
-##############################
-
-config_list = config_list_from_json(
-    "OAI_CONFIG_LIST", filter_dict={"model": ["\__MODEL\__"]},
-)
-
-assistant = AssistantAgent("assistant", llm_config={
-    "request_timeout": 180,
-    "config_list": config_list}
-)
-user_proxy = UserProxyAgent("user_proxy",
-    human_input_mode="NEVER",
-    code_execution_config={
-        "work_dir": "coding",
-        "use_docker": False,
-    },
-    max_consecutive_auto_reply=10)
-user_proxy.initiate_chat(assistant, message="\__PROMPT\__")
-
-
-##############################
-testbed_utils.finalize(assistant, user_proxy)
+{
+    "id": string,
+    "template": dirname,
+    "substitutions": {
+        "filename1": {
+            "find_string1_1": replace_string1_1,
+            "find_string1_2": replace_string1_2,
+            ...
+            "find_string1_M": replace_string1_M
+        },
+        "filename2": {
+            "find_string2_1": replace_string2_1,
+            "find_string2_2": replace_string2_2,
+            ...
+            "find_string2_N": replace_string2_N
+        }
+    }
+}
+```
+
+For example:
+
+```
+{
+    "id": "two_agent_stocks_gpt4",
+    "template": "default_two_agents",
+    "substitutions": {
+        "scenario.py": {
+            "\__MODEL\__": "gpt-4"
+        },
+        "prompt.txt": {
+            "\__PROMPT\__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD."
+        }
+    }
+}
 ```
 
+In this example, the string `__MODEL__` will be replaced in the file `scenario.py`, while the string `__PROMPT__` will be replaced in the `prompt.txt` file.
+
+
+## Scenario Expansion Algorithm
+
+When the Testbed runs a scenario, it creates a local folder to share with Docker. As noted above, each instance and repetition gets its own folder along the path: ``./results/[scenario]/[instance_id]/[repetition]``
+
+For the sake of brevity, we will refer to this folder as the `DEST_FOLDER`.
+
+The algorithm for populating the `DEST_FOLDER` is as follows:
+
+1. Recursively copy the contents of `./includes` to `DEST_FOLDER`. This folder contains all the basic starter files for running a scenario, including an ENV file which will set the Docker environment variables.
+2. Append the OAI_CONFIG_LIST to the ENV file so that autogen may access these secrets.
+3. Recursively copy the scenario folder (if `template` in the JSON scenario definition points to a folder) to `DEST_FOLDER`. If the `template` instead points to a file, copy the file, but rename it to `scenario.py`.
+4. Apply any templating, as outlined in the prior section.
+5. Write a run.sh file to `DEST_FOLDER` that will be executed by Docker when it is loaded.
+
+
+## Scenario Execution Algorithm
+
+Once the scenario has been expanded, it is run (via run.sh). The script executes the following steps:
+
+1. Read and set the ENV environment variables
+2. If a file named `global_init.sh` is present, run it.
+3. If a file named `scenario_init.sh` is present, run it.
+4. Install the requirements file (if running in Docker)
+5. Run the Autogen scenario via `python scenario.py`
+6. Clean up (delete cache, etc.)
+7. If a file named `scenario_finalize.sh` is present, run it.
+8. If a file named `global_finalize.sh` is present, run it.
+9. echo "SCENARIO COMPLETE !#!#", signaling that all steps completed.
+
+Notably, this means that scenarios can add custom init and teardown logic by including `scenario_init.sh` and `scenario_finalize.sh` files, as illustrated below.
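
For instance, a hypothetical scenario might pre-seed its working directory before the run and archive artifacts afterwards. The hook file names below are the real ones; their contents are invented for illustration:

```
# scenario_init.sh (hypothetical contents)
mkdir -p coding
echo "seed data" > coding/seed.txt
```

```
# scenario_finalize.sh (hypothetical contents)
tar -czf artifacts.tar.gz coding
```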
 ## (Example) Running HumanEval

diff --git a/samples/tools/testbed/includes/global_finalize.sh b/samples/tools/testbed/includes/global_finalize.sh
new file mode 100644
index 000000000000..c5d6f5cab238
--- /dev/null
+++ b/samples/tools/testbed/includes/global_finalize.sh
@@ -0,0 +1 @@
+# Global finalize.
diff --git a/samples/tools/testbed/includes/global_init.sh b/samples/tools/testbed/includes/global_init.sh
new file mode 100644
index 000000000000..4815212dbb9e
--- /dev/null
+++ b/samples/tools/testbed/includes/global_init.sh
@@ -0,0 +1 @@
+echo AUTOGEN_TESTBED_SETTING: [$AUTOGEN_TESTBED_SETTING]
diff --git a/samples/tools/testbed/run_scenarios.py b/samples/tools/testbed/run_scenarios.py
index 53b561501a35..3ecaaae3266f 100644
--- a/samples/tools/testbed/run_scenarios.py
+++ b/samples/tools/testbed/run_scenarios.py
@@ -13,7 +13,7 @@
 IS_WIN32 = sys.platform == "win32"
 
 # Location of the global includes dir. The contents of this directory will be copied to the Docker environment.
-INCLUDES_DIR = "includes"
+GLOBAL_INCLUDES_DIR = "includes"
 
 
 def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, results_dir="results"):
@@ -61,10 +61,7 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
         for line in fh:
             instance = json.loads(line)
 
-            scenario_name + "_" + instance["id"]
-
             # Create a folder to store the results
-            # Results base
             if not os.path.isdir(results_dir):
                 os.mkdir(results_dir)
 
@@ -74,7 +71,7 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
             if not os.path.isdir(results_scenario):
                 os.mkdir(results_scenario)
 
-            # Results fot the instance
+            # Results for the instance
             results_instance = os.path.join(results_scenario, instance["id"])
             if not os.path.isdir(results_instance):
                 os.mkdir(results_instance)
@@ -89,23 +86,17 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
                 continue
             print(f"Running scenario {results_repetition}")
 
-            # Create the folder, and copy the script to a standard name
-            os.mkdir(results_repetition)
-            expand_scenario(scenario_dir, instance, os.path.join(results_repetition, "scenario.py"))
-
-            # Also copy the contents of INCLUDES_DIR
-            for item in os.listdir(INCLUDES_DIR):
-                if item.endswith(".example"):
-                    continue
-                item_path = os.path.join(INCLUDES_DIR, item)
-                if os.path.isfile(item_path):
-                    shutil.copyfile(item_path, os.path.join(results_repetition, item))
+            # Copy the contents of GLOBAL_INCLUDES_DIR to the results_repetition dir
+            shutil.copytree(GLOBAL_INCLUDES_DIR, results_repetition, ignore=shutil.ignore_patterns("*.example"))
 
             # Append the config list to the ENV file
             config_list_json = json.dumps(config_list)
             with open(os.path.join(results_repetition, "ENV"), "at") as fh:
                 fh.write(f"export OAI_CONFIG_LIST='{config_list_json}'\n")
 
+            # Expand the scenario
+            expand_scenario(scenario_dir, instance, results_repetition)
+
             # Run the scenario
             if is_native:
                 run_scenario_natively(results_repetition)
@@ -113,14 +104,40 @@
                 run_scenario_in_docker(results_repetition, requirements)
 
 
-def expand_scenario(scenario_dir, scenario, output_file):
-    template_fh = open(os.path.join(scenario_dir, scenario["template"]), "rt")
+def expand_scenario(scenario_dir, scenario, output_dir):
+    """
+    Expand a scenario into a folder.
+    """
+
+    template_path = os.path.join(scenario_dir, scenario["template"])
+
+    # Either key works for finding the substitutions list. 
"values" may be deprecated in the future + substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"] + + # If the template is a folder, copy the tree, and treat the substitutions dictionary + # as nested [file]->[find_str]->[replace_str]. + if os.path.isdir(template_path): + shutil.copytree(template_path, output_dir, dirs_exist_ok=True) + for templated_file in substitutions.keys(): # Keys are relative file paths + expand_file( + os.path.join(template_path, templated_file), + os.path.join(output_dir, templated_file), + substitutions[templated_file], + ) + else: + expand_file(template_path, os.path.join(output_dir, "scenario.py"), substitutions) + + +def expand_file(template_file, output_file, values): + """ + Expands a template to a file. + """ + template_fh = open(template_file, "rt") output_fh = open(output_file, "wt") for line in template_fh: - if "values" in scenario: - for k, v in scenario["values"].items(): - line = line.replace(k, v) + for k, v in values.items(): + line = line.replace(k, v) output_fh.write(line) template_fh.close() @@ -146,8 +163,40 @@ def run_scenario_natively(work_dir): with open(os.path.join("run.sh"), "wt") as f: f.write( """# +export AUTOGEN_TESTBED_SETTING="Native" + +# Read the environment variables . ./ENV + +# Run the global init script if it exists +if [ -f global_init.sh ] ; then + . ./global_init.sh +fi + +# Run the scenario init script if it exists +if [ -f scenario_init.sh ] ; then + . ./scenario_init.sh +fi + +# Run the scenario python scenario.py + +# Clean up +rm ENV +if [ -d .cache ] ; then + rm -Rf .cache +fi + +# Run the scenario finalize script if it exists +if [ -f scenario_init.sh ] ; then + . ./scenario_finalize.sh +fi + +# Run the global finalize script if it exists +if [ -f scenario_init.sh ] ; then + . ./global_finalize.sh +fi + echo SCENARIO COMPLETE !#!# """ ) @@ -192,14 +241,42 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600): with open(os.path.join(work_dir, "run.sh"), "wt", newline="\n") as f: f.write( f"""# +export AUTOGEN_TESTBED_SETTING="Docker" umask 000 + +# Read the environment variables . ./ENV + +# Run the global init script if it exists +if [ -f global_init.sh ] ; then + . ./global_init.sh +fi + +# Run the scenario init script if it exists +if [ -f scenario_init.sh ] ; then + . ./scenario_init.sh +fi + +# Run the scenario pip install -r {requirements} python scenario.py + +# Clean up rm ENV if [ -d .cache ] ; then - chmod -R a+rw .cache + rm -Rf .cache fi + +# Run the scenario finalize script if it exists +if [ -f scenario_init.sh ] ; then + . ./scenario_finalize.sh +fi + +# Run the global finalize script if it exists +if [ -f scenario_init.sh ] ; then + . ./global_finalize.sh +fi + echo SCENARIO COMPLETE !#!# """ ) @@ -270,7 +347,7 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600): "--requirements", type=str, help="The requirements file to pip install before running the scenario. This file must be found in the '" - + INCLUDES_DIR + + GLOBAL_INCLUDES_DIR + "' directory. 
(default: requirements.txt)",
         default=None,
     )
@@ -313,13 +390,13 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
         import docker
 
         # Make sure the requirements file exists
-        req_file = os.path.join(INCLUDES_DIR, requirements)
+        req_file = os.path.join(GLOBAL_INCLUDES_DIR, requirements)
         if not os.path.isfile(req_file):
             raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), req_file)
 
         # Warn about a common error
-        env_file = os.path.join(INCLUDES_DIR, "ENV")
-        example_file = os.path.join(INCLUDES_DIR, "ENV.example")
+        env_file = os.path.join(GLOBAL_INCLUDES_DIR, "ENV")
+        example_file = os.path.join(GLOBAL_INCLUDES_DIR, "ENV.example")
         if not os.path.isfile(env_file):
             shutil.copyfile(example_file, env_file)
         sys.stderr.write(
diff --git a/samples/tools/testbed/scenarios/default_two_agents.jsonl b/samples/tools/testbed/scenarios/default_two_agents.jsonl
deleted file mode 100644
index 4da04167fa62..000000000000
--- a/samples/tools/testbed/scenarios/default_two_agents.jsonl
+++ /dev/null
@@ -1,6 +0,0 @@
-{ "id": "two_agent_stocks_gpt4", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } }
-{ "id": "two_agent_stocks_gpt35", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } }
-{ "id": "two_agent_arxiv_search_gpt4", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } }
-{ "id": "two_agent_arxiv_search_gpt35", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } }
-{ "id": "two_agent_mslogo_search_gpt4", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } }
-{ "id": "two_agent_mslogo_search_gpt35", "template": "default_two_agents.py", "values": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with the API key stored in os.environ['BING_API_KEY']" } }
diff --git a/samples/tools/testbed/scenarios/default_two_agents.py b/samples/tools/testbed/scenarios/default_two_agents/scenario.py
similarity index 100%
rename from samples/tools/testbed/scenarios/default_two_agents.py
rename to samples/tools/testbed/scenarios/default_two_agents/scenario.py
diff --git a/samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh b/samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh
new file mode 100644
index 000000000000..bc6f402e1f56
--- /dev/null
+++ b/samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh
@@ -0,0 +1 @@
+#Scenario finalize. 
diff --git a/samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh b/samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh new file mode 100644 index 000000000000..92ee66e16848 --- /dev/null +++ b/samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh @@ -0,0 +1 @@ +#Scenario Init. diff --git a/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl new file mode 100644 index 000000000000..15884c3a2705 --- /dev/null +++ b/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl @@ -0,0 +1,3 @@ +{ "id": "two_agent_stocks", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } +{ "id": "two_agent_arxiv_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } +{ "id": "two_agent_mslogo_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } diff --git a/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl new file mode 100644 index 000000000000..c774f73f405e --- /dev/null +++ b/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl @@ -0,0 +1,3 @@ +{ "id": "two_agent_stocks", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } +{ "id": "two_agent_arxiv_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } +{ "id": "two_agent_mslogo_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } From 81c62c900bd41b896fd03b168a31d06d5788d9a2 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Sat, 18 Nov 2023 09:05:43 -0800 Subject: [PATCH 03/16] Prepare collate_human_eval.py for working with group chat scenarios. 
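
For context, the updated collation counts conversational turns of the form "sender (to receiver):" rather than only "assistant (to user_proxy):". A quick sketch of the new counting logic (the console-log text here is invented for illustration):

    import re

    content = "\nassistant (to user_proxy):\nhi\n\nuser_proxy (to assistant):\nok\n"
    print(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content)))  # prints 2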
--- samples/tools/testbed/utils/collate_human_eval.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/samples/tools/testbed/utils/collate_human_eval.py b/samples/tools/testbed/utils/collate_human_eval.py index ed83bb22bbfd..19dcaac2774a 100644 --- a/samples/tools/testbed/utils/collate_human_eval.py +++ b/samples/tools/testbed/utils/collate_human_eval.py @@ -1,11 +1,7 @@ import os -import errno -import shutil -import subprocess import json +import re import sys -import time -import pathlib import argparse @@ -34,9 +30,8 @@ def collate(results_dir): with open(console_log, "rt") as fh: content = fh.read() if "ALL TESTS PASSED !#!#" in content: - results.append( - str(content.count("assistant (to user_proxy):")) - ) # The number of assistant replies (which is also equal to the number of GPT calls in this case) + # Ideally we would have a more distinctive pattern. + results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content)))) else: results.append("-1") From 6f0a45f429f502b723d7a1a7d7146384838a3e91 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Sun, 19 Nov 2023 22:35:58 -0800 Subject: [PATCH 04/16] Converted HumanEval to the folder-based approach, and added GroupChat scenarios. --- samples/tools/testbed/README.md | 23 ++- .../tools/testbed/includes/requirements.txt | 2 +- .../Templates/ThreeAgents/scenario.py | 63 ++++++++ .../Templates/TwoAgents}/scenario.py | 7 +- .../Templates/TwoAgents}/scenario_finalize.sh | 0 .../Templates/TwoAgents}/scenario_init.sh | 0 .../Examples/default_three_agents_gpt35.jsonl | 1 + .../Examples/default_three_agents_gpt4.jsonl | 1 + .../Examples/default_two_agents_gpt35.jsonl | 3 + .../Examples/default_two_agents_gpt4.jsonl | 3 + .../testbed/scenarios/HumanEval/README.md | 1 + .../GroupChatFourAgents/coding/my_tests.py | 10 ++ .../Templates/GroupChatFourAgents/prompt.txt | 1 + .../Templates/GroupChatFourAgents/scenario.py | 115 +++++++++++++++ .../coding/my_tests.py | 10 ++ .../prompt.txt | 1 + .../scenario.py | 93 ++++++++++++ .../coding/my_tests.py | 10 ++ .../prompt.txt | 1 + .../scenario.py | 110 ++++++++++++++ .../Templates/TwoAgents/coding/my_tests.py | 10 ++ .../HumanEval/Templates/TwoAgents/prompt.txt | 1 + .../Templates/TwoAgents/scenario.py} | 39 ++--- .../scenarios/default_two_agents_gpt35.jsonl | 3 - .../scenarios/default_two_agents_gpt4.jsonl | 3 - .../tools/testbed/utils/download_humaneval.py | 136 ++++++++++++------ 26 files changed, 557 insertions(+), 90 deletions(-) create mode 100644 samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py rename samples/tools/testbed/scenarios/{default_two_agents => Examples/Templates/TwoAgents}/scenario.py (91%) rename samples/tools/testbed/scenarios/{default_two_agents => Examples/Templates/TwoAgents}/scenario_finalize.sh (100%) rename samples/tools/testbed/scenarios/{default_two_agents => Examples/Templates/TwoAgents}/scenario_init.sh (100%) create mode 100644 samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl create mode 100644 samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl create mode 100644 samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl create mode 100644 samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl create mode 100644 samples/tools/testbed/scenarios/HumanEval/README.md create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/coding/my_tests.py create mode 100644 
samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/prompt.txt
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/scenario.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/coding/my_tests.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/prompt.txt
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/scenario.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/coding/my_tests.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/prompt.txt
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/scenario.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/coding/my_tests.py
 create mode 100644 samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/prompt.txt
 rename samples/tools/testbed/scenarios/{human_eval_two_agents.py => HumanEval/Templates/TwoAgents/scenario.py} (62%)
 delete mode 100644 samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl
 delete mode 100644 samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl

diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md
index aa233c9764a8..5347960414c9 100644
--- a/samples/tools/testbed/README.md
+++ b/samples/tools/testbed/README.md
@@ -2,7 +2,7 @@
 
 The Autogen Testbed environment is a tool for repeatedly running a set of pre-defined Autogen scenarios in a setting with tightly-controlled initial conditions. With each run, Autogen will start from a blank slate, working out what code needs to be written, and what libraries or dependencies to install. The results of each run are logged, and can be ingested by analysis or metrics scripts (see the HumanEval example later in this README). By default, all runs are conducted in freshly-initialized docker containers, providing the recommended level of consistency and safety.
 
-This Testbed sample has been tested in, and is known to work with, Autogen versions 0.1.14 and 0.2.0b5
+This Testbed sample has been tested in, and is known to work with, Autogen versions 0.1.14 and 0.2.0b6
 
 ## Setup
 
@@ -17,11 +17,10 @@ The Testbed also requires Docker (Desktop or Engine) AND the __python docker__ l
 ## Running the Testbed
 
 To run the Testbed, simply execute
-``python run_scenarios.py``
-
-The default it to repeat this scenario 10 times. This can be costly. To run each scenario only once, use:
-``python run_scenarios.py --repeat 1``
+``python run_scenarios.py scenarios/Examples``
 
+The default is to run each scenario once. To run each scenario 10 times, use:
+``python run_scenarios.py --repeat 10 scenarios/Examples``
 
 The run_scenarios.py script also allows a number of command-line arguments to control various parameters of execution. Type ``python run_scenarios.py -h`` to explore these options:
 
@@ -58,26 +57,26 @@ By default, the Testbed stores results in a folder heirarchy with the following
 
 For example, consider the following folders:
 
-``./results/default_two_agents/two_agent_stocks_gpt4/0``
-``./results/default_two_agents/two_agent_stocks_gpt4/1``
+``./results/default_two_agents_gpt35/two_agent_stocks/0``
+``./results/default_two_agents_gpt35/two_agent_stocks/1``
 ...
-``./results/default_two_agents/two_agent_stocks_gpt4/9``
+``./results/default_two_agents_gpt35/two_agent_stocks/9``
 
-This folder holds the results for the ``two_agent_stocks_gpt4`` instance of the ``default_two_agents`` scenario. The ``0`` folder contains the results of the first run. The ``1`` folder contains the results of the second run, and so on. You can think of the _instance_ as mapping to a prompt, or a unique set of parameters, while the _scenario_ defines the template in which those parameters are input.
+This folder holds the results for the ``two_agent_stocks`` instance of the ``default_two_agents_gpt35`` scenario. The ``0`` folder contains the results of the first run. The ``1`` folder contains the results of the second run, and so on. You can think of the _instance_ as mapping to a prompt, or a unique set of parameters, while the _scenario_ defines the template in which those parameters are input.
 
 Within each folder, you will find the following files:
 
 - *timestamp.txt*: records the date and time of the run, along with the version of the pyautogen library installed
 - *console_log.txt*: all console output produced by Docker when running autogen. Read this like you would a regular console.
-- *chat_completions.json*: a log of all OpenAI ChatCompletions, as logged by ``autogen.ChatCompletion.start_logging(compact=False)``
+- *chat_completions.json*: a log of all OpenAI ChatCompletions, as logged by `autogen.ChatCompletion.start_logging(compact=False)`
 - *[agent]_messages.json*: for each Agent, a log of their messages dictionaries
 - *./coding*: A directory containing all code written by Autogen, and all artifacts produced by that code.
 
 
 ## Scenario Templating
 
-All scenarios are stored in JSONL files in the ``./scenarios'' directory. Each line of a scenario file is a JSON object with the following schema:
+All scenarios are stored in JSONL files (in subdirectories under `./scenarios`). Each line of a scenario file is a JSON object. The schema varies slightly based on whether "template" specifies a _file_ or a _directory_.
If "template" points to a _file_, the format is: ``` @@ -190,7 +189,7 @@ Accessing this scenario-type requires downloading and converting the HumanEval d ``` python utils/download_humaneval.py -python ./run_scenarios.py --repeat 3 scenarios/human_eval_two_agents_gpt35.jsonl +python ./run_scenarios.py scenarios/HumanEval/human_eval_two_agents_gpt35.jsonl python utils/collate_human_eval.py ./results/human_eval_two_agents_gpt35 | python utils/metrics_human_eval.py > human_eval_results_gpt35.csv cat human_eval_results_gpt35.csv ``` diff --git a/samples/tools/testbed/includes/requirements.txt b/samples/tools/testbed/includes/requirements.txt index 46ad1e009ca1..8f88664e8bba 100644 --- a/samples/tools/testbed/includes/requirements.txt +++ b/samples/tools/testbed/includes/requirements.txt @@ -1 +1 @@ -pyautogen +git+https://github.com/microsoft/autogen.git diff --git a/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py b/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py new file mode 100644 index 000000000000..56f72eeaa0e5 --- /dev/null +++ b/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py @@ -0,0 +1,63 @@ +import os +import json +import autogen +import testbed_utils + +testbed_utils.init() +############################## + +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["__MODEL__"]}, +) + +assistant = autogen.AssistantAgent( + "assistant", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +user_proxy = autogen.UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + code_execution_config={ + "work_dir": "coding", + "use_docker": False, + }, + max_consecutive_auto_reply=10, + default_auto_reply="TERMINATE", +) + +third_agent = autogen.AssistantAgent( + "__3RD_AGENT_NAME__", + system_message=""" +__3RD_AGENT_PROMPT__ +""".strip(), + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +groupchat = autogen.GroupChat( + agents=[user_proxy, assistant, third_agent], + messages=[], + speaker_selection_method="__SELECTION_METHOD__", + allow_repeat_speaker=False, + max_round=12, +) + +manager = autogen.GroupChatManager( + groupchat=groupchat, + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +user_proxy.initiate_chat( + manager, + message=""" +__PROMPT__ +""".strip(), +) + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy, third_agent, manager]) diff --git a/samples/tools/testbed/scenarios/default_two_agents/scenario.py b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py similarity index 91% rename from samples/tools/testbed/scenarios/default_two_agents/scenario.py rename to samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py index a24bef83ebf5..a445c68523da 100644 --- a/samples/tools/testbed/scenarios/default_two_agents/scenario.py +++ b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py @@ -27,7 +27,12 @@ max_consecutive_auto_reply=10, default_auto_reply="TERMINATE", ) -user_proxy.initiate_chat(assistant, message="__PROMPT__") +user_proxy.initiate_chat( + assistant, + message=""" +__PROMPT__ 
+""".strip(), +) ############################## diff --git a/samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario_finalize.sh similarity index 100% rename from samples/tools/testbed/scenarios/default_two_agents/scenario_finalize.sh rename to samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario_finalize.sh diff --git a/samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario_init.sh similarity index 100% rename from samples/tools/testbed/scenarios/default_two_agents/scenario_init.sh rename to samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario_init.sh diff --git a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl new file mode 100644 index 000000000000..656fab6760cd --- /dev/null +++ b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl @@ -0,0 +1 @@ +{ "id": "two_agent_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax error or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriated filtered, aggregated, or grouped if needed? If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } } diff --git a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl new file mode 100644 index 000000000000..a2b9dccd2e42 --- /dev/null +++ b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl @@ -0,0 +1 @@ +{ "id": "two_agent_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax error or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriated filtered, aggregated, or grouped if needed? 
If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } } diff --git a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl new file mode 100644 index 000000000000..02a5aaff040e --- /dev/null +++ b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl @@ -0,0 +1,3 @@ +{ "id": "two_agent_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } +{ "id": "two_agent_arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } +{ "id": "two_agent_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } diff --git a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl new file mode 100644 index 000000000000..6b1cbba2b271 --- /dev/null +++ b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl @@ -0,0 +1,3 @@ +{ "id": "two_agent_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } +{ "id": "two_agent_arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } +{ "id": "two_agent_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } diff --git a/samples/tools/testbed/scenarios/HumanEval/README.md b/samples/tools/testbed/scenarios/HumanEval/README.md new file mode 100644 index 000000000000..b0748865807a --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/README.md @@ -0,0 +1 @@ +Run `python ../../utils/download_humaneval.py` to populate this folder. 
diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/coding/my_tests.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/coding/my_tests.py new file mode 100644 index 000000000000..951a40831111 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/coding/my_tests.py @@ -0,0 +1,10 @@ +# Disable ruff linter for template files +# ruff: noqa: F821 + +__TEST__ + + +def run_tests(candidate): + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#\nTERMINATE") diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/prompt.txt b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/scenario.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/scenario.py new file mode 100644 index 000000000000..c49166ce7eb1 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatFourAgents/scenario.py @@ -0,0 +1,115 @@ +import os +import json +import base64 +import autogen +import testbed_utils + +# NOTE: +# This scenario runs Human Eval in a slightly unconventional way: +# The agents have access to the unit tests, and can keep trying +# until they pass. + +testbed_utils.init() +############################## + +work_dir = "coding" + +# Read the prompt +PROMPT = "" +with open("prompt.txt", "rt") as fh: + PROMPT = fh.read() + +# Ok, now get autogen to solve it. +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["__MODEL__"]}, +) + +assistant = autogen.AssistantAgent( + "coder", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +user_proxy = autogen.UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + system_message="A human who can run code at a terminal and report back the results.", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + code_execution_config={ + "work_dir": work_dir, + "use_docker": False, + "last_n_messages": "auto", + }, + max_consecutive_auto_reply=10, +) + +guardrails_agent = autogen.AssistantAgent( + "guardrails_agent", + system_message=""" +You are a guardrails agent and are tasked with ensuring that all parties adhere to the following responsible AI policies: + + - You MUST TERMINATE the conversation if it involves writing or running HARMFUL or DESTRUCTIVE code. + - You MUST TERMINATE the conversation if it involves discussions of anything relating to hacking, computer exploits, or computer security. + - You MUST TERMINATE the conversation if it involves violent or graphic content such as Harm to Others, Self-Harm, Suicide. + - You MUST TERMINATE the conversation if it involves demeaning speech, hate speech, discriminatory remarks, or any form of harassment based on race, gender, sexuality, religion, nationality, disability, or any other protected characteristic. 
+ - You MUST TERMINATE the conversation if it involves seeking or giving advice in highly regulated domains such as medical advice, mental health, legal advice, or financial advice.
+ - You MUST TERMINATE the conversation if it involves illegal activities, including encouraging or providing guidance on illegal activities.
+ - You MUST TERMINATE the conversation if it involves manipulative or deceptive content, including scams, phishing, and the spreading of false information.
+ - You MUST TERMINATE the conversation if it involves sexually explicit content or discussions.
+ - You MUST TERMINATE the conversation if it involves sharing or soliciting personal, sensitive, or confidential information from users. This includes financial details, health records, and other private matters.
+ - You MUST TERMINATE the conversation if it involves deep personal problems such as dealing with serious personal issues, mental health concerns, or crisis situations.
+
+If you decide that the conversation must be terminated, explain your reasoning, then output the uppercase word "TERMINATE". If, on the other hand, you decide the conversation is acceptable by the above standards, indicate as much, then ask the other parties to proceed.
+""".strip(),
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+distractor_agent = autogen.AssistantAgent(
+    "executive_chef",
+    system_message="You are an executive chef with 28 years of industry experience. You can answer questions about menu planning, meal preparation, and cooking techniques.",
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+groupchat = autogen.GroupChat(
+    agents=[user_proxy, assistant, guardrails_agent, distractor_agent],
+    messages=[],
+    speaker_selection_method="__SELECTION_METHOD__",
+    allow_repeat_speaker=False,
+    max_round=12,
+)
+
+manager = autogen.GroupChatManager(
+    groupchat=groupchat,
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+user_proxy.initiate_chat(
+    manager,
+    message="""
+The following python code imports the `run_tests(candidate)` function from my_tests.py, and runs
+it on the function `__ENTRY_POINT__`. This will run a set of automated unit tests to verify the
+correct implementation of `__ENTRY_POINT__`. However, `__ENTRY_POINT__` is only partially
+implemented in the code below. Complete the implementation of `__ENTRY_POINT__` and output
+a new stand-alone code block that contains everything needed to run the tests, including: importing
+`my_tests`, calling `run_tests(__ENTRY_POINT__)`, as well as __ENTRY_POINT__'s complete definition,
+such that this code block can be run directly in Python. 
+ +```python +from my_tests import run_tests + +""" + + PROMPT + + """ + +# Run the unit tests +run_tests(__ENTRY_POINT__) +``` +""", +) + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy, guardrails_agent, distractor_agent, manager]) diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/coding/my_tests.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/coding/my_tests.py new file mode 100644 index 000000000000..951a40831111 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/coding/my_tests.py @@ -0,0 +1,10 @@ +# Disable ruff linter for template files +# ruff: noqa: F821 + +__TEST__ + + +def run_tests(candidate): + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#\nTERMINATE") diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/prompt.txt b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/scenario.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/scenario.py new file mode 100644 index 000000000000..ef8d339429a0 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Distractor/scenario.py @@ -0,0 +1,93 @@ +import os +import json +import base64 +import autogen +import testbed_utils + +# NOTE: +# This scenario runs Human Eval in a slightly unconventional way: +# The agents have access to the unit tests, and can keep trying +# until they pass. + +testbed_utils.init() +############################## + +work_dir = "coding" + +# Read the prompt +PROMPT = "" +with open("prompt.txt", "rt") as fh: + PROMPT = fh.read() + +# Ok, now get autogen to solve it. +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["__MODEL__"]}, +) + +assistant = autogen.AssistantAgent( + "coder", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +user_proxy = autogen.UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + system_message="A human who can run code at a terminal and report back the results.", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + code_execution_config={ + "work_dir": work_dir, + "use_docker": False, + "last_n_messages": "auto", + }, + max_consecutive_auto_reply=10, +) + +distractor_agent = autogen.AssistantAgent( + "executive_chef", + system_message="You are an executive chef with 28 years of industry experience. 
You can answer questions about menu planning, meal preparation, and cooking techniques.", + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +groupchat = autogen.GroupChat( + agents=[user_proxy, assistant, distractor_agent], + messages=[], + speaker_selection_method="__SELECTION_METHOD__", + allow_repeat_speaker=False, + max_round=12, +) + +manager = autogen.GroupChatManager( + groupchat=groupchat, + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) + +user_proxy.initiate_chat( + manager, + message=""" +The following python code imports the `run_tests(candidate)` function from my_tests.py, and runs +it on the function `__ENTRY_POINT__`. This will run a set of automated unit tests to verify the +correct implementation of `__ENTRY_POINT__`. However, `__ENTRY_POINT__` is only partially +implemented in the code below. Complete the implementation of `__ENTRY_POINT__` and output +a new stand-alone code block that contains everything needed to run the tests, including: importing +`my_tests`, calling `run_tests(__ENTRY_POINT__)`, as well as __ENTRY_POINT__'s complete definition, +such that this code block can be run directly in Python. + +```python +from my_tests import run_tests + +""" + + PROMPT + + """ + +# Run the unit tests +run_tests(__ENTRY_POINT__) +``` +""", +) + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy, distractor_agent, manager]) diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/coding/my_tests.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/coding/my_tests.py new file mode 100644 index 000000000000..951a40831111 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/coding/my_tests.py @@ -0,0 +1,10 @@ +# Disable ruff linter for template files +# ruff: noqa: F821 + +__TEST__ + + +def run_tests(candidate): + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#\nTERMINATE") diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/prompt.txt b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/scenario.py b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/scenario.py new file mode 100644 index 000000000000..258fdcecc2a7 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/GroupChatThreeAgents_Guardrails/scenario.py @@ -0,0 +1,110 @@ +import os +import json +import base64 +import autogen +import testbed_utils + +# NOTE: +# This scenario runs Human Eval in a slightly unconventional way: +# The agents have access to the unit tests, and can keep trying +# until they pass. + +testbed_utils.init() +############################## + +work_dir = "coding" + +# Read the prompt +PROMPT = "" +with open("prompt.txt", "rt") as fh: + PROMPT = fh.read() + +# Ok, now get autogen to solve it. 
+config_list = autogen.config_list_from_json(
+    "OAI_CONFIG_LIST",
+    filter_dict={"model": ["__MODEL__"]},
+)
+
+assistant = autogen.AssistantAgent(
+    "coder",
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+user_proxy = autogen.UserProxyAgent(
+    "user_proxy",
+    human_input_mode="NEVER",
+    system_message="A human who can run code at a terminal and report back the results.",
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    code_execution_config={
+        "work_dir": work_dir,
+        "use_docker": False,
+        "last_n_messages": "auto",
+    },
+    max_consecutive_auto_reply=10,
+)
+
+guardrails_agent = autogen.AssistantAgent(
+    "guardrails_agent",
+    system_message="""
+You are a guardrails agent and are tasked with ensuring that all parties adhere to the following responsible AI policies:
+
+ - You MUST TERMINATE the conversation if it involves writing or running HARMFUL or DESTRUCTIVE code.
+ - You MUST TERMINATE the conversation if it involves discussions of anything relating to hacking, computer exploits, or computer security.
+ - You MUST TERMINATE the conversation if it involves violent or graphic content such as Harm to Others, Self-Harm, Suicide.
+ - You MUST TERMINATE the conversation if it involves demeaning speech, hate speech, discriminatory remarks, or any form of harassment based on race, gender, sexuality, religion, nationality, disability, or any other protected characteristic.
+ - You MUST TERMINATE the conversation if it involves seeking or giving advice in highly regulated domains such as medical advice, mental health, legal advice, or financial advice.
+ - You MUST TERMINATE the conversation if it involves illegal activities, including encouraging or providing guidance on illegal activities.
+ - You MUST TERMINATE the conversation if it involves manipulative or deceptive content, including scams, phishing, and the spreading of false information.
+ - You MUST TERMINATE the conversation if it involves sexually explicit content or discussions.
+ - You MUST TERMINATE the conversation if it involves sharing or soliciting personal, sensitive, or confidential information from users. This includes financial details, health records, and other private matters.
+ - You MUST TERMINATE the conversation if it involves deep personal problems such as dealing with serious personal issues, mental health concerns, or crisis situations.
+
+If you decide that the conversation must be terminated, explain your reasoning, then output the uppercase word "TERMINATE". If, on the other hand, you decide the conversation is acceptable by the above standards, indicate as much, then ask the other parties to proceed.
+""".strip(),
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+groupchat = autogen.GroupChat(
+    agents=[user_proxy, assistant, guardrails_agent],
+    messages=[],
+    speaker_selection_method="__SELECTION_METHOD__",
+    allow_repeat_speaker=False,
+    max_round=12,
+)
+
+
+manager = autogen.GroupChatManager(
+    groupchat=groupchat,
+    is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
+    llm_config=testbed_utils.default_llm_config(config_list, timeout=180),
+)
+
+user_proxy.initiate_chat(
+    manager,
+    message="""
+The following python code imports the `run_tests(candidate)` function from my_tests.py, and runs
+it on the function `__ENTRY_POINT__`. 
This will run a set of automated unit tests to verify the +correct implementation of `__ENTRY_POINT__`. However, `__ENTRY_POINT__` is only partially +implemented in the code below. Complete the implementation of `__ENTRY_POINT__` and output +a new stand-alone code block that contains everything needed to run the tests, including: importing +`my_tests`, calling `run_tests(__ENTRY_POINT__)`, as well as __ENTRY_POINT__'s complete definition, +such that this code block can be run directly in Python. + +```python +from my_tests import run_tests + +""" + + PROMPT + + """ + +# Run the unit tests +run_tests(__ENTRY_POINT__) +``` +""", +) + + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy, guardrails_agent, manager]) diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/coding/my_tests.py b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/coding/my_tests.py new file mode 100644 index 000000000000..951a40831111 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/coding/my_tests.py @@ -0,0 +1,10 @@ +# Disable ruff linter for template files +# ruff: noqa: F821 + +__TEST__ + + +def run_tests(candidate): + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#\nTERMINATE") diff --git a/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/prompt.txt b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/samples/tools/testbed/scenarios/human_eval_two_agents.py b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/scenario.py similarity index 62% rename from samples/tools/testbed/scenarios/human_eval_two_agents.py rename to samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/scenario.py index 536bf5fe7f4f..d47a09458888 100644 --- a/samples/tools/testbed/scenarios/human_eval_two_agents.py +++ b/samples/tools/testbed/scenarios/HumanEval/Templates/TwoAgents/scenario.py @@ -14,26 +14,10 @@ work_dir = "coding" -# These come formatted as Base64 to avoid conflicting with the triple-quotes -TESTS = base64.b64decode("__TEST_BASE64__").decode("utf-8") -PROMPT = base64.b64decode("__PROMPT_BASE64__").decode("utf-8") - -# Write the tests to a file so that the agents can access them -if not os.path.isdir(work_dir): - os.mkdir(work_dir) -with open(os.path.join(work_dir, "my_tests.py"), "wt") as fh: - fh.write( - TESTS - + """ - - -def run_tests(candidate): - check(candidate) - # We can search for this string in the output - print("ALL TESTS PASSED !#!#\\nTERMINATE") -""" - ) - +# Read the prompt +PROMPT = "" +with open("prompt.txt", "rt") as fh: + PROMPT = fh.read() # Ok, now get autogen to solve it. 
config_list = autogen.config_list_from_json( @@ -43,13 +27,14 @@ def run_tests(candidate): assistant = autogen.AssistantAgent( "assistant", - is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, llm_config=testbed_utils.default_llm_config(config_list, timeout=180), ) + user_proxy = autogen.UserProxyAgent( "user_proxy", human_input_mode="NEVER", - is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, + is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0, code_execution_config={ "work_dir": work_dir, "use_docker": False, @@ -57,6 +42,7 @@ def run_tests(candidate): max_consecutive_auto_reply=10, default_auto_reply="TERMINATE", ) + user_proxy.initiate_chat( assistant, message=""" @@ -64,25 +50,22 @@ def run_tests(candidate): it on the function `__ENTRY_POINT__`. This will run a set of automated unit tests to verify the correct implementation of `__ENTRY_POINT__`. However, `__ENTRY_POINT__` is only partially implemented in the code below. Complete the implementation of `__ENTRY_POINT__` and output -a new stand-alone code block that contains everything needed run the tests, including: importing -`my_tests`, calling `run_tests(__ENTRY_POINT__)`, as well as __ENTRY_POINT__'s comepelte definition, -such that this code block can be run direcly in Python. +a new stand-alone code block that contains everything needed to run the tests, including: importing +`my_tests`, calling `run_tests(__ENTRY_POINT__)`, as well as __ENTRY_POINT__'s complete definition, +such that this code block can be run directly in Python. ```python from my_tests import run_tests - """ + PROMPT + """ - # Run the unit tests run_tests(__ENTRY_POINT__) ``` """, ) - ############################## testbed_utils.finalize(agents=[assistant, user_proxy]) diff --git a/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl deleted file mode 100644 index 15884c3a2705..000000000000 --- a/samples/tools/testbed/scenarios/default_two_agents_gpt35.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{ "id": "two_agent_stocks", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } -{ "id": "two_agent_arxiv_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } -{ "id": "two_agent_mslogo_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. 
If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } diff --git a/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl deleted file mode 100644 index c774f73f405e..000000000000 --- a/samples/tools/testbed/scenarios/default_two_agents_gpt4.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{ "id": "two_agent_stocks", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } } -{ "id": "two_agent_arxiv_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } } -{ "id": "two_agent_mslogo_search", "template": "default_two_agents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } } diff --git a/samples/tools/testbed/utils/download_humaneval.py b/samples/tools/testbed/utils/download_humaneval.py index faf6c3c3b553..c967b5342a3e 100644 --- a/samples/tools/testbed/utils/download_humaneval.py +++ b/samples/tools/testbed/utils/download_humaneval.py @@ -10,58 +10,110 @@ import os import base64 +URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" -script_path = os.path.realpath(__file__) -script_name = os.path.basename(script_path) -script_dir = os.path.dirname(script_path) +SCRIPT_PATH = os.path.realpath(__file__) +SCRIPT_NAME = os.path.basename(SCRIPT_PATH) +SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) -# Directory where scenarios are stored -scenarios_dir = os.path.realpath(os.path.join(script_dir, os.path.pardir, "scenarios")) -print("Saving HumanEval scenarios to: " + scenarios_dir) +# A selected subset of HumanEval problems to work with during development +REDUCED_SET = [ + "HumanEval/2", + "HumanEval/26", + "HumanEval/32", + "HumanEval/33", + "HumanEval/36", + "HumanEval/38", + "HumanEval/41", + "HumanEval/50", + "HumanEval/56", + "HumanEval/65", + "HumanEval/67", + "HumanEval/84", + "HumanEval/85", + "HumanEval/86", + "HumanEval/89", + "HumanEval/99", + "HumanEval/104", + "HumanEval/113", + "HumanEval/115", + "HumanEval/120", + "HumanEval/124", + "HumanEval/126", + "HumanEval/132", + "HumanEval/135", + "HumanEval/140", + "HumanEval/146", +] -# URL of the file to download -url = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" +def download_human_eval(): + """Download the HumanEval dataset, un-gzips it, and returns a list of its parsed JSON objects.""" -# Send a HTTP request to the URL of the file -response = requests.get(url) + # Send a HTTP request to the URL of the file + response = requests.get(URL) -# Ensure we raise an error if the download failed -response.raise_for_status() + # Ensure we raise an error if the download failed + response.raise_for_status() -# Create a BytesIO object from the response content -buffer = io.BytesIO(response.content) + # Create a BytesIO object from the response content + buffer = io.BytesIO(response.content) -# Create a scenario file -fh_gpt4 = open(os.path.join(scenarios_dir, "human_eval_two_agents_gpt4.jsonl"), "wt") -fh_gpt35 = open(os.path.join(scenarios_dir, 
"human_eval_two_agents_gpt35.jsonl"), "wt") + # Read the file, line by line, populating a list of parsed JSON objects + results = [] + with gzip.GzipFile(fileobj=buffer) as f_in: + for line in f_in: + # Parse each line as JSON + results.append(json.loads(line)) -# Open the buffer as a .gz file and read it line by line -with gzip.GzipFile(fileobj=buffer) as f_in: - for line in f_in: - # Parse each line as JSON - data = json.loads(line) - print("Converting: " + data["task_id"]) + return results - # Write the GPT-4 scenario - # Prompts and tests are saved in base 64 to greatly simplify escaping them as they - # move through the various formats and scripts. I welcome a better, more readable, alternative. - record = { - "id": data["task_id"].replace("/", "_"), - "template": "human_eval_two_agents.py", - "values": { - "__MODEL__": "gpt-4", - "__PROMPT_BASE64__": base64.b64encode(data["prompt"].encode("utf-8")).decode("utf-8"), - "__ENTRY_POINT__": data["entry_point"], - "__TEST_BASE64__": base64.b64encode(data["test"].encode("utf-8")).decode("utf-8"), - }, - } - fh_gpt4.write(json.dumps(record).strip() + "\n") - # Write the GPT 3.5 Version - record["values"]["__MODEL__"] = "gpt-3.5-turbo-16k" - fh_gpt35.write(json.dumps(record).strip() + "\n") +def create_jsonl(name, tasks, template, model): + """Creates a JSONL scenario file with a given name, list of HumanEval tasks, template path, and model.""" + scenarios_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "HumanEval")) -fh_gpt4.close() -fh_gpt35.close() + with open(os.path.join(scenarios_dir, name + ".jsonl"), "wt") as fh: + for task in tasks: + print(f"Converting: [{name}] {task['task_id']}") + + record = { + "id": task["task_id"].replace("/", "_"), + "template": template, + "substitutions": { + "scenario.py": { + "__MODEL__": model, + "__ENTRY_POINT__": task["entry_point"], + "__SELECTION_METHOD__": "auto", + }, + "prompt.txt": {"__PROMPT__": task["prompt"]}, + "coding/my_tests.py": {"__TEST__": task["test"]}, + }, + } + + fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +if __name__ == "__main__": + human_eval = download_human_eval() + reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET] + + models = { + "gpt4": "gpt-4", + "gpt35": "gpt-3.5-turbo-16k", + } + + templates = { + "two_agents": "Templates/TwoAgents", + "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor", + "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails", + "gc4": "Templates/GroupChatFourAgents", + } + + # Create the various combinations of [models] x [templates] + for m in models.items(): + for t in templates.items(): + create_jsonl(f"human_eval_{t[0]}_{m[0]}", human_eval, t[1], m[1]) + create_jsonl(f"r_human_eval_{t[0]}_{m[0]}", reduced_human_eval, t[1], m[1]) From 5e9fe6098846e07783d04cbbf3c1461533c2661e Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Sun, 19 Nov 2023 23:04:14 -0800 Subject: [PATCH 05/16] Fixed the default termination message. 
---
 .../testbed/scenarios/Examples/Templates/TwoAgents/scenario.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py
index a445c68523da..6f736a5aaba5 100644
--- a/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py
+++ b/samples/tools/testbed/scenarios/Examples/Templates/TwoAgents/scenario.py
@@ -25,7 +25,7 @@
         "use_docker": False,
     },
     max_consecutive_auto_reply=10,
-    default_auto_reply="TERMINATE",
+    default_auto_reply="",
 )
 user_proxy.initiate_chat(
     assistant,

From f96245a379599223e9e54993e38dc7c261f74083 Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Sun, 19 Nov 2023 23:15:51 -0800
Subject: [PATCH 06/16] Fixed another termination condition.

---
 .../scenarios/Examples/Templates/ThreeAgents/scenario.py | 3 ++-
 .../scenarios/Examples/default_three_agents_gpt35.jsonl  | 2 +-
 .../scenarios/Examples/default_three_agents_gpt4.jsonl   | 2 +-
 .../scenarios/Examples/default_two_agents_gpt35.jsonl    | 6 +++---
 .../scenarios/Examples/default_two_agents_gpt4.jsonl     | 6 +++---
 5 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py b/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py
index 56f72eeaa0e5..cbf383ebfe27 100644
--- a/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py
+++ b/samples/tools/testbed/scenarios/Examples/Templates/ThreeAgents/scenario.py
@@ -20,13 +20,14 @@
 user_proxy = autogen.UserProxyAgent(
     "user_proxy",
     human_input_mode="NEVER",
+    system_message="A human who can run code at a terminal and report back the results.",
     is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
     code_execution_config={
         "work_dir": "coding",
         "use_docker": False,
     },
     max_consecutive_auto_reply=10,
-    default_auto_reply="TERMINATE",
+    default_auto_reply="",
 )

 third_agent = autogen.AssistantAgent(
diff --git a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl
index 656fab6760cd..9dc14578f0dc 100644
--- a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl
+++ b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt35.jsonl
@@ -1 +1 @@
-{ "id": "two_agent_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax error or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriated filtered, aggregated, or grouped if needed? If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } }
+{ "id": "nvda_tsla_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax errors or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriately filtered, aggregated, or grouped if needed? If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } }
diff --git a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl
index a2b9dccd2e42..8b1f5b717e62 100644
--- a/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl
+++ b/samples/tools/testbed/scenarios/Examples/default_three_agents_gpt4.jsonl
@@ -1 +1 @@
-{ "id": "two_agent_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax error or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriated filtered, aggregated, or grouped if needed? If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } }
+{ "id": "nvda_tsla_stocks", "template": "Templates/ThreeAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD.", "__SELECTION_METHOD__": "auto", "__3RD_AGENT_NAME__": "visualization_critic", "__3RD_AGENT_PROMPT__": "A student of Edward Tufte, you are an expert in information design, and will provide helpful critiques of visualizations. As you prepare your critiques, please consider the following dimensions:\n- Are there bugs, logic errors, syntax errors or typos in the visualization code? Are there any reasons why the code may fail to run? How should it be fixed?\n- Is the data transformed appropriately for the visualization type? E.g., is the dataset appropriately filtered, aggregated, or grouped if needed? If a date field is used, is the date field first converted to a date object etc?\n- How well does the code meet the specified visualization goals?\n- CONSIDERING BEST PRACTICES, is the visualization type appropriate for the data and intent? Is there a visualization type that would be more effective in conveying insights? \n- Are the aesthetics of the visualization appropriate for the visualization type and the data?" } } }
diff --git a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl
index 02a5aaff040e..e67f6b40121a 100644
--- a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl
+++ b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt35.jsonl
@@ -1,3 +1,3 @@
-{ "id": "two_agent_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } }
-{ "id": "two_agent_arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } }
-{ "id": "two_agent_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } }
+{ "id": "nvda_tsla_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } }
+{ "id": "arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } }
+{ "id": "old_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-3.5-turbo-16k", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } }
diff --git a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl
index 6b1cbba2b271..3bb941d92ab9 100644
--- a/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl
+++ b/samples/tools/testbed/scenarios/Examples/default_two_agents_gpt4.jsonl
@@ -1,3 +1,3 @@
-{ "id": "two_agent_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } }
-{ "id": "two_agent_arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } }
-{ "id": "two_agent_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } }
+{ "id": "nvda_tsla_stocks", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD." } } }
+{ "id": "arxiv_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find 10 papers on explainable or interpretable AI that were submitted to arXiv within the last year. When printing results, include paper titles, authors, dates, and URLs, but not their abstracts." } } }
+{ "id": "old_mslogo_search", "template": "Templates/TwoAgents", "substitutions": { "scenario.py": { "__MODEL__": "gpt-4", "__PROMPT__": "Find Microsoft's logo from 1983, and save it to disk. If searching the web, use Bing with API key stored in os.environ['BING_API_KEY']" } } }

From a95c74815a8221bea589b65841b9f5265505e51c Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Mon, 27 Nov 2023 20:32:49 -0800
Subject: [PATCH 07/16] Updated compatible autogen versions.

---
 samples/tools/testbed/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md
index 5347960414c9..c401df8cb341 100644
--- a/samples/tools/testbed/README.md
+++ b/samples/tools/testbed/README.md
@@ -2,7 +2,7 @@

 The Autogen Testbed environment is a tool for repeatedly running a set of pre-defined Autogen scenarios in a setting with tightly-controlled initial conditions. With each run, Autogen will start from a blank slate, working out what code needs to be written, and what libraries or dependencies to install. The results of each run are logged, and can be ingested by analysis or metrics scripts (see the HumanEval example later in this README). By default, all runs are conducted in freshly-initialized docker containers, providing the recommended level of consistency and safety.

-This Testbed sample has been tested in, and is known to work with, Autogen versions 0.1.14 and 0.2.0b6
+This Testbed sample has been tested in, and is known to work with, Autogen versions 0.1.14 and 0.2.0

 ## Setup

From 46b0abca64386e1ae58837ea3a025bd4c05b6487 Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Wed, 29 Nov 2023 00:48:41 -0800
Subject: [PATCH 08/16] Added initial support for GAIA benchmark.
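Each GAIA task is expanded into one line of a scenario JSONL file. A hypothetical example of the record shape that `expand_gaia.py` emits (the id, file name, prompt, and answer below are invented for illustration; real values come from GAIA's metadata.jsonl):

```python
# Illustrative only: prints one scenario record in the JSONL format
# consumed by run_scenarios.py.
import json

record = {
    "id": "task-uuid",
    "template": "Templates/DefaultTwoAgents",
    "substitutions": {
        "scenario.py": {
            "__MODEL__": "gpt-4",
            "__FILE_NAME__": "attachment.xlsx",  # empty when the task has no file
            "__PROMPT__": "What is the sum of column B?",
        },
        "scenario_init.sh": {"__FILE_NAME__": "attachment.xlsx"},
        "expected_answer.txt": {"__EXPECTED_ANSWER__": "42"},
    },
}
print(json.dumps(record))
```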
--- .../DefaultTwoAgents/expected_answer.txt | 1 + .../Templates/DefaultTwoAgents/scenario.py | 54 +++++++++++ .../DefaultTwoAgents/scenario_init.sh | 6 ++ samples/tools/testbed/utils/expand_gaia.py | 90 +++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt create mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py create mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh create mode 100644 samples/tools/testbed/utils/expand_gaia.py diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt new file mode 100644 index 000000000000..8153c2bf8242 --- /dev/null +++ b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py new file mode 100644 index 000000000000..f149eaff30af --- /dev/null +++ b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py @@ -0,0 +1,54 @@ +import os +import json +import autogen +import testbed_utils + +testbed_utils.init() +############################## + +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["__MODEL__"]}, +) + +assistant = autogen.AssistantAgent( + "assistant", + is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) +user_proxy = autogen.UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, + code_execution_config={ + "work_dir": "coding", + "use_docker": False, + }, + max_consecutive_auto_reply=10, + default_auto_reply="", +) + +filename = "__FILE_NAME__".strip() +question = """ +__PROMPT__ +""".strip() + +if len(filename) > 0: + question = f"Consider the file '{filename}', which can be read from the current working directory. {question}" + +user_proxy.initiate_chat( + assistant, + message=f""" +I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. +YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. +If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. +If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. + +Question: {question} +""".strip(), +) + + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy]) diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh new file mode 100644 index 000000000000..16ba19dddaf6 --- /dev/null +++ b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh @@ -0,0 +1,6 @@ +# Scenario Init. 
+mkdir coding
+if [ -f "gaia_files/__FILE_NAME__" ] ; then
+    mv "gaia_files/__FILE_NAME__" coding/.
+fi
+rm -Rf gaia_files
diff --git a/samples/tools/testbed/utils/expand_gaia.py b/samples/tools/testbed/utils/expand_gaia.py
new file mode 100644
index 000000000000..010837ec8b46
--- /dev/null
+++ b/samples/tools/testbed/utils/expand_gaia.py
@@ -0,0 +1,90 @@
+#
+# Run this file to convert a local clone of the GAIA benchmark into corresponding testbed scenarios
+# (writes JSONL scenario files to ../scenarios/GAIA/)
+#
+
+import json
+import os
+import sys
+import shutil
+
+SCRIPT_PATH = os.path.realpath(__file__)
+SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
+SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
+
+
+def create_jsonl(name, tasks, template, model):
+    """Creates a JSONL scenario file with a given name, list of GAIA tasks, template path, and model."""
+
+    scenarios_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA"))
+
+    with open(os.path.join(scenarios_dir, name + ".jsonl"), "wt") as fh:
+        for task in tasks:
+            print(f"Converting: [{name}] {task['task_id']}")
+
+            record = {
+                "id": task["task_id"],
+                "template": template,
+                "substitutions": {
+                    "scenario.py": {
+                        "__MODEL__": model,
+                        "__FILE_NAME__": task["file_name"],
+                        "__PROMPT__": task["Question"],
+                    },
+                    "scenario_init.sh": {"__FILE_NAME__": task["file_name"]},
+                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
+                },
+            }
+
+            fh.write(json.dumps(record).strip() + "\n")
+
+
+###############################################################################
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        sys.exit(f"SYNTAX: python {SCRIPT_NAME} [path to GAIA repository]")
+
+    # Copy the relevant GAIA files
+    gaia_path = os.path.realpath(sys.argv[1])
+
+    gaia_validation_files = os.path.join(gaia_path, "2023", "validation")
+    gaia_test_files = os.path.join(gaia_path, "2023", "test")
+
+    if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
+        sys.exit(f"Error: '{gaia_path}' does not appear to be a copy of the GAIA repository.")
+
+    gaia_merged_files = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA", "GAIA_Files"))
+
+    shutil.copytree(
+        gaia_validation_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
+    )
+    shutil.copytree(
+        gaia_test_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
+    )
+
+    # Load the GAIA data
+    gaia_validation_tasks = []
+    with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh:
+        for line in fh:
+            gaia_validation_tasks.append(json.loads(line))
+
+    models = {
+        "gpt4": "gpt-4",
+    }
+
+    templates = {
+        "two_agents": "Templates/DefaultTwoAgents",
+    }
+
+    # Create necessary symlinks
+    for t in templates.items():
+        template_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA", t[1]))
+        try:
+            os.symlink(gaia_merged_files, os.path.join(template_dir, "gaia_files"))
+        except FileExistsError:
+            pass
+
+    # Create the various combinations of [models] x [templates]
+    for m in models.items():
+        for t in templates.items():
+            create_jsonl(f"gaia_validation_{t[0]}_{m[0]}", gaia_validation_tasks, t[1], m[1])

From 53cd5b45661c6d503d0c8cfea6e6567d6e73d289 Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Wed, 29 Nov 2023 00:51:42 -0800
Subject: [PATCH 09/16] Fixed a bug in executing the finalize scripts.
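Both finalize steps were previously guarded by a copy-pasted check for scenario_init.sh, so scenario_finalize.sh and global_finalize.sh only ran when an init script happened to be present. Each guard now tests for the file it actually sources.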
---
 samples/tools/testbed/run_scenarios.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/samples/tools/testbed/run_scenarios.py b/samples/tools/testbed/run_scenarios.py
index 3ecaaae3266f..5b46e12d71f6 100644
--- a/samples/tools/testbed/run_scenarios.py
+++ b/samples/tools/testbed/run_scenarios.py
@@ -188,12 +188,12 @@ def run_scenario_natively(work_dir):
 fi

 # Run the scenario finalize script if it exists
-if [ -f scenario_init.sh ] ; then
+if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
 fi

 # Run the global finalize script if it exists
-if [ -f scenario_init.sh ] ; then
+if [ -f global_finalize.sh ] ; then
     . ./global_finalize.sh
 fi

@@ -268,12 +268,12 @@ def run_scenario_in_docker(work_dir, requirements, timeout=600):
 fi

 # Run the scenario finalize script if it exists
-if [ -f scenario_init.sh ] ; then
+if [ -f scenario_finalize.sh ] ; then
     . ./scenario_finalize.sh
 fi

 # Run the global finalize script if it exists
-if [ -f scenario_init.sh ] ; then
+if [ -f global_finalize.sh ] ; then
     . ./global_finalize.sh
 fi

From 2d97bb85ad8fcd3de38e65f45c2c326267955001 Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Wed, 29 Nov 2023 09:45:11 -0800
Subject: [PATCH 10/16] Generalized the template further to support multiple folder copy operations.

---
 samples/tools/testbed/run_scenarios.py | 92 +++++++++++++++++---------
 1 file changed, 59 insertions(+), 33 deletions(-)

diff --git a/samples/tools/testbed/run_scenarios.py b/samples/tools/testbed/run_scenarios.py
index 5b46e12d71f6..c8e8f0404f77 100644
--- a/samples/tools/testbed/run_scenarios.py
+++ b/samples/tools/testbed/run_scenarios.py
@@ -86,17 +86,14 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
             continue
         print(f"Running scenario {results_repetition}")

-        # Copy the contents of GLOBAL_INCLUDES_DIR to the result_repetition dir
-        shutil.copytree(GLOBAL_INCLUDES_DIR, results_repetition, ignore=shutil.ignore_patterns("*.example"))
+        # Expand the scenario
+        expand_scenario(scenario_dir, instance, results_repetition)

         # Append the config list to the ENV file
         config_list_json = json.dumps(config_list)
         with open(os.path.join(results_repetition, "ENV"), "at") as fh:
             fh.write(f"export OAI_CONFIG_LIST='{config_list_json}'\n")

-        # Expand the scenario
-        expand_scenario(scenario_dir, instance, results_repetition)
-
         # Run the scenario
         if is_native:
             run_scenario_natively(results_repetition)
@@ -107,41 +104,70 @@ def run_scenarios(scenario, n_repeats, is_native, config_list, requirements, res
 def expand_scenario(scenario_dir, scenario, output_dir):
     """
     Expand a scenario into a folder.
+    Despite some awkwardness created by backwards compatibility and notational conveniences, expansion is conceptually simple.
+    It is a series of copy commands (similar to `cp -R`), followed by a series of in-place find and replace operations.
     """

-    template_path = os.path.join(scenario_dir, scenario["template"])
+    template = scenario["template"]

     # Either key works for finding the substitutions list. "values" may be deprecated in the future
     substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"]

-    # If the template is a folder, copy the tree, and treat the substitutions dictionary
-    # as nested [file]->[find_str]->[replace_str].
-    if os.path.isdir(template_path):
-        shutil.copytree(template_path, output_dir, dirs_exist_ok=True)
-        for templated_file in substitutions.keys():  # Keys are relative file paths
-            expand_file(
-                os.path.join(template_path, templated_file),
-                os.path.join(output_dir, templated_file),
-                substitutions[templated_file],
-            )
+    # Older versions are only one-level deep. Convert them.
+    if len(substitutions) > 0 and isinstance(substitutions[next(iter(substitutions))], str):
+        substitutions = {"scenario.py": substitutions}
+
+    copy_operations = []
+
+    # Handle file (str), folder (str), or mapping (List) templates
+    if isinstance(template, str):
+        template_path = os.path.join(scenario_dir, template)
+        if os.path.isdir(template_path):
+            copy_operations.append((template, ""))
+        else:
+            copy_operations.append((template, "scenario.py"))
+    elif isinstance(template, list):
+        for elm in template:
+            if isinstance(elm, list):
+                copy_operations.append((elm[0], elm[1]))
+            else:
+                copy_operations.append((elm, ""))
     else:
-        expand_file(template_path, os.path.join(output_dir, "scenario.py"), substitutions)
-
-
-def expand_file(template_file, output_file, values):
-    """
-    Expands a template to a file.
-    """
-    template_fh = open(template_file, "rt")
-    output_fh = open(output_file, "wt")
-
-    for line in template_fh:
-        for k, v in values.items():
-            line = line.replace(k, v)
-        output_fh.write(line)
-
-    template_fh.close()
-    output_fh.close()
+        raise ValueError("expand_scenario expects a str or list for 'template'")
+
+    # The global includes folder is always copied
+    shutil.copytree(GLOBAL_INCLUDES_DIR, output_dir, ignore=shutil.ignore_patterns("*.example"), dirs_exist_ok=False)
+
+    # Expand other folders
+    for items in copy_operations:
+        src_path = pathlib.Path(os.path.join(scenario_dir, items[0])).absolute()
+        dest_path = pathlib.Path(os.path.join(output_dir, items[1])).absolute()
+
+        if os.path.isdir(src_path):
+            shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
+        else:
+            if os.path.isdir(dest_path):
+                # If the destination is a directory, use the same filename
+                shutil.copyfile(src_path, os.path.join(dest_path, os.path.basename(src_path)))
+            else:
+                # Otherwise, use the filename provided
+                shutil.copyfile(src_path, dest_path)
+
+    # Expand templated files
+    for templated_file in substitutions.keys():  # Keys are relative file paths
+        # Read the templated file into memory
+        template_contents = list()
+        with open(os.path.join(output_dir, templated_file), "rt") as fh:
+            for line in fh:
+                template_contents.append(line)
+
+        # Rewrite the templated file with substitutions
+        values = substitutions[templated_file]
+        with open(os.path.join(output_dir, templated_file), "wt") as fh:
+            for line in template_contents:
+                for k, v in values.items():
+                    line = line.replace(k, v)
+                fh.write(line)


 def run_scenario_natively(work_dir):

From a22bea7b04aeb6322c326e950fb6d1e8cd3a5c0c Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Wed, 29 Nov 2023 10:53:42 -0800
Subject: [PATCH 11/16] Refined GAIA support, and broke scenarios down by difficulty.
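With this change, a scenario's "template" value may be a list that mixes plain strings (copied recursively, like `cp -R`) with `[src, dest]` pairs. A sketch of the copy list built for a GAIA task that ships a file attachment (file name invented for illustration):

```python
# Illustrative only: the first entry copies the whole template folder into
# the run directory; the second stages the task's attachment into the
# agents' coding/ working directory, replacing the old scenario_init.sh step.
template_cp_list = [
    "Templates/BasicTwoAgents",
    ["GAIA_Files/attachment.xlsx", "coding/attachment.xlsx"],
]
```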
---
 .../expected_answer.txt                       |  0
 .../GAIA/Templates/BasicTwoAgents/scenario.py | 66 +++++++++++++++++++
 .../Templates/DefaultTwoAgents/scenario.py    | 54 ---------------
 .../DefaultTwoAgents/scenario_init.sh         |  6 --
 samples/tools/testbed/utils/expand_gaia.py    | 43 +++++++-----
 5 files changed, 92 insertions(+), 77 deletions(-)
 rename samples/tools/testbed/scenarios/GAIA/Templates/{DefaultTwoAgents => BasicTwoAgents}/expected_answer.txt (100%)
 create mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
 delete mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py
 delete mode 100644 samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh

diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt b/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/expected_answer.txt
similarity index 100%
rename from samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/expected_answer.txt
rename to samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/expected_answer.txt
diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py b/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
new file mode 100644
index 000000000000..df33ce92342e
--- /dev/null
+++ b/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
@@ -0,0 +1,66 @@
+import os
+import json
+import autogen
+from datetime import datetime
+import testbed_utils
+
+testbed_utils.init()
+##############################
+
+
+GAIA_SYSTEM_MESSAGE = (
+    "You are a helpful AI assistant, and today's date is "
+    + datetime.now().date().isoformat()
+    + """.
+I will ask you a question. Answer this quesiton using your coding and language skills.
+In the following cases, suggest python code (presented in a coding block beginning ```python) or shell script (presented in a coding block beginning ```sh) for the user to execute:
+    1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself.
+    2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly.
+Answer the question step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill.
+The user cannot provide any other feedback or perform any other action beyond executing the code appearing in the code block. The user can't modify your code, so do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. Don't include multiple code blocks in one response. Do not ask users to copy and paste code or results. Instead, use the 'print' function for the output when relevant. Check the execution result reported by the user.
+If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try.
+When you find an answer, report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. +YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. +If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. +If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. + """.strip() +) + + +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["__MODEL__"]}, +) + +assistant = autogen.AssistantAgent( + "assistant", + system_message=GAIA_SYSTEM_MESSAGE, + is_termination_msg=lambda x: x.get("content", "").rstrip().find("FINAL ANSWER") >= 0, + llm_config=testbed_utils.default_llm_config(config_list, timeout=180), +) +user_proxy = autogen.UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + is_termination_msg=lambda x: x.get("content", "").rstrip().find("FINAL ANSWER") >= 0, + code_execution_config={ + "work_dir": "coding", + "use_docker": False, + }, + max_consecutive_auto_reply=10, + default_auto_reply="", +) + +filename = "__FILE_NAME__".strip() +question = """ +__PROMPT__ +""".strip() + +if len(filename) > 0: + question = f"Consider the file '{filename}', which can be read from the current working directory. {question}" + +user_proxy.initiate_chat(assistant, message=question) + + +############################## +testbed_utils.finalize(agents=[assistant, user_proxy]) diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py deleted file mode 100644 index f149eaff30af..000000000000 --- a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import json -import autogen -import testbed_utils - -testbed_utils.init() -############################## - -config_list = autogen.config_list_from_json( - "OAI_CONFIG_LIST", - filter_dict={"model": ["__MODEL__"]}, -) - -assistant = autogen.AssistantAgent( - "assistant", - is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, - llm_config=testbed_utils.default_llm_config(config_list, timeout=180), -) -user_proxy = autogen.UserProxyAgent( - "user_proxy", - human_input_mode="NEVER", - is_termination_msg=lambda x: x.get("content", "").rstrip().find("TERMINATE") >= 0, - code_execution_config={ - "work_dir": "coding", - "use_docker": False, - }, - max_consecutive_auto_reply=10, - default_auto_reply="", -) - -filename = "__FILE_NAME__".strip() -question = """ -__PROMPT__ -""".strip() - -if len(filename) > 0: - question = f"Consider the file '{filename}', which can be read from the current working directory. {question}" - -user_proxy.initiate_chat( - assistant, - message=f""" -I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. -YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. -If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. 
-If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
-If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
-
-Question: {question}
-""".strip(),
-)
-
-
-##############################
-testbed_utils.finalize(agents=[assistant, user_proxy])
diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh b/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh
deleted file mode 100644
index 16ba19dddaf6..000000000000
--- a/samples/tools/testbed/scenarios/GAIA/Templates/DefaultTwoAgents/scenario_init.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-# Scenario Init.
-mkdir coding
-if [ -f "gaia_files/__FILE_NAME__" ] ; then
-    mv "gaia_files/__FILE_NAME__" coding/.
-fi
-rm -Rf gaia_files
diff --git a/samples/tools/testbed/utils/expand_gaia.py b/samples/tools/testbed/utils/expand_gaia.py
index 010837ec8b46..9f208532aeb1 100644
--- a/samples/tools/testbed/utils/expand_gaia.py
+++ b/samples/tools/testbed/utils/expand_gaia.py
@@ -11,27 +11,35 @@
 SCRIPT_PATH = os.path.realpath(__file__)
 SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
 SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
+SCENARIOS_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA"))


 def create_jsonl(name, tasks, template, model):
     """Creates a JSONL scenario file with a given name, list of GAIA tasks, template path, and model."""

-    scenarios_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA"))
-
-    with open(os.path.join(scenarios_dir, name + ".jsonl"), "wt") as fh:
+    with open(os.path.join(SCENARIOS_DIR, name + ".jsonl"), "wt") as fh:
         for task in tasks:
             print(f"Converting: [{name}] {task['task_id']}")

+            # Figure out what files we need to copy
+            template_cp_list = [template]
+            if len(task["file_name"].strip()) > 0:
+                template_cp_list.append(
+                    [
+                        os.path.join("GAIA_Files", task["file_name"].strip()),
+                        os.path.join("coding", task["file_name"].strip()),
+                    ]
+                )
+
             record = {
                 "id": task["task_id"],
-                "template": template,
+                "template": template_cp_list,
                 "substitutions": {
                     "scenario.py": {
                         "__MODEL__": model,
                         "__FILE_NAME__": task["file_name"],
                         "__PROMPT__": task["Question"],
                     },
-                    "scenario_init.sh": {"__FILE_NAME__": task["file_name"]},
                     "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
                 },
             }
@@ -53,7 +61,7 @@ def create_jsonl(name, tasks, template, model):
     if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
         sys.exit(f"Error: '{gaia_path}' does not appear to be a copy of the GAIA repository.")

-    gaia_merged_files = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA", "GAIA_Files"))
+    gaia_merged_files = os.path.realpath(os.path.join(SCENARIOS_DIR, "GAIA_Files"))

     shutil.copytree(
         gaia_validation_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
@@ -63,28 +71,29 @@ def create_jsonl(name, tasks, template, model):
     )

     # Load the GAIA data
-    gaia_validation_tasks = []
+    gaia_validation_tasks = [[], [], []]
     with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh:
         for line in fh:
-            gaia_validation_tasks.append(json.loads(line))
+            data = json.loads(line)
+            gaia_validation_tasks[data["Level"] - 1].append(data)

     models = {
         "gpt4": "gpt-4",
     }

     templates = {
-        "two_agents": "Templates/DefaultTwoAgents",
+        "two_agents": "Templates/BasicTwoAgents",
"Templates/BasicTwoAgents", } - # Create necessary symlinks - for t in templates.items(): - template_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA", t[1])) - try: - os.symlink(gaia_merged_files, os.path.join(template_dir, "gaia_files")) - except FileExistsError: - pass + # Add coding directories if needed (these are usually empty and left out of the repo) + for template in templates.values(): + code_dir_path = os.path.join(SCENARIOS_DIR, template, "coding") + if not os.path.isdir(code_dir_path): + os.mkdir(code_dir_path) # Create the various combinations of [models] x [templates] for m in models.items(): for t in templates.items(): - create_jsonl(f"gaia_validation_{t[0]}_{m[0]}", gaia_validation_tasks, t[1], m[1]) + create_jsonl(f"gaia_validation_level_1__{t[0]}_{m[0]}", gaia_validation_tasks[0], t[1], m[1]) + create_jsonl(f"gaia_validation_level_2__{t[0]}_{m[0]}", gaia_validation_tasks[1], t[1], m[1]) + create_jsonl(f"gaia_validation_level_3__{t[0]}_{m[0]}", gaia_validation_tasks[2], t[1], m[1]) From b26e26d1275b9ef26ed793d8d94c3bd07ae538a3 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 29 Nov 2023 13:32:30 -0800 Subject: [PATCH 12/16] Added some experimental scripts for computing metrics over GAIA. This is a first version, and will likely need refinement. --- samples/tools/testbed/utils/collate_gaia.py | 128 ++++++++++++++++++++ samples/tools/testbed/utils/metrics_gaia.py | 97 +++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 samples/tools/testbed/utils/collate_gaia.py create mode 100644 samples/tools/testbed/utils/metrics_gaia.py diff --git a/samples/tools/testbed/utils/collate_gaia.py b/samples/tools/testbed/utils/collate_gaia.py new file mode 100644 index 000000000000..ff8e1376f110 --- /dev/null +++ b/samples/tools/testbed/utils/collate_gaia.py @@ -0,0 +1,128 @@ +import os +import json +import re +import sys +import argparse + + +def normalize_answer(a): + # Lower case + # Trim (left and right) + # Replace multiple spaces with one space + # Remove trailing punctuation + return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip().lower())) + + +def collate(results_dir): + """ + Collate the results of running GAIA + + Args: + results_dir (path): The folder were results were be saved. + """ + + all_results = list() + max_instances = 0 + + for test_id in os.listdir(results_dir): + test_path = os.path.join(results_dir, test_id) + + # Collect the reslts vector + results = [test_id] + + instance = 0 + instance_dir = os.path.join(test_path, str(instance)) + while os.path.isdir(instance_dir): + expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") + if not os.path.isfile(expected_answer_file): + # Expected ansewr is missing + results.append("") + + instance += 1 + instance_dir = os.path.join(test_path, str(instance)) + continue + + expected_answer = "!!!NULL ANSWER!!!" 
+            with open(expected_answer_file, "rt") as fh:
+                expected_answer = fh.read().strip()
+
+            console_log_file = os.path.join(instance_dir, "console_log.txt")
+            if not os.path.isfile(console_log_file):
+                # Console log file missing
+                results.append("")
+
+                instance += 1
+                instance_dir = os.path.join(test_path, str(instance))
+                continue
+
+            with open(console_log_file, "rt") as fh:
+                console_log = fh.read()
+
+                final_answer = ""
+                m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
+                if m:
+                    final_answer = m.group(1).strip()
+
+                # print(f"Expected Answer: {expected_answer}\nAutogen Answer: {final_answer}\n")
+
+                if normalize_answer(expected_answer) == normalize_answer(final_answer):
+                    results.append("1")
+                else:
+                    results.append("-1")
+
+                instance += 1
+                instance_dir = os.path.join(test_path, str(instance))
+
+        max_instances = max(max_instances, instance)
+
+        # Buffer the results
+        all_results.append(results)
+
+    # Create a header
+    header = "TestId"
+    for i in range(0, max_instances):
+        header += ",Trial" + str(i)
+    print(header)
+
+    # Print a fully-populated table of results
+    for r in all_results:
+        while len(r) < max_instances + 1:
+            r.append("")
+        print(",".join(r))
+
+
+###############################################################################
+if __name__ == "__main__":
+    script_path = os.path.realpath(__file__)
+    script_name = os.path.basename(script_path)
+    script_dir = os.path.dirname(script_path)
+
+    # Path to the default results directory
+    # (relative to this script, up one directory, then into the results folder)
+    default_results_dir = os.path.realpath(
+        os.path.join(script_dir, os.path.pardir, "results", "gaia_validation_level_1__two_agents_gpt4")
+    )
+
+    parser = argparse.ArgumentParser(
+        description=f"""
+{script_name} will collate the results of the GAIA scenarios and output them to a CSV. The CSV format is as follows:
+
+TestId, Trial0, Trial1, ..., TrialN
+uuid_1, x_10,   x_11,   ..., x_1N
+uuid_2, x_20,   x_21,   ..., x_2N
+...
+uuid_M, x_M0,   x_M1,   ..., x_MN
+
+Where uuid_i is the identifier of the ith test question, and x_ij is 1 or -1 depending on whether the test passed or failed, respectively. If data for the trial is missing (e.g., due to a runtime error), the value will be an empty string "".
+""".strip(),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+
+    parser.add_argument(
+        "scenario",
+        nargs="?",
+        help="Path to the scenario results. (default: " + default_results_dir + ")",
+        default=default_results_dir,
+    )
+    args = parser.parse_args()
+    collate(args.scenario)
diff --git a/samples/tools/testbed/utils/metrics_gaia.py b/samples/tools/testbed/utils/metrics_gaia.py
new file mode 100644
index 000000000000..6119f4f38f49
--- /dev/null
+++ b/samples/tools/testbed/utils/metrics_gaia.py
@@ -0,0 +1,97 @@
+import os
+import sys
+import argparse
+import csv
+
+
+def metrics(results_fh):
+    """
+    Compute metrics from collated GAIA results.
+
+    Args:
+        results_fh (File Stream): A file stream containing the collated results in CSV.
+    """
+
+    reader = csv.reader(results_fh)
+    first_row = next(reader)  # Read the first line
+
+    num_trials = len(first_row) - 1  # Don't count the first column (TestId)
+
+    # Set up the counters
+    counters = []
+    for i in range(0, num_trials):
+        counters.append({"successes": 0, "failures": 0, "missing": 0})
+
+    # Load the results. We'll need to iterate over them a few times.
+    results = list()
+    for row in reader:
+        name = row[0]
+        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
+        for i in range(0, len(trials)):
+            v = trials[i]
+            if v is None:
+                counters[i]["missing"] += 1
+            elif v > 0:
+                counters[i]["successes"] += 1
+            else:
+                counters[i]["failures"] += 1
+
+        results.append([name, trials])
+
+    def _safe_div(num, denom):
+        if denom == 0:
+            return ""
+        else:
+            return num / denom
+
+    # Print the results (one row per trial)
+    for i in range(0, len(counters)):
+        counter = counters[i]
+        n = counter["successes"] + counter["failures"] + counter["missing"]
+        score = _safe_div(counter["successes"], n)
+        print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")
+
+
+###############################################################################
+if __name__ == "__main__":
+    script_path = os.path.realpath(__file__)
+    script_name = os.path.basename(script_path)
+    script_dir = os.path.dirname(script_path)
+
+    parser = argparse.ArgumentParser(
+        description=f"""
+{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.
+
+The output will be formatted as a CSV with the following schema:
+
+Trial, n,   successes, failures, missing, score
+0,     N_0, s_0,       f_0,      m_0,     p_0
+1,     N_1, s_1,       f_1,      m_1,     p_1
+...
+M,     N_M, s_M,       f_M,      m_M,     p_M
+
+Where:
+
+  N_i is the number of questions in trial i
+  s_i is the number of successes in trial i
+  f_i is the number of failures in trial i
+  m_i is the number of missing values in trial i
+  p_i is the proportion of successes in trial i (i.e., s_i / N_i)
+
+""".strip(),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+
+    parser.add_argument(
+        "scenario",
+        nargs="?",
+        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
+        default="-",
+    )
+    args = parser.parse_args()
+
+    if args.scenario == "" or args.scenario == "-":
+        metrics(sys.stdin)
+    else:
+        with open(args.scenario, "rt") as fh:
+            metrics(fh)

From 4ee155b030bcd9e904ade0c90bff8935a91507a4 Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Wed, 29 Nov 2023 13:40:27 -0800
Subject: [PATCH 13/16] Added instructions for cloning GAIA

---
 samples/tools/testbed/utils/expand_gaia.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/samples/tools/testbed/utils/expand_gaia.py b/samples/tools/testbed/utils/expand_gaia.py
index 9f208532aeb1..f4b78cb90c39 100644
--- a/samples/tools/testbed/utils/expand_gaia.py
+++ b/samples/tools/testbed/utils/expand_gaia.py
@@ -50,7 +50,9 @@ def create_jsonl(name, tasks, template, model):
 ###############################################################################
 if __name__ == "__main__":
     if len(sys.argv) != 2:
-        sys.exit(f"SYNTAX: python {SCRIPT_NAME} [path to GAIA repository]")
+        sys.exit(
+            f"SYNTAX: python {SCRIPT_NAME} [path to GAIA repository]\n\nNote: to clone the GAIA repository, do 'git clone https://huggingface.co/datasets/gaia-benchmark/GAIA'"
+        )

From 66cd936f247d4c08cd76339d203fb7bccf28762c Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Thu, 30 Nov 2023 09:34:07 -0800
Subject: [PATCH 14/16] Updated README to fix some typos.
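For reference, the README's "(nearly) exactly matches" wording corresponds to the answer normalization applied by the collation script before comparison, roughly as sketched below (mirrors `normalize_answer()` in `utils/collate_gaia.py`): lowercase, collapse runs of whitespace, and strip trailing sentence punctuation.

```python
import re

def normalize_answer(a):
    # Lowercase, collapse whitespace, and drop trailing . ! ? characters.
    return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip().lower()))

assert normalize_answer("  The Answer!  ") == normalize_answer("the answer")
```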
---
 samples/tools/testbed/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md
index 3f6b89a92b04..d46f2da22404 100644
--- a/samples/tools/testbed/README.md
+++ b/samples/tools/testbed/README.md
@@ -197,9 +197,9 @@ cat human_eval_results_gpt35.csv

 ## (Example) Running GAIA

-The Testbed can also be used to run the recently released [GAIA benchmark](https://huggingface.co/gaia-benchmark). This integration is presently experimental, and needs further validation. In this scenario, agents are presented with a series of questions that may include file references, or multi-modal input. Agents then must provide a `FINAL ANSWER`, which is considered correct if (nearly) exactly matches an unambious accepted answer.
+The Testbed can also be used to run the recently released [GAIA benchmark](https://huggingface.co/gaia-benchmark). This integration is presently experimental, and needs further validation. In this scenario, agents are presented with a series of questions that may include file references, or multi-modal input. Agents then must provide a `FINAL ANSWER`, which is considered correct if it (nearly) exactly matches an unambiguously accepted answer.

-Accessing this scenario-type requires downloading and converting the GAIA dataset, running the Testbed, collating the results, and finally computing the metrics. The following commands will accomplish this, running each test instance 3 times with GPT-3.5-Turbo-16k:
+Accessing this scenario-type requires downloading and converting the GAIA dataset, running the Testbed, collating the results, and finally computing the metrics. The following commands will accomplish this, running each test instance once with GPT-4:

 ```
 # Clone the GAIA dataset repo (assuming a 'repos' folder in your home directory)

From 23fa2baafdbc673ac55de821c17a2872f3c8a05f Mon Sep 17 00:00:00 2001
From: Adam Fourney
Date: Fri, 1 Dec 2023 10:08:30 -0800
Subject: [PATCH 15/16] Added a script to format GAIA results for the leaderboard.
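A hypothetical example of one emitted line (values invented for illustration); the script prints one such JSON object per task, in the shape the leaderboard expects:

```python
# Illustrative only: real values are read from each task's console_log.txt.
import json

print(json.dumps({
    "task_id": "task-uuid",
    "model_answer": "42",
    "reasoning_trace": "user_proxy (to assistant): ...",
}))
```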
---
 samples/tools/testbed/README.md               |  2 +-
 .../{collate_gaia.py => collate_gaia_csv.py}  |  0
 .../tools/testbed/utils/collate_gaia_jsonl.py | 76 +++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)
 rename samples/tools/testbed/utils/{collate_gaia.py => collate_gaia_csv.py} (100%)
 create mode 100644 samples/tools/testbed/utils/collate_gaia_jsonl.py

diff --git a/samples/tools/testbed/README.md b/samples/tools/testbed/README.md
index d46f2da22404..506c8b0835f0 100644
--- a/samples/tools/testbed/README.md
+++ b/samples/tools/testbed/README.md
@@ -214,5 +214,5 @@ python ./utils/expand_gaia.py ~/repos/GAIA
 python ./run_scenarios.py ./scenarios/GAIA/gaia_validation_level_1__two_agents_gpt4.jsonl
 
 # Compute Metrics
-python utils/collate_gaia.py ./results/gaia_validation_level_1__two_agents_gpt4 | python utils/metrics_gaia.py
+python utils/collate_gaia_csv.py ./results/gaia_validation_level_1__two_agents_gpt4 | python utils/metrics_gaia.py
 ```
diff --git a/samples/tools/testbed/utils/collate_gaia.py b/samples/tools/testbed/utils/collate_gaia_csv.py
similarity index 100%
rename from samples/tools/testbed/utils/collate_gaia.py
rename to samples/tools/testbed/utils/collate_gaia_csv.py
diff --git a/samples/tools/testbed/utils/collate_gaia_jsonl.py b/samples/tools/testbed/utils/collate_gaia_jsonl.py
new file mode 100644
index 000000000000..3ab88c287bea
--- /dev/null
+++ b/samples/tools/testbed/utils/collate_gaia_jsonl.py
@@ -0,0 +1,76 @@
+import os
+import json
+import re
+import sys
+import argparse
+
+
+def normalize_answer(a):
+    # Trim (left and right)
+    # Replace multiple spaces with one space
+    # Remove trailing punctuation
+    # Trim again
+    return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip())).strip()
+
+
+def collate(results_dir, instance=0):
+    """
+    Collate the results of running GAIA. Print the results in the format accepted by the leaderboard.
+
+    Args:
+        results_dir (path): The folder where results were saved.
+    """
+
+    for test_id in os.listdir(results_dir):
+        test_path = os.path.join(results_dir, test_id)
+
+        instance_dir = os.path.join(test_path, str(instance))
+        console_log_file = os.path.join(instance_dir, "console_log.txt")
+
+        final_answer = ""
+        if os.path.isfile(console_log_file):
+            with open(console_log_file, "rt") as fh:
+                console_log = fh.read()
+
+                final_answer = ""
+                m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
+                if m:
+                    final_answer = normalize_answer(m.group(1))
+
+                # Clean up the GAIA logs so they don't have the Docker setup preamble
+                m = re.search(r"^.*?\r?\n(user_proxy \(to assistant\).*$)", console_log, re.DOTALL)
+                if m:
+                    console_log = m.group(1)
+
+                print(json.dumps({"task_id": test_id, "model_answer": final_answer, "reasoning_trace": console_log}))
+
+
+###############################################################################
+if __name__ == "__main__":
+    script_path = os.path.realpath(__file__)
+    script_name = os.path.basename(script_path)
+    script_dir = os.path.dirname(script_path)
+
+    # Path to the default results directory
+    # (relative to this script, up one directory, then into the results folder)
+    default_results_dir = os.path.realpath(
+        os.path.join(script_dir, os.path.pardir, "results", "gaia_validation_level_1__two_agents_gpt4")
+    )
+
+    parser = argparse.ArgumentParser(
+        description=f"""
+{script_name} will collate the results of the GAIA scenarios into the jsonl format that can be submitted to the GAIA leaderboard.
+
+NOTE: You will likely need to concatenate results for level 1, level 2 and level 3 to form a complete submission.
+""".strip(),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+
+    parser.add_argument(
+        "scenario",
+        nargs="?",
+        help="Path to the scenario results. (default: " + default_results_dir + ")",
+        default=default_results_dir,
+    )
+    args = parser.parse_args()
+    collate(args.scenario)

From 08eec2baca6c06f0d8b93e8f6d5aafaf49b98828 Mon Sep 17 00:00:00 2001
From: Qingyun Wu
Date: Tue, 5 Dec 2023 20:41:19 -0500
Subject: [PATCH 16/16] Update
 samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py

Co-authored-by: LeoLjl <3110503618@qq.com>
---
 .../testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py b/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
index df33ce92342e..f96f88364c4e 100644
--- a/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
+++ b/samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py
@@ -12,7 +12,7 @@
 "You are a helpful AI assistant, and today's date is "
 + datetime.now().date().isoformat()
 + """.
-I will ask you a question. Answer this quesiton using your coding and language skills.
+I will ask you a question. Answer this question using your coding and language skills.
 In the following cases, suggest python code (presented in a coding block beginning ```python) or shell script (presented in a coding block beginning ```sh) for the user to execute:
     1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself.
     2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly.
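(Closing aside: the scenario prompt above and the collation script from PATCH 15 both hinge on the `FINAL ANSWER:` convention. Below is a minimal sketch of the extraction step applied to the console logs; the log text is a made-up illustration, not output from a real testbed run.)

```python
import re

# Toy console log standing in for results/<task>/<instance>/console_log.txt
console_log = "assistant (to user_proxy):\n\nFINAL ANSWER: 42.\nTERMINATE\n"

final_answer = ""
m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
if m:
    # Same cleanup as normalize_answer() in collate_gaia_jsonl.py
    final_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", m.group(1).strip())).strip()

print(final_answer)  # -> "42"
```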