Skip to content

Commit 085bf6c

Browse files
afourney and gagb authored
Version 0.0.2 of Autogenbench (#1548)
* Prints the version of AutoGenBench from the command line, closing i1458
* Added autogenbench version to timestamp.txt
* Attempting to fix formatting.
* Add a gitignore for autogenbench
* Generalize to read all template dirs from Templates
* AutoGenBench logs telemetry when available.
* Remove spaces if present from template names.
* Bump version.
* Fixed formatting.
* Allow native warning to be skipped. Mount autogen repo in Docker if it can be found (experimental).
* Native execution now occurs in a venv.
* Bump version.
* Fixed a prompt escaping bug evident in GAIA task '6f37996b-2ac7-44b0-8e68-6d28256631b4'
* Updated all scenarios to use template discovery.
* Update with main version of runtime_logging.
---------
Co-authored-by: gagb <[email protected]>
1 parent 477598a commit 085bf6c

File tree

15 files changed

+201
-52
lines changed

15 files changed

+201
-52
lines changed

samples/tools/autogenbench/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
scenarios/*/Downloads
2+
scenarios/*/Tasks
3+
*/Results

samples/tools/autogenbench/autogenbench/cli.py

+11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import sys
2+
from .version import __version__
23
from .run_cmd import run_cli
34
from .clone_cmd import clone_cli
45
from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
910
args = sys.argv[:] # Shallow copy
1011

1112
invocation_cmd = "autogenbench"
13+
version_string = f"AutoGenBench version {__version__}"
1214

1315
commands = [
1416
{
@@ -26,6 +28,11 @@ def main(args=None):
2628
"description": "tabulate the results of a previous run",
2729
"function": tabulate_cli,
2830
},
31+
{
32+
"command": "--version",
33+
"description": f"print the version of {invocation_cmd}",
34+
"function": lambda _args: print(f"{version_string}"),
35+
},
2936
{"command": "--help", "description": "print this message", "function": None},
3037
]
3138

@@ -40,6 +47,8 @@ def main(args=None):
4047
commands_details += f" {padded_cmd}: {c['description']}\n"
4148

4249
usage_text = f"""
50+
{version_string}
51+
4352
usage: {invocation_cmd} COMMAND ARGS
4453
4554
Where, COMMAND is one of: {commands_list}
@@ -49,6 +58,8 @@ def main(args=None):
4958
""".strip()
5059

5160
help_text = f"""
61+
{version_string}
62+
5263
usage: {invocation_cmd} COMMAND ARGS
5364
5465
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typically session might resemble:

samples/tools/autogenbench/autogenbench/run_cmd.py

+102-18
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import random
1212
from autogen import config_list_from_json
1313
from autogen.oai.openai_utils import filter_config
14+
from .version import __version__
1415

1516
# Figure out where everything is
1617
SCRIPT_PATH = os.path.realpath(__file__)
@@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
247248
Returns: A dictionary of keys and values that need to be added to the system environment.
248249
"""
249250
env = dict()
251+
252+
# Populate with commonly needed keys
253+
openai_api_key = os.environ.get("OPENAI_API_KEY")
254+
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
255+
env["OPENAI_API_KEY"] = openai_api_key
256+
257+
bing_api_key = os.environ.get("BING_API_KEY")
258+
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
259+
env["BING_API_KEY"] = bing_api_key
260+
261+
# Update with any values from the ENV.json file
250262
if os.path.isfile(env_file):
251263
with open(env_file, "rt") as fh:
252-
env = json.loads(fh.read())
264+
env.update(json.loads(fh.read()))
253265

266+
# Include the config_list that we are using
254267
config_list_json = json.dumps(config_list)
255268
env["OAI_CONFIG_LIST"] = config_list_json
256269

257-
openai_api_key = os.environ.get("OPENAI_API_KEY")
258-
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
259-
env["OPENAI_API_KEY"] = openai_api_key
260-
261270
return env
262271

263272

@@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
286295
f"""#
287296
echo RUN.SH STARTING !#!#
288297
export AUTOGEN_TESTBED_SETTING="Native"
298+
echo "autogenbench version: {__version__}" > timestamp.txt
299+
300+
# Create and activate the virtual environment
301+
# This is called in a subprocess, and will not impact the parent
302+
{sys.executable} -m venv .autogenbench_venv
303+
. .autogenbench_venv/bin/activate
289304
290305
# Run the global init script if it exists
291306
if [ -f global_init.sh ] ; then
@@ -298,6 +313,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
298313
fi
299314
300315
# Run the scenario
316+
pip install -r requirements.txt
301317
echo SCENARIO.PY STARTING !#!#
302318
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
303319
EXIT_CODE=$?
@@ -312,6 +328,10 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
312328
rm -Rf .cache
313329
fi
314330
331+
if [ -d __pycache__ ] ; then
332+
rm -Rf __pycache__
333+
fi
334+
315335
# Run the scenario finalize script if it exists
316336
if [ -f scenario_finalize.sh ] ; then
317337
. ./scenario_finalize.sh
@@ -322,6 +342,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
322342
. ./global_finalize.sh
323343
fi
324344
345+
# We don't need to deactivate the venv because it's
346+
# contained in the subprocess; but we should clean it up
347+
if [ -d .autogenbench_venv ] ; then
348+
rm -Rf .autogenbench_venv
349+
fi
350+
325351
echo RUN.SH COMPLETE !#!#
326352
"""
327353
)
@@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
387413
f"""#
388414
echo RUN.SH STARTING !#!#
389415
export AUTOGEN_TESTBED_SETTING="Docker"
416+
390417
umask 000
418+
echo "autogenbench version: {__version__}" > timestamp.txt
391419
392420
# Run the global init script if it exists
393421
if [ -f global_init.sh ] ; then
@@ -415,6 +443,10 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
415443
rm -Rf .cache
416444
fi
417445
446+
if [ -d __pycache__ ] ; then
447+
rm -Rf __pycache__
448+
fi
449+
418450
# Run the scenario finalize script if it exists
419451
if [ -f scenario_finalize.sh ] ; then
420452
. ./scenario_finalize.sh
@@ -429,18 +461,31 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
429461
"""
430462
)
431463

432-
print("\n\n" + work_dir + "\n===================================================================")
464+
# Figure out what folders to mount
465+
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
466+
467+
# Add the autogen repo if we can find it
468+
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
469+
if autogen_repo_base is None:
470+
autogen_repo_base = find_autogen_repo(os.getcwd())
471+
elif not os.path.isdir(autogen_repo_base):
472+
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
473+
474+
if autogen_repo_base is not None:
475+
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
476+
477+
print("Mounting:")
478+
for k in volumes:
479+
bind = volumes[k]["bind"]
480+
mode = volumes[k]["mode"].upper()
481+
if bind == "/workspace":
482+
k = os.path.relpath(k)
483+
print(f"[{mode}]\t'{k}' => '{bind}'")
484+
print("===================================================================")
433485

434486
# Create and run the container
435-
abs_path = str(pathlib.Path(work_dir).absolute())
436487
container = client.containers.run(
437-
image,
438-
command=["sh", "run.sh"],
439-
working_dir="/workspace",
440-
environment=env,
441-
detach=True,
442-
# get absolute path to the working directory
443-
volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
488+
image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
444489
)
445490

446491
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
@@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
485530
sys.stdout.write(segment["stream"])
486531

487532

533+
def find_autogen_repo(path):
534+
"""
535+
Utility for identifying if the path is a subdirectory of the autogen repo.
536+
537+
Returns: the path to the root of the autogen repo if one is found, otherwise None
538+
"""
539+
540+
# Normalize the path (we expect a directory)
541+
path = os.path.abspath(path)
542+
if os.path.isfile(path):
543+
path = os.path.dirname(path)
544+
545+
while True:
546+
test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py") # We found autogen
547+
if os.path.isfile(test_path):
548+
return path
549+
550+
# Stop if we hit the root
551+
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
552+
if parent_dir == path:
553+
break
554+
555+
# Keep searching
556+
path = parent_dir
557+
558+
return None
559+
560+
488561
def run_cli(args):
489562
invocation_cmd = args[0]
490563
args = args[1:]
@@ -581,12 +654,23 @@ def run_cli(args):
581654
if parsed_args.requirements is not None:
582655
sys.exit("--requirements is not compatible with --native. Exiting.")
583656

584-
choice = input(
585-
'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
657+
sys.stderr.write(
658+
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
586659
)
587660

588-
if choice.strip().lower() != "yes":
589-
sys.exit("Received '" + choice + "'. Exiting.")
661+
# Does an environment variable override the prompt?
662+
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
663+
if allow_native is None or allow_native == "":
664+
choice = input(
665+
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
666+
)
667+
if choice.strip().lower() != "yes":
668+
sys.exit("Received '" + choice + "'. Exiting.")
669+
elif allow_native.strip().lower() != "yes":
670+
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
671+
else:
672+
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
673+
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
590674

591675
# Parse the subsample
592676
subsample = None

samples/tools/autogenbench/autogenbench/template/testbed_utils.py

+17
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66

77
AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
88

9+
# Try importing the runtime_logging module (only available in some branches)
10+
LOGGING_ENABLED = False
11+
try:
12+
import autogen.runtime_logging
13+
14+
LOGGING_ENABLED = True
15+
except ImportError:
16+
pass
17+
918

1019
def default_llm_config(config_list, timeout=180):
1120
"""Return a default config list with a given timeout, and with caching disabled.
@@ -57,6 +66,10 @@ def init():
5766
if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
5867
autogen.Completion.start_logging(compact=False)
5968

69+
# Start logging
70+
if LOGGING_ENABLED:
71+
autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
72+
6073

6174
def finalize(agents):
6275
"""Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def messages_to_json(agent):
89102
with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
90103
fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
91104
autogen.Completion.stop_logging()
105+
106+
# Stop logging
107+
if LOGGING_ENABLED:
108+
autogen.runtime_logging.stop()
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.1"
1+
__version__ = "0.0.2a4"

samples/tools/autogenbench/pyproject.toml

+5
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,8 @@ exclude = ["*.tests*"]
4747

4848
[project.scripts]
4949
autogenbench = "autogenbench.cli:main"
50+
51+
[tool.black]
52+
# https://github.com/psf/black
53+
line-length = 120
54+
exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"

samples/tools/autogenbench/scenarios/AutoGPT/Scripts/init_tasks.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import sys
99
import glob
1010
import base64
11+
import re
1112
from huggingface_hub import snapshot_download
1213

1314
SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):
8889

8990
###############################################################################
9091
def main():
91-
templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
92+
# list all directories in the Templates directory
93+
# and populate a dictionary with the name and path
94+
templates = {}
95+
for entry in os.scandir(TEMPLATES_DIR):
96+
if entry.is_dir():
97+
templates[re.sub(r"\s", "", entry.name)] = entry.path
9298

9399
# Add coding directories if needed (these are usually empty and left out of the repo)
94100
for template in templates.values():

samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
55
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
66
"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
7+
"Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
78
"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
8-
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
99
"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
10+
"Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
11+
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
1012
"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
1113
}
1214
}

samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import os
88
import sys
9+
import re
910
from huggingface_hub import snapshot_download
1011

1112
SCRIPT_PATH = os.path.realpath(__file__)
@@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
6061
"substitutions": {
6162
"scenario.py": {
6263
"__FILE_NAME__": task["file_name"],
63-
"__PROMPT__": task["Question"],
6464
},
6565
"expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
66+
"prompt.txt": {"__PROMPT__": task["Question"]},
6667
},
6768
}
6869

@@ -97,10 +98,12 @@ def main():
9798

9899
gaia_test_tasks[data["Level"] - 1].append(data)
99100

100-
templates = {
101-
"two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
102-
"soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
103-
}
101+
# list all directories in the Templates directory
102+
# and populate a dictionary with the name and path
103+
templates = {}
104+
for entry in os.scandir(TEMPLATES_DIR):
105+
if entry.is_dir():
106+
templates[re.sub(r"\s", "", entry.name)] = entry.path
104107

105108
# Add coding directories if needed (these are usually empty and left out of the repo)
106109
for template in templates.values():
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__PROMPT__

samples/tools/autogenbench/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
testbed_utils.init()
88
##############################
99

10+
# Read the prompt
11+
PROMPT = ""
12+
with open("prompt.txt", "rt") as fh:
13+
PROMPT = fh.read().strip()
1014

1115
GAIA_SYSTEM_MESSAGE = (
1216
"You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@
4852
)
4953

5054
filename = "__FILE_NAME__".strip()
51-
question = """
52-
__PROMPT__
53-
""".strip()
55+
question = PROMPT
5456

5557
if len(filename) > 0:
5658
question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"

0 commit comments

Comments
 (0)