Version 0.0.2 of Autogenbench #1548

Merged · 20 commits · Feb 24, 2024
3 changes: 3 additions & 0 deletions samples/tools/autogenbench/.gitignore
@@ -0,0 +1,3 @@
scenarios/*/Downloads
scenarios/*/Tasks
*/Results
11 changes: 11 additions & 0 deletions samples/tools/autogenbench/autogenbench/cli.py
@@ -1,4 +1,5 @@
import sys
from .version import __version__
from .run_cmd import run_cli
from .clone_cmd import clone_cli
from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
args = sys.argv[:] # Shallow copy

invocation_cmd = "autogenbench"
version_string = f"AutoGenBench version {__version__}"

commands = [
{
@@ -26,6 +28,11 @@
"description": "tabulate the results of a previous run",
"function": tabulate_cli,
},
{
"command": "--version",
"description": f"print the version of {invocation_cmd}",
"function": lambda _args: print(f"{version_string}"),
},
{"command": "--help", "description": "print this message", "function": None},
]

@@ -40,6 +47,8 @@
commands_details += f" {padded_cmd}: {c['description']}\n"

usage_text = f"""
{version_string}

usage: {invocation_cmd} COMMAND ARGS

Where, COMMAND is one of: {commands_list}
Expand All @@ -49,6 +58,8 @@ def main(args=None):
""".strip()

help_text = f"""
{version_string}

usage: {invocation_cmd} COMMAND ARGS

{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typical session might resemble:
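To make the new --version entry concrete, here is a condensed, illustrative sketch of the dispatch pattern cli.py uses (simplified from the code above; the hard-coded version value and the fallback usage line are placeholders, not the module's exact behavior):

import sys

__version__ = "0.0.2a4"  # stands in for autogenbench.version.__version__

def main(args=None):
    if args is None:
        args = sys.argv[:]  # shallow copy, as above

    version_string = f"AutoGenBench version {__version__}"
    commands = [
        {"command": "--version", "function": lambda _args: print(version_string)},
        {"command": "--help", "function": None},
    ]

    # Dispatch: find the entry whose name matches the first argument and call its handler.
    name = args[1] if len(args) > 1 else "--help"
    for entry in commands:
        if entry["command"] == name and entry["function"] is not None:
            entry["function"](args[2:])
            return
    print(f"{version_string}\n\nusage: autogenbench COMMAND ARGS")

With this in place, running autogenbench --version should print the version string and exit.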
120 changes: 102 additions & 18 deletions samples/tools/autogenbench/autogenbench/run_cmd.py
@@ -11,6 +11,7 @@
import random
from autogen import config_list_from_json
from autogen.oai.openai_utils import filter_config
from .version import __version__

# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
@@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
Returns: A dictionary of keys and values that need to be added to the system environment.
"""
env = dict()

# Populate with commonly needed keys
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key

bing_api_key = os.environ.get("BING_API_KEY")
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
env["BING_API_KEY"] = bing_api_key

# Update with any values from the ENV.json file
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env = json.loads(fh.read())
env.update(json.loads(fh.read()))

# Include the config_list that we are using
config_list_json = json.dumps(config_list)
env["OAI_CONFIG_LIST"] = config_list_json

openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key

return env
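The reordering above changes the precedence: OS environment keys are read first, any ENV.json entries are merged on top of them (rather than replacing the whole dictionary, as the old env = json.loads(...) line did), and the serialized config_list is added last. A minimal sketch of that merge order, using a placeholder config_list:

import json
import os

env = {}
for key in ("OPENAI_API_KEY", "BING_API_KEY"):
    value = os.environ.get(key)
    if value is not None and len(value.strip()) > 0:
        env[key] = value

if os.path.isfile("ENV.json"):  # per-scenario overrides, if the file exists
    with open("ENV.json", "rt") as fh:
        env.update(json.loads(fh.read()))  # update() keeps unrelated OS keys

env["OAI_CONFIG_LIST"] = json.dumps([{"model": "gpt-4"}])  # placeholder config_list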


@@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Native"
echo "autogenbench version: {__version__}" > timestamp.txt

# Create and activate the virtual environment
# This is called in a subprocess, and will not impact the parent
{sys.executable} -m venv .autogenbench_venv
. .autogenbench_venv/bin/activate

# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -298,6 +313,7 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
fi

# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
@@ -312,6 +328,10 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
rm -Rf .cache
fi

if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi

# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -322,6 +342,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
. ./global_finalize.sh
fi

# We don't need to deactivate the venv because it's
# contained in the subprocess; but we should clean it up
if [ -d .autogenbench_venv ] ; then
rm -Rf .autogenbench_venv
fi

echo RUN.SH COMPLETE !#!#
"""
)
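The generated run.sh now records the autogenbench version, builds a throwaway virtual environment, installs requirements.txt, and removes the venv and __pycache__ afterwards. For context, a simplified sketch of how such a generated script might be written and executed from Python (the helper name and call are illustrative, not the module's actual code):

import os
import subprocess

def execute_run_script(work_dir, env, script_text):
    """Write the generated run.sh into work_dir and run it with the scenario environment."""
    with open(os.path.join(work_dir, "run.sh"), "wt") as fh:
        fh.write(script_text)
    # The venv created inside run.sh lives and dies within this child process,
    # so nothing needs to be deactivated or cleaned up from the parent.
    return subprocess.run(["sh", "run.sh"], cwd=work_dir, env={**os.environ, **env})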
@@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Docker"

umask 000
echo "autogenbench version: {__version__}" > timestamp.txt

# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -415,6 +443,10 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
rm -Rf .cache
fi

if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi

# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -429,18 +461,31 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
"""
)

print("\n\n" + work_dir + "\n===================================================================")
# Figure out what folders to mount
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}

# Add the autogen repo if we can find it
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
if autogen_repo_base is None:
autogen_repo_base = find_autogen_repo(os.getcwd())
elif not os.path.isdir(autogen_repo_base):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)

if autogen_repo_base is not None:
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}

print("Mounting:")
for k in volumes:
bind = volumes[k]["bind"]
mode = volumes[k]["mode"].upper()
if bind == "/workspace":
k = os.path.relpath(k)
print(f"[{mode}]\t'{k}' => '{bind}'")
print("===================================================================")

# Create and run the container
abs_path = str(pathlib.Path(work_dir).absolute())
container = client.containers.run(
image,
command=["sh", "run.sh"],
working_dir="/workspace",
environment=env,
detach=True,
# get absolute path to the working directory
volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
)

# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
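With the new mounting logic, the scenario working directory is always bound to /workspace, and a local autogen checkout (taken from AUTOGENBENCH_REPO_BASE, or discovered via find_autogen_repo below) is additionally bound to /autogen. A hypothetical example of the resulting volumes mapping, with illustrative paths:

import pathlib

work_dir = "Results/human_eval_two_agents/HumanEval_0/0"  # illustrative path
autogen_repo_base = "/home/user/src/autogen"               # e.g. the value of AUTOGENBENCH_REPO_BASE

volumes = {
    str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"},
    str(pathlib.Path(autogen_repo_base).absolute()): {"bind": "/autogen", "mode": "rw"},
}
# docker-py accepts this mapping directly, as in the containers.run(...) call above.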
@@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
sys.stdout.write(segment["stream"])


def find_autogen_repo(path):
"""
Utility for identifying if the path is a subdirectory of the autogen repo.

Returns: the path to the root of the autogen repo if one is found, otherwise None
"""

# Normalize the path (we expect a directory)
path = os.path.abspath(path)
if os.path.isfile(path):
path = os.path.dirname(path)

while True:
test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py") # We found autogen
if os.path.isfile(test_path):
return path

# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
if parent_dir == path:
break

# Keep searching
path = parent_dir

return None
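For illustration, a hypothetical caller might use the helper like this:

import os

repo_root = find_autogen_repo(os.getcwd())
if repo_root is not None:
    print(f"Found an autogen checkout at {repo_root}; it will be mounted at /autogen.")
else:
    print("No autogen checkout found above the current directory.")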


def run_cli(args):
invocation_cmd = args[0]
args = args[1:]
@@ -581,12 +654,23 @@ def run_cli(args):
if parsed_args.requirements is not None:
sys.exit("--requirements is not compatible with --native. Exiting.")

choice = input(
'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
sys.stderr.write(
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
)

if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
# Does an environment variable override the prompt?
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
if allow_native is None or allow_native == "":
choice = input(
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
elif allow_native.strip().lower() != "yes":
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
else:
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise

# Parse the subsample
subsample = None
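In practice, the new AUTOGENBENCH_ALLOW_NATIVE variable lets unattended jobs bypass the interactive confirmation: setting it to "yes" (in any letter case) proceeds after a short pause, any other non-empty value exits immediately, and leaving it unset or empty falls back to the interactive prompt, which still requires typing "Yes" (case-insensitive, but in full) to proceed.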
17 changes: 17 additions & 0 deletions samples/tools/autogenbench/autogenbench/template/testbed_utils.py
@@ -6,6 +6,15 @@

AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)

# Try importing the runtime_logging module (only available in some branches)
LOGGING_ENABLED = False
try:
import autogen.runtime_logging

LOGGING_ENABLED = True
except ImportError:
pass


def default_llm_config(config_list, timeout=180):
"""Return a default config list with a given timeout, and with caching disabled.
@@ -57,6 +66,10 @@ def init():
if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
autogen.Completion.start_logging(compact=False)

# Start logging
if LOGGING_ENABLED:
autogen.runtime_logging.start(config={"dbname": "telemetry.db"})


def finalize(agents):
"""Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def messages_to_json(agent):
with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
autogen.Completion.stop_logging()

# Stop logging
if LOGGING_ENABLED:
autogen.runtime_logging.stop()
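Because runtime_logging (when the installed autogen branch provides it) writes to a SQLite file named telemetry.db in the scenario directory, results can be inspected after a run. A small, schema-agnostic sketch for poking at that file; table names are discovered at runtime rather than assumed:

import sqlite3

con = sqlite3.connect("telemetry.db")
tables = [row[0] for row in con.execute("SELECT name FROM sqlite_master WHERE type='table'")]
for table_name in tables:
    count = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]  # names come from sqlite_master above
    print(f"{table_name}: {count} rows")
con.close()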
2 changes: 1 addition & 1 deletion samples/tools/autogenbench/autogenbench/version.py
@@ -1 +1 @@
__version__ = "0.0.1"
__version__ = "0.0.2a4"
5 changes: 5 additions & 0 deletions samples/tools/autogenbench/pyproject.toml
@@ -47,3 +47,8 @@ exclude = ["*.tests*"]

[project.scripts]
autogenbench = "autogenbench.cli:main"

[tool.black]
# https://github.com/psf/black
line-length = 120
exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
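With this section in place, running black from samples/tools/autogenbench (for example, plain black .) should pick up the 120-character line length and the exclusion pattern from pyproject.toml.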
@@ -8,6 +8,7 @@
import sys
import glob
import base64
import re
from huggingface_hub import snapshot_download

SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):

###############################################################################
def main():
templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path

# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():
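The hard-coded template map is replaced by a scan of the Templates directory, with whitespace stripped from the keys. For example (directory names are hypothetical), a Templates folder containing TwoAgents and Two Agents GPT4 would yield:

# templates == {
#     "TwoAgents": ".../Templates/TwoAgents",
#     "TwoAgentsGPT4": ".../Templates/Two Agents GPT4",  # whitespace removed from the key, not the path
# }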
4 changes: 3 additions & 1 deletion samples/tools/autogenbench/scenarios/GAIA/MANIFEST.json
@@ -4,9 +4,11 @@
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
"Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
"Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
}
}
13 changes: 8 additions & 5 deletions samples/tools/autogenbench/scenarios/GAIA/Scripts/init_tasks.py
@@ -6,6 +6,7 @@
import json
import os
import sys
import re
from huggingface_hub import snapshot_download

SCRIPT_PATH = os.path.realpath(__file__)
@@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
"substitutions": {
"scenario.py": {
"__FILE_NAME__": task["file_name"],
"__PROMPT__": task["Question"],
},
"expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
"prompt.txt": {"__PROMPT__": task["Question"]},
},
}
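With this change the question text is substituted into prompt.txt rather than inlined into scenario.py. An illustrative (entirely made-up) record for one GAIA task, following the substitutions shape shown above:

task_entry = {
    "id": "gaia_validation_0",                  # illustrative identifier
    "template": "Templates/BasicTwoAgents",     # illustrative template path
    "substitutions": {
        "scenario.py": {"__FILE_NAME__": "figures.xlsx"},
        "expected_answer.txt": {"__EXPECTED_ANSWER__": "42"},
        "prompt.txt": {"__PROMPT__": "How many rows does the attached spreadsheet contain?"},
    },
}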

@@ -97,10 +98,12 @@ def main():

gaia_test_tasks[data["Level"] - 1].append(data)

templates = {
"two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
"soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path

# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():
@@ -0,0 +1 @@
__PROMPT__
@@ -7,6 +7,10 @@
testbed_utils.init()
##############################

# Read the prompt
PROMPT = ""
with open("prompt.txt", "rt") as fh:
PROMPT = fh.read().strip()

GAIA_SYSTEM_MESSAGE = (
"You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@
)

filename = "__FILE_NAME__".strip()
question = """
__PROMPT__
""".strip()
question = PROMPT

if len(filename) > 0:
question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"