add webarena in samples #2114

Merged · 9 commits · Mar 25, 2024
22 changes: 22 additions & 0 deletions samples/tools/webarena/README.md
@@ -0,0 +1,22 @@
# WebArena Benchmark

This directory contains tooling for running the [WebArena](https://arxiv.org/pdf/2307.13854.pdf) benchmark with AutoGen.

## Installation of WebArena

WebArena can be installed by following the instructions in the [WebArena GitHub repository](https://github.com/web-arena-x/webarena).

When using WebArena together with AutoGen, the two projects require different versions of the `openai` package, so some code changes are needed in WebArena to make it compatible with the `openai` version that AutoGen depends on.
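As a quick check of what is currently installed (an illustration only, not part of the benchmark code, and assuming AutoGen was installed from the `pyautogen` package), you can print both package versions:

```python
# Illustrative only: report the installed versions of the two packages whose
# requirements conflict, so you know what needs reconciling.
from importlib.metadata import version

for pkg in ("openai", "pyautogen"):
    print(pkg, version(pkg))
```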

## Running with AutoGen agents

You can use the `run.py` file in the `webarena` directory to run WebArena with AutoGen. The OpenAI (or Azure OpenAI, or other model) configuration can be set up via `OAI_CONFIG_LIST`. The config list is filtered by whichever model is passed via the `--model` argument.

For example: `python run.py --instruction_path <> --test_start_idx <> --test_end_idx <> --model gpt-4 --result_dir <>`
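For reference, here is a minimal sketch of how the model configuration is consumed; it mirrors the `config_list_from_json` call in `run.py` below, and the JSON contents and API key are placeholders:

```python
import autogen

# OAI_CONFIG_LIST is a JSON file (or environment variable) along the lines of:
# [{"model": "gpt-4", "api_key": "<your-api-key>"}]
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model": ["gpt-4"]},  # keep only the entries matching --model
)
```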

The original `run.py` file has been modified to use AutoGen agents, which are defined in the `webarena_agents.py` file.
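In outline, `run.py` wraps the browser environment in an `EnvironmentAgent` and attaches an `ActionTakingCapability` to a standard `AssistantAgent`. The hypothetical helper below simply condenses the wiring that `test()` in `run.py` performs:

```python
import argparse

import autogen
from webarena_agents import ActionTakingCapability, EnvironmentAgent


def build_and_run_agents(env, config_file, intent, prompt_constructor,
                         config_list, args: argparse.Namespace,
                         early_stop_thresholds: dict) -> None:
    """Hypothetical helper condensing the AutoGen wiring done in run.py's test()."""
    # Agent that owns the WebArena browser environment and executes actions in it.
    env_agent = EnvironmentAgent(
        name="env_agent",
        env=env,
        config_file=config_file,
        result_dir=args.result_dir,
        action_set_tag=args.action_set_tag,
        llm_config=False,
        code_execution_config=False,
    )

    # LLM-backed agent that proposes the next browser action.
    action_agent = autogen.AssistantAgent(
        "action_agent",
        llm_config={"config_list": config_list, "cache_seed": 32},
        system_message="",
    )
    ActionTakingCapability(
        prompt_constructor=prompt_constructor,
        action_set_tag=args.action_set_tag,
        max_steps=args.max_steps,
        early_stop_thresholds=early_stop_thresholds,
    ).add_to_agent(action_agent)

    # The environment agent drives the conversation with the task intent.
    env_agent.initiate_chat(action_agent, message={"content": {"intent": intent}})
```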

## References
**WebArena: A Realistic Web Environment for Building Autonomous Agents**<br/>
Zhou, Shuyan and Xu, Frank F and Zhu, Hao and Zhou, Xuhui and Lo, Robert and Sridhar, Abishek and Cheng, Xianyi and Bisk, Yonatan and Fried, Daniel and Alon, Uri and others<br/>
[https://arxiv.org/pdf/2307.13854.pdf](https://arxiv.org/pdf/2307.13854.pdf)
367 changes: 367 additions & 0 deletions samples/tools/webarena/webarena/run.py
@@ -0,0 +1,367 @@
"""Script to run end-to-end evaluation on the benchmark"""

import argparse
import glob
import json
import logging
import os
import random
import subprocess
import tempfile
import time
from pathlib import Path

import openai
import autogen
from webarena_agents import ActionTakingCapability, EnvironmentAgent

from agent import (
    Agent,
    PromptAgent,
    TeacherForcingAgent,
    construct_agent,
)

from agent.prompts import *
from browser_env import (
    ScriptBrowserEnv,
)
from browser_env.auto_login import get_site_comb_from_filepath
from browser_env.helper_functions import (
    RenderHelper,
)
from evaluation_harness import evaluator_router

LOG_FOLDER = "log_files"
Path(LOG_FOLDER).mkdir(parents=True, exist_ok=True)
LOG_FILE_NAME = f"{LOG_FOLDER}/log_{time.strftime('%Y%m%d%H%M%S', time.localtime())}_{random.randint(0, 10000)}.log"

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logger.addHandler(console_handler)

file_handler = logging.FileHandler(LOG_FILE_NAME)
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)

# Set the log format
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)


def config() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run end-to-end evaluation on the benchmark"
    )
    parser.add_argument(
        "--render", action="store_true", help="Render the browser"
    )
    parser.add_argument(
        "--slow_mo",
        type=int,
        default=0,
        help="Slow down the browser by the specified amount",
    )
    parser.add_argument(
        "--action_set_tag", default="id_accessibility_tree", help="Action type"
    )
    parser.add_argument(
        "--observation_type",
        choices=["accessibility_tree", "html", "image"],
        default="accessibility_tree",
        help="Observation type",
    )
    parser.add_argument(
        "--current_viewport_only",
        action="store_true",
        help="Only use the current viewport for the observation",
    )
    parser.add_argument("--viewport_width", type=int, default=1280)
    parser.add_argument("--viewport_height", type=int, default=720)
    parser.add_argument("--save_trace_enabled", action="store_true")
    parser.add_argument("--sleep_after_execution", type=float, default=0.0)

    parser.add_argument("--max_steps", type=int, default=30)

    # agent config
    parser.add_argument("--agent_type", type=str, default="prompt")
    parser.add_argument(
        "--instruction_path",
        type=str,
        default="agents/prompts/state_action_agent.json",
    )
    parser.add_argument(
        "--parsing_failure_th",
        help="When consecutive parsing failures exceed this threshold, the agent will stop",
        type=int,
        default=3,
    )
    parser.add_argument(
        "--repeating_action_failure_th",
        help="When consecutive repeated actions exceed this threshold, the agent will stop",
        type=int,
        default=3,
    )

    # lm config
    parser.add_argument("--provider", type=str, default="openai")
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo-0613")
    parser.add_argument("--mode", type=str, default="chat")
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--context_length", type=int, default=0)
    parser.add_argument("--max_tokens", type=int, default=384)
    parser.add_argument("--stop_token", type=str, default=None)
    parser.add_argument(
        "--max_retry",
        type=int,
        help="max retry times to perform generations when parsing fails",
        default=1,
    )
    parser.add_argument(
        "--max_obs_length",
        type=int,
        help="when not zero, will truncate the observation to this length before feeding to the model",
        default=1920,
    )
    parser.add_argument(
        "--model_endpoint",
        help="huggingface model endpoint",
        type=str,
        default="",
    )

    # example config
    parser.add_argument("--test_start_idx", type=int, default=0)
    parser.add_argument("--test_end_idx", type=int, default=1000)

    # logging related
    parser.add_argument("--result_dir", type=str, default="")
    args = parser.parse_args()

    # check whether the action space is compatible with the observation space
    if (
        args.action_set_tag == "id_accessibility_tree"
        and args.observation_type != "accessibility_tree"
    ):
        raise ValueError(
            f"Action type {args.action_set_tag} is incompatible with the observation type {args.observation_type}"
        )

    return args




def test(
    args: argparse.Namespace,
    agent: Agent | PromptAgent | TeacherForcingAgent,
    config_file_list: list[str],
) -> None:
    scores = []
    max_steps = args.max_steps

    early_stop_thresholds = {
        "parsing_failure": args.parsing_failure_th,
        "repeating_action": args.repeating_action_failure_th,
    }

    env = ScriptBrowserEnv(
        headless=not args.render,
        slow_mo=args.slow_mo,
        observation_type=args.observation_type,
        current_viewport_only=args.current_viewport_only,
        viewport_size={
            "width": args.viewport_width,
            "height": args.viewport_height,
        },
        save_trace_enabled=args.save_trace_enabled,
        sleep_after_execution=args.sleep_after_execution,
    )

    for config_file in config_file_list:
        try:
            render_helper = RenderHelper(
                config_file, args.result_dir, args.action_set_tag
            )

            # get intent
            with open(config_file) as f:
                _c = json.load(f)
                intent = _c["intent"]
                task_id = _c["task_id"]
                # automatically login
                if _c["storage_state"]:
                    cookie_file_name = os.path.basename(_c["storage_state"])
                    comb = get_site_comb_from_filepath(cookie_file_name)
                    temp_dir = tempfile.mkdtemp()
                    # subprocess to renew the cookie
                    subprocess.run(
                        [
                            "python",
                            "browser_env/auto_login.py",
                            "--auth_folder",
                            temp_dir,
                            "--site_list",
                            *comb,
                        ]
                    )
                    _c["storage_state"] = f"{temp_dir}/{cookie_file_name}"
                    assert os.path.exists(_c["storage_state"])
                    # update the config file
                    config_file = f"{temp_dir}/{os.path.basename(config_file)}"
                    with open(config_file, "w") as f:
                        json.dump(_c, f)

            logger.info(f"[Config file]: {config_file}")
            logger.info(f"[Intent]: {intent}")

            agent.reset(config_file)

            env_agent = EnvironmentAgent(
                name="env_agent",
                env=env,
                config_file=config_file,
                result_dir=args.result_dir,
                action_set_tag=args.action_set_tag,
                llm_config=False,
                code_execution_config=False,
            )

            config_list = autogen.config_list_from_json(
                "OAI_CONFIG_LIST",
                filter_dict={"model": [args.model]},
            )

            action_agent = autogen.AssistantAgent(
                "action_agent",
                llm_config={"config_list": config_list, "cache_seed": 32},
                system_message="",
            )

            action_taking_capability = ActionTakingCapability(
                prompt_constructor=agent.prompt_constructor,
                action_set_tag=args.action_set_tag,
                max_steps=max_steps,
                early_stop_thresholds=early_stop_thresholds,
            )

            action_taking_capability.add_to_agent(action_agent)

            env_agent.initiate_chat(
                action_agent,
                message={
                    "content": {"intent": intent},
                },
            )

            evaluator = evaluator_router(config_file)
            score = evaluator(
                trajectory=env_agent.trajectory,
                config_file=config_file,
                page=env.page,
                client=env.get_page_client(env.page),
            )

            scores.append(score)

            if score == 1:
                logger.info(f"[Result] (PASS) {config_file}")
            else:
                logger.info(f"[Result] (FAIL) {config_file}")

            if args.save_trace_enabled:
                env.save_trace(Path(args.result_dir) / "traces" / f"{task_id}.zip")

        except openai.OpenAIError as e:
            logger.info(f"[OpenAI Error] {repr(e)}")
        except Exception as e:
            logger.info(f"[Unhandled Error] {repr(e)}")
            import traceback

            # write to error file
            with open(Path(args.result_dir) / "error.txt", "a") as f:
                f.write(f"[Config file]: {config_file}\n")
                f.write(f"[Unhandled Error] {repr(e)}\n")
                f.write(traceback.format_exc())  # write stack trace to file

        render_helper.close()

    env.close()
    if len(scores) > 0:
        logger.info(f"Average score: {sum(scores) / len(scores)}")


def prepare(args: argparse.Namespace) -> None:
    # convert prompt python files to json
    from agent.prompts import to_json

    to_json.run()

    # prepare result dir
    result_dir = args.result_dir
    if not result_dir:
        result_dir = f"cache/results_{time.strftime('%Y%m%d%H%M%S', time.localtime())}"
    if not Path(result_dir).exists():
        Path(result_dir).mkdir(parents=True, exist_ok=True)
        args.result_dir = result_dir
        logger.info(f"Create result dir: {result_dir}")

    if not (Path(result_dir) / "traces").exists():
        (Path(result_dir) / "traces").mkdir(parents=True)

    # log the log file
    with open(os.path.join(result_dir, "log_files.txt"), "a+") as f:
        f.write(f"{LOG_FILE_NAME}\n")


def get_unfinished(config_files: list[str], result_dir: str) -> list[str]:
    result_files = glob.glob(f"{result_dir}/*.html")
    task_ids = [os.path.basename(f).split(".")[0].split("_")[1] for f in result_files]
    unfinished_configs = []
    for config_file in config_files:
        task_id = os.path.basename(config_file).split(".")[0]
        if task_id not in task_ids:
            unfinished_configs.append(config_file)
    return unfinished_configs


def dump_config(args: argparse.Namespace) -> None:
    config_file = Path(args.result_dir) / "config.json"
    if not config_file.exists():
        with open(config_file, "w") as f:
            json.dump(vars(args), f, indent=4)
            logger.info(f"Dump config to {config_file}")


if __name__ == "__main__":
    args = config()
    args.sleep_after_execution = 2.0
    prepare(args)

    test_file_list = []
    st_idx = args.test_start_idx
    ed_idx = args.test_end_idx
    for i in range(st_idx, ed_idx):
        test_file_list.append(f"config_files/{i}.json")
    if "debug" not in args.result_dir:
        test_file_list = get_unfinished(test_file_list, args.result_dir)

    if len(test_file_list) == 0:
        logger.info("No task left to run")
    else:
        print(f"Total {len(test_file_list)} tasks left")
        args.render = False
        args.render_screenshot = True
        args.save_trace_enabled = True

        args.current_viewport_only = True
        dump_config(args)

        agent = construct_agent(args)
        test(args, agent, test_file_list)