Commit af9b300

olgavrou, gagb, and ekzhu authored
add webarena in samples (#2114)
* add webarena in samples/tools

* Update samples/tools/webarena/README.md

Co-authored-by: gagb <[email protected]>

* Update samples/tools/webarena/README.md

Co-authored-by: gagb <[email protected]>

* Update samples/tools/webarena/README.md

Co-authored-by: gagb <[email protected]>

* update installation instructions

* black formatting

* Update README.md

---------

Co-authored-by: gagb <[email protected]>
Co-authored-by: Eric Zhu <[email protected]>
1 parent 2bfa181 commit af9b300

File tree

3 files changed: +591 -0 lines changed

samples/tools/webarena/README.md

+32
@@ -0,0 +1,32 @@
# WebArena Benchmark

This directory helps run AutoGen agents on the [WebArena](https://arxiv.org/pdf/2307.13854.pdf) benchmark.

## Installing WebArena

WebArena can be installed by following the instructions in [WebArena's GitHub repository](https://github.com/web-arena-x/webarena).
When using WebArena with AutoGen, there is a clash between the `openai` package versions the two require, and some code changes are needed in WebArena to make it compatible with AutoGen's `openai` version:

- WebArena's `openai` version is `openai==0.27.0`
- AutoGen's `openai` version is `openai>=1.3`

Prior to installation, every occurrence of `openai.error` in the WebArena codebase needs to be replaced with `openai`, as sketched below.
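As a rough illustration, this substitution can be scripted. The following is a minimal sketch, assuming the WebArena repository has been cloned into a local `webarena/` directory (the path is an assumption, not part of this sample):

```python
from pathlib import Path

# Rewrite references to the `openai.error` module (removed in openai>=1.0)
# to plain `openai`. Assumes WebArena was cloned into ./webarena.
for py_file in Path("webarena").rglob("*.py"):
    text = py_file.read_text()
    if "openai.error" in text:
        py_file.write_text(text.replace("openai.error", "openai"))
```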
## Running with AutoGen agents

You can use the `run.py` file in the `webarena` directory to run WebArena with AutoGen. The OpenAI (or AzureOpenAI, or other model) configuration can be set up via `OAI_CONFIG_LIST`. The config list is filtered by whatever model is passed in the `--model` argument.
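For instance, `OAI_CONFIG_LIST` can be a JSON file (or an environment variable holding JSON) that lists the available model configurations. The snippet below is a minimal sketch of how `run.py` loads and filters it; the model name and API key are placeholders:

```python
import autogen

# OAI_CONFIG_LIST might contain, e.g.: [{"model": "gpt-4", "api_key": "sk-..."}]
# The filter keeps only entries whose model matches the --model argument.
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model": ["gpt-4"]},
)
```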
For example, to run `run.py`:
```
mkdir myresultdir
python run.py --instruction_path agent/prompts/jsons/p_cot_id_actree_2s.json --test_start_idx 27 --test_end_idx 28 --model gpt-4 --result_dir myresultdir
```
The original `run.py` file has been modified to use AutoGen agents, which are defined in the `webarena_agents.py` file.
## References

**WebArena: A Realistic Web Environment for Building Autonomous Agents**<br/>
Zhou, Shuyan and Xu, Frank F and Zhu, Hao and Zhou, Xuhui and Lo, Robert and Sridhar, Abishek and Cheng, Xianyi and Bisk, Yonatan and Fried, Daniel and Alon, Uri and others<br/>
[https://arxiv.org/pdf/2307.13854.pdf](https://arxiv.org/pdf/2307.13854.pdf)
samples/tools/webarena/run.py

+354
@@ -0,0 +1,354 @@
"""Script to run end-to-end evaluation on the benchmark"""

import argparse
import glob
import json
import logging
import os
import random
import subprocess
import tempfile
import time
import traceback
from pathlib import Path

import openai
import autogen
from webarena_agents import ActionTakingCapability, EnvironmentAgent

from agent import (
    Agent,
    PromptAgent,
    TeacherForcingAgent,
    construct_agent,
)
from agent.prompts import *
from browser_env import (
    ScriptBrowserEnv,
)
from browser_env.auto_login import get_site_comb_from_filepath
from browser_env.helper_functions import (
    RenderHelper,
)
from evaluation_harness import evaluator_router

LOG_FOLDER = "log_files"
Path(LOG_FOLDER).mkdir(parents=True, exist_ok=True)
LOG_FILE_NAME = f"{LOG_FOLDER}/log_{time.strftime('%Y%m%d%H%M%S', time.localtime())}_{random.randint(0, 10000)}.log"

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logger.addHandler(console_handler)

file_handler = logging.FileHandler(LOG_FILE_NAME)
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)

# Set the log format
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

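# Parse command-line configuration for the browser environment, the agent, and the model.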
def config() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")
    parser.add_argument("--render", action="store_true", help="Render the browser")
    parser.add_argument(
        "--slow_mo",
        type=int,
        default=0,
        help="Slow down the browser by the specified amount",
    )
    parser.add_argument("--action_set_tag", default="id_accessibility_tree", help="Action type")
    parser.add_argument(
        "--observation_type",
        choices=["accessibility_tree", "html", "image"],
        default="accessibility_tree",
        help="Observation type",
    )
    parser.add_argument(
        "--current_viewport_only",
        action="store_true",
        help="Only use the current viewport for the observation",
    )
    parser.add_argument("--viewport_width", type=int, default=1280)
    parser.add_argument("--viewport_height", type=int, default=720)
    parser.add_argument("--save_trace_enabled", action="store_true")
    parser.add_argument("--sleep_after_execution", type=float, default=0.0)

    parser.add_argument("--max_steps", type=int, default=30)

    # agent config
    parser.add_argument("--agent_type", type=str, default="prompt")
    parser.add_argument(
        "--instruction_path",
        type=str,
        default="agents/prompts/state_action_agent.json",
    )
    parser.add_argument(
        "--parsing_failure_th",
        help="When consecutive parsing failures exceed this threshold, the agent will stop",
        type=int,
        default=3,
    )
    parser.add_argument(
        "--repeating_action_failure_th",
        help="When consecutive repeated actions exceed this threshold, the agent will stop",
        type=int,
        default=3,
    )

    # lm config
    parser.add_argument("--provider", type=str, default="openai")
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo-0613")
    parser.add_argument("--mode", type=str, default="chat")
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--context_length", type=int, default=0)
    parser.add_argument("--max_tokens", type=int, default=384)
    parser.add_argument("--stop_token", type=str, default=None)
    parser.add_argument(
        "--max_retry",
        type=int,
        help="max retry times to perform generations when parsing fails",
        default=1,
    )
    parser.add_argument(
        "--max_obs_length",
        type=int,
        help="when not zero, will truncate the observation to this length before feeding to the model",
        default=1920,
    )
    parser.add_argument(
        "--model_endpoint",
        help="huggingface model endpoint",
        type=str,
        default="",
    )

    # example config
    parser.add_argument("--test_start_idx", type=int, default=0)
    parser.add_argument("--test_end_idx", type=int, default=1000)

    # logging related
    parser.add_argument("--result_dir", type=str, default="")
    args = parser.parse_args()

    # check whether the action space is compatible with the observation space
    if args.action_set_tag == "id_accessibility_tree" and args.observation_type != "accessibility_tree":
        raise ValueError(
            f"Action type {args.action_set_tag} is incompatible with the observation type {args.observation_type}"
        )

    return args

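# Evaluation loop: for each task config, pair an EnvironmentAgent (wrapping the
# browser environment) with an AssistantAgent whose ActionTakingCapability
# proposes browser actions, then score the resulting trajectory.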
def test(
    args: argparse.Namespace,
    agent: Agent | PromptAgent | TeacherForcingAgent,
    config_file_list: list[str],
) -> None:
    scores = []
    max_steps = args.max_steps

    early_stop_thresholds = {
        "parsing_failure": args.parsing_failure_th,
        "repeating_action": args.repeating_action_failure_th,
    }

    env = ScriptBrowserEnv(
        headless=not args.render,
        slow_mo=args.slow_mo,
        observation_type=args.observation_type,
        current_viewport_only=args.current_viewport_only,
        viewport_size={
            "width": args.viewport_width,
            "height": args.viewport_height,
        },
        save_trace_enabled=args.save_trace_enabled,
        sleep_after_execution=args.sleep_after_execution,
    )

    for config_file in config_file_list:
        try:
            render_helper = RenderHelper(config_file, args.result_dir, args.action_set_tag)

            # get intent
            with open(config_file) as f:
                _c = json.load(f)
                intent = _c["intent"]
                task_id = _c["task_id"]
                # automatically login
                if _c["storage_state"]:
                    cookie_file_name = os.path.basename(_c["storage_state"])
                    comb = get_site_comb_from_filepath(cookie_file_name)
                    temp_dir = tempfile.mkdtemp()
                    # subprocess to renew the cookie
                    subprocess.run(
                        [
                            "python",
                            "browser_env/auto_login.py",
                            "--auth_folder",
                            temp_dir,
                            "--site_list",
                            *comb,
                        ]
                    )
                    _c["storage_state"] = f"{temp_dir}/{cookie_file_name}"
                    assert os.path.exists(_c["storage_state"])
                    # update the config file
                    config_file = f"{temp_dir}/{os.path.basename(config_file)}"
                    with open(config_file, "w") as f:
                        json.dump(_c, f)

            logger.info(f"[Config file]: {config_file}")
            logger.info(f"[Intent]: {intent}")

            agent.reset(config_file)

            env_agent = EnvironmentAgent(
                name="env_agent",
                env=env,
                config_file=config_file,
                result_dir=args.result_dir,
                action_set_tag=args.action_set_tag,
                llm_config=False,
                code_execution_config=False,
            )

            config_list = autogen.config_list_from_json(
                "OAI_CONFIG_LIST",
                filter_dict={"model": [args.model]},
            )

            action_agent = autogen.AssistantAgent(
                "action_agent",
                llm_config={"config_list": config_list, "cache_seed": 32},
                system_message="",
            )

            action_taking_capability = ActionTakingCapability(
                prompt_constructor=agent.prompt_constructor,
                action_set_tag=args.action_set_tag,
                max_steps=max_steps,
                early_stop_thresholds=early_stop_thresholds,
            )

            action_taking_capability.add_to_agent(action_agent)

            env_agent.initiate_chat(
                action_agent,
                message={
                    "content": {"intent": intent},
                },
            )

            evaluator = evaluator_router(config_file)
            score = evaluator(
                trajectory=env_agent.trajectory,
                config_file=config_file,
                page=env.page,
                client=env.get_page_client(env.page),
            )

            scores.append(score)

            if score == 1:
                logger.info(f"[Result] (PASS) {config_file}")
            else:
                logger.info(f"[Result] (FAIL) {config_file}")

            if args.save_trace_enabled:
                env.save_trace(Path(args.result_dir) / "traces" / f"{task_id}.zip")

        except openai.OpenAIError as e:
            logger.info(f"[OpenAI Error] {repr(e)}")
        except Exception as e:
            logger.info(f"[Unhandled Error] {repr(e)}")

            # write to error file
            with open(Path(args.result_dir) / "error.txt", "a") as f:
                f.write(f"[Config file]: {config_file}\n")
                f.write(f"[Unhandled Error] {repr(e)}\n")
                f.write(traceback.format_exc())  # write stack trace to file

        render_helper.close()

    env.close()
    if len(scores) > 0:
        logger.info(f"Average score: {sum(scores) / len(scores)}")

def prepare(args: argparse.Namespace) -> None:
    # convert prompt python files to json
    from agent.prompts import to_json

    to_json.run()

    # prepare result dir
    result_dir = args.result_dir
    if not result_dir:
        result_dir = f"cache/results_{time.strftime('%Y%m%d%H%M%S', time.localtime())}"
    if not Path(result_dir).exists():
        Path(result_dir).mkdir(parents=True, exist_ok=True)
        args.result_dir = result_dir
        logger.info(f"Create result dir: {result_dir}")

    if not (Path(result_dir) / "traces").exists():
        (Path(result_dir) / "traces").mkdir(parents=True)

    # log the log file
    with open(os.path.join(result_dir, "log_files.txt"), "a+") as f:
        f.write(f"{LOG_FILE_NAME}\n")

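# Resume support: skip any task whose rendered *_<task_id>.html result already
# exists in the result directory.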
def get_unfinished(config_files: list[str], result_dir: str) -> list[str]:
    result_files = glob.glob(f"{result_dir}/*.html")
    task_ids = [os.path.basename(f).split(".")[0].split("_")[1] for f in result_files]
    unfinished_configs = []
    for config_file in config_files:
        task_id = os.path.basename(config_file).split(".")[0]
        if task_id not in task_ids:
            unfinished_configs.append(config_file)
    return unfinished_configs

def dump_config(args: argparse.Namespace) -> None:
    config_file = Path(args.result_dir) / "config.json"
    if not config_file.exists():
        with open(config_file, "w") as f:
            json.dump(vars(args), f, indent=4)
            logger.info(f"Dump config to {config_file}")

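# Entry point: build the list of task config files from the test index range,
# drop tasks that already have results, and run the evaluation.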
if __name__ == "__main__":
    args = config()
    args.sleep_after_execution = 2.0
    prepare(args)

    test_file_list = []
    st_idx = args.test_start_idx
    ed_idx = args.test_end_idx
    for i in range(st_idx, ed_idx):
        test_file_list.append(f"config_files/{i}.json")
    if "debug" not in args.result_dir:
        test_file_list = get_unfinished(test_file_list, args.result_dir)

    if len(test_file_list) == 0:
        logger.info("No task left to run")
    else:
        print(f"Total {len(test_file_list)} tasks left")
        args.render = False
        args.render_screenshot = True
        args.save_trace_enabled = True

        args.current_viewport_only = True
        dump_config(args)

        agent = construct_agent(args)
        test(args, agent, test_file_list)
