diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9ce3602..79be870 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,7 @@ jobs: mypy --version # Run this mypy instance against our main package. mypy --install-types --non-interactive . - mypy --strict . + mypy --strict . --exclude scripts - name: Enviroment prepare run: | bash prepare.sh diff --git a/.gitignore b/.gitignore index 1da6709..54703d6 100644 --- a/.gitignore +++ b/.gitignore @@ -141,18 +141,19 @@ run.sh # trajectory visualization render_cache/* +cache/* # TMP IGNORE agent/prompts/jsons/* log_files/ -config_files/*0.json -config_files/*1.json -config_files/*2.json -config_files/*3.json -config_files/*4.json -config_files/*5.json -config_files/*6.json -config_files/*7.json -config_files/*8.json -config_files/*9.json -config_files/test.json +config_files*/*0.json +config_files*/*1.json +config_files*/*2.json +config_files*/*3.json +config_files*/*4.json +config_files*/*5.json +config_files*/*6.json +config_files*/*7.json +config_files*/*8.json +config_files*/*9.json +config_files*/test.json diff --git a/agent/agent.py b/agent/agent.py index 240ce0b..923ebce 100644 --- a/agent/agent.py +++ b/agent/agent.py @@ -15,11 +15,14 @@ create_playwright_action, ) from browser_env.utils import Observation, StateInfo -from llms import lm_config -from llms.providers.openai_utils import ( +from llms import ( + call_llm, + generate_from_huggingface_completion, generate_from_openai_chat_completion, generate_from_openai_completion, + lm_config, ) +from llms.tokenizers import Tokenizer class Agent: @@ -120,49 +123,33 @@ def next_action( trajectory, intent, meta_data ) lm_config = self.lm_config - if lm_config.provider == "openai": - if lm_config.mode == "chat": - response = generate_from_openai_chat_completion( - messages=prompt, - model=lm_config.model, - temperature=lm_config.gen_config["temperature"], - top_p=lm_config.gen_config["top_p"], - 
context_length=lm_config.gen_config["context_length"], - max_tokens=lm_config.gen_config["max_tokens"], - stop_token=None, - ) - elif lm_config.mode == "completion": - response = generate_from_openai_completion( - prompt=prompt, - engine=lm_config.model, - temperature=lm_config.gen_config["temperature"], - max_tokens=lm_config.gen_config["max_tokens"], - top_p=lm_config.gen_config["top_p"], - stop_token=lm_config.gen_config["stop_token"], - ) - else: - raise ValueError( - f"OpenAI models do not support mode {lm_config.mode}" + n = 0 + while True: + response = call_llm(lm_config, prompt) + force_prefix = self.prompt_constructor.instruction[ + "meta_data" + ].get("force_prefix", "") + response = f"{force_prefix}{response}" + n += 1 + try: + parsed_response = self.prompt_constructor.extract_action( + response ) - else: - raise NotImplementedError( - f"Provider {lm_config.provider} not implemented" - ) - - try: - parsed_response = self.prompt_constructor.extract_action(response) - if self.action_set_tag == "id_accessibility_tree": - action = create_id_based_action(parsed_response) - elif self.action_set_tag == "playwright": - action = create_playwright_action(parsed_response) - else: - raise ValueError(f"Unknown action type {self.action_set_tag}") - - action["raw_prediction"] = response - - except ActionParsingError as e: - action = create_none_action() - action["raw_prediction"] = response + if self.action_set_tag == "id_accessibility_tree": + action = create_id_based_action(parsed_response) + elif self.action_set_tag == "playwright": + action = create_playwright_action(parsed_response) + else: + raise ValueError( + f"Unknown action type {self.action_set_tag}" + ) + action["raw_prediction"] = response + break + except ActionParsingError as e: + if n >= lm_config.gen_config["max_retry"]: + action = create_none_action() + action["raw_prediction"] = response + break return action @@ -170,24 +157,8 @@ def reset(self, test_config_file: str) -> None: pass -def 
construct_llm_config(args: argparse.Namespace) -> lm_config.LMConfig: - llm_config = lm_config.LMConfig( - provider=args.provider, model=args.model, mode=args.mode - ) - if args.provider == "openai": - llm_config.gen_config["temperature"] = args.temperature - llm_config.gen_config["top_p"] = args.top_p - llm_config.gen_config["context_length"] = args.context_length - llm_config.gen_config["max_tokens"] = args.max_tokens - llm_config.gen_config["stop_token"] = args.stop_token - llm_config.gen_config["max_obs_length"] = args.max_obs_length - else: - raise NotImplementedError(f"provider {args.provider} not implemented") - return llm_config - - def construct_agent(args: argparse.Namespace) -> Agent: - llm_config = construct_llm_config(args) + llm_config = lm_config.construct_llm_config(args) agent: Agent if args.agent_type == "teacher_forcing": @@ -195,7 +166,7 @@ def construct_agent(args: argparse.Namespace) -> Agent: elif args.agent_type == "prompt": with open(args.instruction_path) as f: constructor_type = json.load(f)["meta_data"]["prompt_constructor"] - tokenizer = tiktoken.encoding_for_model(llm_config.model) + tokenizer = Tokenizer(args.provider, args.model) prompt_constructor = eval(constructor_type)( args.instruction_path, lm_config=llm_config, tokenizer=tokenizer ) diff --git a/agent/prompts/prompt_constructor.py b/agent/prompts/prompt_constructor.py index 6e2d3cb..a0ca408 100644 --- a/agent/prompts/prompt_constructor.py +++ b/agent/prompts/prompt_constructor.py @@ -3,14 +3,12 @@ from pathlib import Path from typing import Any, TypedDict -import tiktoken - from browser_env import Action, ActionParsingError, Trajectory from browser_env.env_config import URL_MAPPINGS from browser_env.utils import StateInfo from llms import lm_config - -APIInput = str | list[Any] | dict[str, Any] +from llms.tokenizers import Tokenizer +from llms.utils import APIInput class Instruction(TypedDict): @@ -27,12 +25,12 @@ def __init__( self, instruction_path: str | Path, lm_config: 
lm_config.LMConfig, - tokenizer: tiktoken.core.Encoding, + tokenizer: Tokenizer, ): - self.instrction_path = Path(instruction_path) + self.instruction_path = Path(instruction_path) self.obs_modality = "text" self.lm_config = lm_config - instruction = json.load(open(self.instrction_path)) + instruction = json.load(open(self.instruction_path)) instruction["examples"] = [tuple(e) for e in instruction["examples"]] self.instruction: Instruction = instruction self.tokenizer = tokenizer @@ -77,6 +75,37 @@ def get_lm_api_input( raise ValueError( f"OpenAI models do not support mode {self.lm_config.mode}" ) + elif "huggingface" in self.lm_config.provider: + # https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + # https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L320 + if "Llama-2" in self.lm_config.model: + if self.lm_config.mode == "chat": + B_INST, E_INST = "[INST]", "[/INST]" + B_SYS, E_SYS = "<>\n", "\n<>\n\n" + BOS, EOS = "", "" + # adding the system message to be the starting of the first example + examples = [ + ( + B_SYS + intro + E_SYS + examples[0][0], + examples[0][1], + ) + ] + examples[1:] + message = "".join( + [ + f"{BOS}{B_INST} {x.strip()} {E_INST} {y.strip()} {EOS}" + for (x, y) in examples + ] + ) + # add the current observation + message += f"{BOS}{B_INST} {current.strip()} {E_INST} {self.instruction['meta_data'].get('force_prefix', '')}" + + return message + else: + raise ValueError("Only chat mode is supported for Llama-2") + else: + raise ValueError( + f"Huggingface models do not support model_tag {self.lm_config.gen_config['model_tag']}" + ) else: raise NotImplementedError( f"Provider {self.lm_config.provider} not implemented" @@ -102,6 +131,9 @@ def map_url_to_local(self, url: str) -> str: for i, j in URL_MAPPINGS.items(): if j in url: url = url.replace(j, i) + # https + if j.replace("http", "https") in url: + url = url.replace(j.replace("http", "https"), i) return url def _extract_action(self, response: str) -> str: @@ 
-120,7 +152,7 @@ def __init__( self, instruction_path: str | Path, lm_config: lm_config.LMConfig, - tokenizer: tiktoken.core.Encoding, + tokenizer: Tokenizer, ): super().__init__(instruction_path, lm_config, tokenizer) @@ -161,10 +193,10 @@ def construct( def _extract_action(self, response: str) -> str: action_splitter = self.instruction["meta_data"]["action_splitter"] - pattern = rf"{action_splitter}(.*?){action_splitter}" + pattern = rf"{action_splitter}((.|\n)*?){action_splitter}" match = re.search(pattern, response) if match: - return match.group(1) + return match.group(1).strip() else: raise ActionParsingError( f"Cannot parse action from response {response}" @@ -178,7 +210,7 @@ def __init__( self, instruction_path: str | Path, lm_config: lm_config.LMConfig, - tokenizer: tiktoken.core.Encoding, + tokenizer: Tokenizer, ): super().__init__(instruction_path, lm_config, tokenizer) self.answer_phrase = self.instruction["meta_data"]["answer_phrase"] @@ -218,10 +250,10 @@ def construct( def _extract_action(self, response: str) -> str: # find the first occurence of action action_splitter = self.instruction["meta_data"]["action_splitter"] - pattern = rf"{action_splitter}(.*?){action_splitter}" + pattern = rf"{action_splitter}((.|\n)*?){action_splitter}" match = re.search(pattern, response) if match: - return match.group(1) + return match.group(1).strip() else: raise ActionParsingError( f'Cannot find the answer phrase "{self.answer_phrase}" in "{response}"' diff --git a/agent/prompts/raw/p_cot_id_actree_2s_no_na.py b/agent/prompts/raw/p_cot_id_actree_2s_no_na.py new file mode 100644 index 0000000..945cd95 --- /dev/null +++ b/agent/prompts/raw/p_cot_id_actree_2s_no_na.py @@ -0,0 +1,82 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 
+ +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. +The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. + +To be successful, it is very important to follow the following rules: +1. 
You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +3. You should follow the examples to reason step by step and then issue the next action. +4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```". +5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + }, +} diff --git a/agent/prompts/raw/p_direct_id_actree_2s_no_na.py b/agent/prompts/raw/p_direct_id_actree_2s_no_na.py new file mode 100644 index 0000000..c399454 --- /dev/null +++ b/agent/prompts/raw/p_direct_id_actree_2s_no_na.py @@ -0,0 +1,81 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. + +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. +The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 
+`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. + +To be successful, it is very important to follow the following rules: +1. You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +4. Generate the action in the correct format, wrap the action inside ``````. For example, ```click [1234]```". +5. 
Issue stop action when you think you have achieved the objective.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + }, +} diff --git a/agent/prompts/raw/p_direct_id_actree_3s_llama.py b/agent/prompts/raw/p_direct_id_actree_3s_llama.py new file mode 100644 index 0000000..6278d2b --- /dev/null +++ b/agent/prompts/raw/p_direct_id_actree_3s_llama.py @@ -0,0 +1,83 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 
+`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. + +You can only issue one action at a time""", + + "examples": [ + ( + """Observation: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +Objective: What is the price of HP Inkjet Fax Machine +Previous action: None""", + "```stop [$279.49]```", + ), + ( + """Observation: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +Objective: Show me the restaurants near CMU +Previous action: None""", + "```type [164] [restaurants near CMU] [1]```", + ), + ( + """Observation: +[2036] button 'Sort by: New' hasPopup: menu expanded: False + [587] link 'US Marine’s adoption of Afghan war orphan voided' + [989] time 'March 30, 2023 at 15:03:48 AM UTC' + [602] link 'York student uses AI chatbot to get 
parking fine revoked' + [1025] time 'March 15, 2023 at 7:48:34 AM UTC' + [617] link 'Loveland parents furious after teachers leave, communication lagged during school threat investigation' + [1025] time 'March 2, 2023 at 3:46:01 AM UTC' +URL: http://reddit.com/f/news/new +Objective: Open the most recent post that was published prior to March 1st. +Previous action: None""", + "```scroll [down]```", + ) + ], + "template": """Observation: +{observation} +URL: {url} +Objective: {objective} +Previous action: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "DirectPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```", + "force_prefix": "```" + }, +} diff --git a/browser_env/actions.py b/browser_env/actions.py index 3495933..04ed355 100644 --- a/browser_env/actions.py +++ b/browser_env/actions.py @@ -125,6 +125,7 @@ def action2str( action_str = f"click [{element_id}] where [{element_id}] is {semantic_element}" case ActionTypes.TYPE: text = "".join([_id2key[i] for i in action["text"]]) + text = text.replace("\n", " ") action_str = f"type [{element_id}] [{text}] where [{element_id}] is {semantic_element}" case ActionTypes.HOVER: action_str = f"hover [{element_id}] where [{element_id}] is {semantic_element}" diff --git a/browser_env/auto_login.py b/browser_env/auto_login.py index d466603..1354a21 100644 --- a/browser_env/auto_login.py +++ b/browser_env/auto_login.py @@ -1,5 +1,9 @@ """Script to automatically login each website""" +import argparse import glob +import os +import time +from concurrent.futures import ThreadPoolExecutor from itertools import combinations from pathlib import Path @@ -17,6 +21,17 @@ SLOW_MO = 0 +SITES = ["gitlab", "shopping", "shopping_admin", "reddit"] +URLS = [ + f"{GITLAB}/-/profile", + f"{SHOPPING}/wishlist/", + 
f"{SHOPPING_ADMIN}/dashboard", + f"{REDDIT}/user/{ACCOUNTS['reddit']['username']}/account", +] +EXACT_MATCH = [True, True, True, True] +KEYWORDS = ["", "", "Dashboard", "Delete"] + + def is_expired( storage_state: Path, url: str, keyword: str, url_exact: bool = True ) -> bool: @@ -26,10 +41,11 @@ def is_expired( context_manager = sync_playwright() playwright = context_manager.__enter__() - browser = playwright.chromium.launch(headless=HEADLESS, slow_mo=SLOW_MO) + browser = playwright.chromium.launch(headless=True, slow_mo=SLOW_MO) context = browser.new_context(storage_state=storage_state) page = context.new_page() page.goto(url) + time.sleep(1) d_url = page.url content = page.content() context_manager.__exit__() @@ -42,7 +58,7 @@ def is_expired( return url not in d_url -def renew_comb(comb: list[str]) -> None: +def renew_comb(comb: list[str], auth_folder: str = "./.auth") -> None: context_manager = sync_playwright() playwright = context_manager.__enter__() browser = playwright.chromium.launch(headless=HEADLESS) @@ -83,42 +99,61 @@ def renew_comb(comb: list[str]) -> None: page.get_by_test_id("password-field").fill(password) page.get_by_test_id("sign-in-button").click() - context.storage_state(path=f"./.auth/{'.'.join(comb)}_state.json") + context.storage_state(path=f"{auth_folder}/{'.'.join(comb)}_state.json") context_manager.__exit__() -def main() -> None: - sites = ["gitlab", "shopping", "shopping_admin", "reddit"] - urls = [ - f"{GITLAB}/-/profile", - f"{SHOPPING}/wishlist/", - f"{SHOPPING_ADMIN}/dashboard", - f"{REDDIT}/user/{ACCOUNTS['reddit']['username']}/account", - ] - exact_match = [True, True, True, True] - keywords = ["", "", "Dashboard", "Delete"] - - pairs = list(combinations(sites, 2)) - for pair in pairs: - # TODO[shuyanzh] auth don't work on these two sites - if "reddit" in pair and ( - "shopping" in pair or "shopping_admin" in pair - ): - continue - renew_comb(list(sorted(pair))) - - for site in sites: - renew_comb([site]) - - for c_file in 
glob.glob("./.auth/*.json"): - comb = c_file.split("/")[-1].rsplit("_", 1)[0].split(".") - for cur_site in comb: - url = urls[sites.index(cur_site)] - keyword = keywords[sites.index(cur_site)] - match = exact_match[sites.index(cur_site)] - assert not is_expired(Path(c_file), url, keyword, match)
+def get_site_comb_from_filepath(file_path: str) -> list[str]:
+    """Return the site names encoded in a cookie file name.
+
+    The base name is cut at its last ``_`` and the remaining stem is split
+    on ``.``, e.g. ``./.auth/gitlab.shopping_state.json`` gives
+    ``["gitlab", "shopping"]``.
+    """
+    comb = os.path.basename(file_path).rsplit("_", 1)[0].split(".")
+    return comb
+
+
+def main(auth_folder: str = "./.auth") -> None:
+    """Renew the login cookies of every site combination, then verify them.
+
+    Args:
+        auth_folder: Directory the ``*_state.json`` cookie files are written
+            to and read back from.
+
+    Raises:
+        AssertionError: If ``is_expired`` reports a renewed cookie as expired
+            for one of the sites named in its file name.
+    """
+    pairs = list(combinations(SITES, 2))
+
+    max_workers = 8
+    # NOTE: the futures returned by submit() are not inspected here, so an
+    # exception raised inside renew_comb is not re-raised; a failed renewal
+    # only surfaces through the expiry check below. Leaving the ``with``
+    # block waits for every submitted renewal to finish.
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for pair in pairs:
+            # TODO[shuyanzh] auth don't work on these two sites
+            if "reddit" in pair and (
+                "shopping" in pair or "shopping_admin" in pair
+            ):
+                continue
+            executor.submit(
+                renew_comb, list(sorted(pair)), auth_folder=auth_folder
+            )
+
+        for site in SITES:
+            executor.submit(renew_comb, [site], auth_folder=auth_folder)
+
+    # A cookie file that covers several sites yields one check per site, so
+    # each future is stored together with the file it belongs to. Indexing
+    # the file list with the future's position would name the wrong file or
+    # run past the end of the list.
+    checks = []
+    cookie_files = list(glob.glob(f"{auth_folder}/*.json"))
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for c_file in cookie_files:
+            comb = get_site_comb_from_filepath(c_file)
+            for cur_site in comb:
+                site_idx = SITES.index(cur_site)
+                future = executor.submit(
+                    is_expired,
+                    Path(c_file),
+                    URLS[site_idx],
+                    KEYWORDS[site_idx],
+                    EXACT_MATCH[site_idx],
+                )
+                checks.append((c_file, future))
+
+    for c_file, future in checks:
+        assert not future.result(), f"Cookie {c_file} expired."
if __name__ == "__main__":
-    main()
+    # Command-line entry point.
+    #   --site_list    sites to log in to as ONE combination; when omitted
+    #                  or when it contains "all", every combination is
+    #                  renewed and verified through main().
+    #   --auth_folder  directory that receives the cookie files; it is
+    #                  forwarded on every path so it is honoured even when
+    #                  --site_list is not given.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--site_list", nargs="+", default=[])
+    parser.add_argument("--auth_folder", type=str, default="./.auth")
+    args = parser.parse_args()
+    if not args.site_list or "all" in args.site_list:
+        main(auth_folder=args.auth_folder)
+    else:
+        renew_comb(args.site_list, auth_folder=args.auth_folder)
diff --git a/browser_env/env_config.py b/browser_env/env_config.py index e3eac6a..81cf52d 100644 --- a/browser_env/env_config.py +++ b/browser_env/env_config.py @@ -18,14 +18,14 @@ and MAP and HOMEPAGE ), ( - f"Please setup the URLs to each site. Current: " - + f"Reddit: {REDDIT}" - + f"Shopping: {SHOPPING}" - + f"Shopping Admin: {SHOPPING_ADMIN}" - + f"Gitlab: {GITLAB}" - + f"Wikipedia: {WIKIPEDIA}" - + f"Map: {MAP}" - + f"Homepage: {HOMEPAGE}" + f"Please setup the URLs to each site. Current: \n" + + f"Reddit: {REDDIT}\n" + + f"Shopping: {SHOPPING}\n" + + f"Shopping Admin: {SHOPPING_ADMIN}\n" + + f"Gitlab: {GITLAB}\n" + + f"Wikipedia: {WIKIPEDIA}\n" + + f"Map: {MAP}\n" + + f"Homepage: {HOMEPAGE}\n" ) diff --git a/config_files/test.raw.json b/config_files/test.raw.json index cb4ee8b..91e88d7 100644 --- a/config_files/test.raw.json +++ b/config_files/test.raw.json @@ -1077,7 +1077,7 @@ "reference_answers": { "must_include": [ "DoubleTree by Hilton Hotel Pittsburgh Airport", - "2.0km" + "1.4km" ] }, "reference_url": "", @@ -1182,7 +1182,7 @@ "string_match" ], "reference_answers": { - "exact_match": "Yes" + "must_include": ["Yes"] }, "reference_url": "", "program_html": [], @@ -1212,7 +1212,7 @@ "string_match" ], "reference_answers": { - "exact_match": "Yes" + "must_include": ["Yes"] }, "reference_url": "", "program_html": [], @@ -1242,7 +1242,7 @@ "string_match" ], "reference_answers": { - "exact_match": "Yes" + "must_include": ["Yes"] }, "reference_url": "", "program_html": [], @@ -1272,7 +1272,7 @@ "string_match" ], "reference_answers": { - "exact_match": "Yes" +
"must_include": ["Yes"] }, "reference_url": "", "program_html": [], @@ -1302,7 +1302,7 @@ "string_match" ], "reference_answers": { - "exact_match": "Yes" + "must_include": ["Yes"] }, "reference_url": "", "program_html": [], @@ -1395,7 +1395,7 @@ "must_include": [ "hollister", "Joust Bag", - "Antonia Race Tank" + "Antonia Racer Tank" ] }, "reference_url": "", @@ -1425,7 +1425,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/dashboard/todos", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 303 }, @@ -1449,7 +1449,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 300 }, @@ -1473,7 +1473,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 300 }, @@ -2859,14 +2859,13 @@ "must_include": [ "Rhode Island", "Massachusetts", - "New York", - "New Jersey" + "New York" ] }, "reference_url": "", "program_html": [], "string_note": "", - "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York, New Jersey" + "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York" }, "intent_template_id": 67 }, @@ -2894,13 +2893,15 @@ "Ohio", "Maryland", "New York", - "Virginia" + "New Jersey", + "Delaware", + "West Virginia" ] }, "reference_url": "", "program_html": [], "string_note": "", - "reference_answer_raw_annotation": "Ohio, Maryland, New York, Virginia" + "reference_answer_raw_annotation": "Ohio, Maryland, New York, New Jersey, Delaware, West Virginia" }, "intent_template_id": 67 }, @@ -3288,7 +3289,7 @@ "url_match" ], "reference_answers": null, - "reference_url": 
"__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=help%20wanted", + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", "program_html": [], "url_note": "GOLD in PRED" }, @@ -3315,7 +3316,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=question", + "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", "program_html": [], "url_note": "GOLD in PRED" }, @@ -3342,7 +3343,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=flaky-test", + "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test", "program_html": [], "url_note": "GOLD in PRED" }, @@ -3369,7 +3370,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=OpenAPI%20Generator%20CLI", + "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI", "program_html": [], "url_note": "GOLD in PRED" }, @@ -3396,7 +3397,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=BUG", + "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG", "program_html": [], "url_note": "GOLD in PRED" }, @@ -3711,12 +3712,12 @@ "string_match" ], "reference_answers": { - "exact_match": "Teofila" + "exact_match": "N/A" }, "reference_url": "", "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Teofila" + "string_note": "There is no negative review for Chloe tank", + "reference_answer_raw_annotation": "" }, 
"intent_template_id": 245 }, @@ -5026,7 +5027,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 290 }, @@ -5076,7 +5077,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 171 }, @@ -5102,7 +5103,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 171 }, @@ -5128,7 +5129,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 171 }, @@ -5154,7 +5155,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 171 }, @@ -5180,7 +5181,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 171 }, @@ -5537,12 
+5538,13 @@ "url_match" ], "reference_answers": { - "exact_match": "No" + "fuzzy_match": ["No, it is open"] }, "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8", "program_html": [], "reference_answer_raw_annotation": "Not closed", - "string_note": "" + "string_note": "", + "url_note": "GOLD in PRED" }, "intent_template_id": 310 }, @@ -5567,7 +5569,7 @@ "url_match" ], "reference_answers": { - "exact_match": "No" + "fuzzy_match": ["No, it is open"] }, "reference_url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71", "program_html": [], @@ -5597,7 +5599,7 @@ "url_match" ], "reference_answers": { - "exact_match": "No" + "fuzzy_match": ["No, it is open"] }, "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", "program_html": [], @@ -5627,7 +5629,7 @@ "url_match" ], "reference_answers": { - "exact_match": "No" + "fuzzy_match": ["No, it is open"] }, "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", "program_html": [], @@ -5657,7 +5659,7 @@ "url_match" ], "reference_answers": { - "exact_match": "Yes" + "fuzzy_match": ["Yes, it is closed"] }, "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", "program_html": [], @@ -7463,23 +7465,21 @@ "geolocation": null, "intent_template": "Get the order number of my most recent {{status}} order ", "instantiation_dict": { - "status": "" + "status": "under delivery" }, - "intent": "Get the order number of my most recent order ", + "intent": "Get the order number of my most recent under delivery order ", "require_reset": false, "eval": { "eval_types": [ "string_match" ], "reference_answers": { - "must_include": [ - "136" - ] + "exact_match": "N/A" }, "reference_url": "", "program_html": [], "string_note": "", - "reference_answer_raw_annotation": "000000136" + "reference_answer_raw_annotation": "There is no under delivery order" }, "intent_template_id": 213 }, @@ -7578,7 +7578,7 @@ "reference_answers": null, "reference_url": 
"__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 138 }, @@ -7604,7 +7604,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 138 }, @@ -7630,7 +7630,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 138 }, @@ -7656,7 +7656,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 138 }, @@ -7682,7 +7682,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 138 }, @@ -7857,8 +7857,8 @@ ], "reference_answers": { "must_include": [ - "40.4424191", - "-79.9397388" + "40.442", + "-79.939" ] }, "reference_url": "", @@ -7889,8 +7889,8 @@ ], "reference_answers": { "must_include": [ - "40.46076", - "-79.94666" + "40.460", + "-79.946" ] }, "reference_url": "", @@ -7921,8 +7921,8 @@ ], "reference_answers": { "must_include": [ - "40.4511693", - "-79.9334241" + "40.451", + "-79.933" ] }, "reference_url": "", @@ -7953,8 +7953,8 @@ ], "reference_answers": { "must_include": [ - "40.4443", - "-79.94889" + "40.444", + "-79.948" ] }, "reference_url": "", @@ -7985,8 
+7985,8 @@ ], "reference_answers": { "must_include": [ - "40.45761", - "-79.92934" + "40.457", + "-79.929" ] }, "reference_url": "", @@ -8168,7 +8168,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/explore", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 325 }, @@ -8221,7 +8221,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/video-games.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 211 }, @@ -8247,7 +8247,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/electronics/headphones.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 211 }, @@ -8273,7 +8273,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 211 }, @@ -8299,7 +8299,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 211 }, @@ -8325,7 +8325,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 211 }, @@ -8485,7 +8485,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 139 }, @@ -8512,7 +8512,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 139 }, @@ -8539,7 +8539,7 @@ "reference_answers": null, "reference_url": 
"__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 139 }, @@ -8566,7 +8566,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 139 }, @@ -8593,7 +8593,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 139 }, @@ -8619,7 +8619,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 212 }, @@ -8645,7 +8645,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=xbox", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 212 }, @@ -8671,7 +8671,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 212 }, @@ -8697,7 +8697,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=iphone+13", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 212 }, @@ -8723,7 +8723,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 212 }, @@ -8902,7 +8902,7 @@ "reference_answers": null, "reference_url": 
"__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 210 }, @@ -8929,7 +8929,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 207 }, @@ -8956,7 +8956,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 207 }, @@ -8983,7 +8983,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 207 }, @@ -9341,7 +9341,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/sales/order/view/order_id/180/", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 180 }, @@ -9367,7 +9367,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/sales/order/view/order_id/170/", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 180 }, @@ -9393,7 +9393,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/sales/order/view/order_id/189/", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 180 }, @@ -9415,12 +9415,12 @@ "eval": { "eval_types": [ "string_match" + "string_match" ], - 
"reference_answers": "N/A", + "reference_answers": {"exact_match": "N/A"}, "reference_url": "", "program_html": [], - "string_note": "There is no order in the processing status", - "reference_answer_raw_annotation": "N/A" + "string_note": "there is no order in processing" }, "intent_template_id": 180 }, @@ -9442,12 +9442,12 @@ "eval": { "eval_types": [ "string_match" + "string_match" ], - "reference_answers": "N/A", + "reference_answers": {"exact_match": "N/A"}, "reference_url": "", "program_html": [], - "string_note": "There is no order out of delivery", - "reference_answer_raw_annotation": "N/A" + "string_note": "there is no order in processing" }, "intent_template_id": 180 }, @@ -10131,7 +10131,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 208 }, @@ -10158,7 +10158,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 208 }, @@ -10185,7 +10185,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 208 }, @@ -10212,7 +10212,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 208 }, @@ -10239,7 +10239,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 208 }, 
@@ -10577,7 +10577,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 299 }, @@ -10603,7 +10603,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 299 }, @@ -10629,7 +10629,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 299 }, @@ -10653,9 +10653,9 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&sort=priority_desc&state=opened&label_name%5B%5D=question&first_page_size=20", + "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 299 }, @@ -10679,9 +10679,9 @@ "url_match" ], "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues/?sort=priority_desc&state=opened&label_name%5B%5D=None&first_page_size=20", + "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 299 }, @@ -10923,7 +10923,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 137 }, @@ -10950,7 +10950,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", "program_html": [], - "url_note": "EXACT" + 
"url_note": "GOLD in PRED" }, "intent_template_id": 137 }, @@ -10977,7 +10977,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 137 }, @@ -11004,7 +11004,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 137 }, @@ -11031,7 +11031,7 @@ "reference_answers": null, "reference_url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 137 }, @@ -11106,7 +11106,7 @@ "reference_answers": null, "reference_url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", "program_html": [], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 291 }, @@ -11651,7 +11651,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1/key/", + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11733,7 +11733,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__MAP__search?query=restaurants%20near%20CMU%20ArtPark%20Lab", + "reference_url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11760,7 +11760,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__MAP__search?query=parking%20near%20carnegie%20mellon%20university", + "reference_url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11787,7 +11787,7 @@ 
"url_match" ], "reference_answers": null, - "reference_url": "__MAP__search?query=hotels%20near%20carnegie%20mellon%20university", + "reference_url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11814,7 +11814,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__MAP__search?query=bars%20near%20Carnegie%20Music%20Hall", + "reference_url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11841,7 +11841,7 @@ "url_match" ], "reference_answers": null, - "reference_url": "__MAP__search?query=hotels%20near%20Carnegie%20Music%20Hall", + "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", "program_html": [], "url_note": "GOLD in PRED" }, @@ -11994,7 +11994,7 @@ ], "reference_answers": { "must_include": [ - "3" + "65 |OR| 3" ] }, "reference_url": "", @@ -12484,7 +12484,7 @@ "program_html": [ { "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".profile-user-bio\").outerText", + "locator": "document.querySelector(\".user-bio__biography\").outerText", "required_contents": { "exact_match": "I am a robot" } @@ -12517,7 +12517,7 @@ "program_html": [ { "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".profile-user-bio\").outerText", + "locator": "document.querySelector(\".user-bio__biography\").outerText", "required_contents": { "exact_match": "Pro Python Developer with 20 years of Experience" } @@ -12550,7 +12550,7 @@ "program_html": [ { "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".profile-user-bio\").outerText", + "locator": "document.querySelector(\".user-bio__biography\").outerText", "required_contents": { "exact_match": "Seeking SDE positions" } @@ -12583,7 +12583,7 @@ "program_html": [ { "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": 
"document.querySelector(\".profile-user-bio\").outerText", + "locator": "document.querySelector(\".user-bio__biography\").outerText", "required_contents": { "exact_match": "Freelance Web Developer" } @@ -12616,7 +12616,7 @@ "program_html": [ { "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".profile-user-bio\").outerText", + "locator": "document.querySelector(\".user-bio__biography\").outerText", "required_contents": { "exact_match": "Awesome Prompt Artist" } @@ -12648,8 +12648,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/books/new", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -12683,8 +12683,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/diy/new", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/DIY/119019/how-can-i-bring-an-hdmi-cable-from-my-pc-downstairs-to-my-tv", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -12718,8 +12718,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/futurology/new", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/Futurology/119517/openai-ceo-it-s-not-funny-that-i-m-afraid-of-the-ai-we-re", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -12753,8 +12753,8 @@ "reference_url": "", "program_html": 
[ { - "url": "__REDDIT__/f/books/deeplearning", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/deeplearning/124993/meta-s-llama-weights-leaked-on-torrent-and-the-best-thing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -12788,8 +12788,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/books/explainlikeimfive", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/explainlikeimfive/39244/eli5-how-does-pinching-a-ribbon-and-sliding-your-finger", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -13137,7 +13137,7 @@ "program_html": [ { "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-user-bio').outerText", + "locator": "document.querySelector('.cover-status').lastChild.textContent", "required_contents": { "exact_match": "Busy" } @@ -13170,7 +13170,7 @@ "program_html": [ { "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-user-bio').outerText", + "locator": "document.querySelector('.cover-status').lastChild.textContent", "required_contents": { "exact_match": "Enjoying life" } @@ -13203,7 +13203,7 @@ "program_html": [ { "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-user-bio').outerText", + "locator": "document.querySelector('.cover-status').lastChild.textContent", "required_contents": { "exact_match": "Playing Badminton" } @@ -13236,7 +13236,7 @@ "program_html": [ { "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-user-bio').outerText", + "locator": "document.querySelector('.cover-status').lastChild.textContent", 
"required_contents": { "exact_match": "Resting due to leg injury" } @@ -13269,7 +13269,7 @@ "program_html": [ { "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-user-bio').outerText", + "locator": "document.querySelector('.cover-status').lastChild.textContent", "required_contents": { "exact_match": "Out of Office" } @@ -14562,7 +14562,7 @@ "task_id": 460, "require_login": true, "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", "geolocation": null, "intent_template": "{{action}} the price of this product by {{amount}}", "instantiation_dict": { @@ -14579,10 +14579,10 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", "locator": "document.querySelector('[name=\"product[price]\"').value", "required_contents": { - "exact_match": "58.65" + "exact_match": "38.25" } } ] @@ -14596,7 +14596,7 @@ "task_id": 461, "require_login": true, "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", "geolocation": null, "intent_template": "{{action}} the price of this product by {{amount}}", "instantiation_dict": { @@ -14613,10 +14613,10 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", "locator": "document.querySelector('[name=\"product[price]\"').value", "required_contents": { - "exact_match": "43.50" + "exact_match": "29.50" } } ] @@ -14630,7 +14630,7 @@ "task_id": 462, "require_login": true, "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "start_url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/710/", "geolocation": null, "intent_template": "{{action}} the price of this product by {{amount}}", "instantiation_dict": { @@ -14647,10 +14647,10 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", "locator": "document.querySelector('[name=\"product[price]\"').value", "required_contents": { - "exact_match": "75.90" + "exact_match": "19.80" } } ] @@ -14664,7 +14664,7 @@ "task_id": 463, "require_login": true, "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", "geolocation": null, "intent_template": "{{action}} the price of this product by {{amount}}", "instantiation_dict": { @@ -14681,10 +14681,10 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", "locator": "document.querySelector('[name=\"product[price]\"').value", "required_contents": { - "exact_match": "79.35" + "exact_match": "36.80" } } ] @@ -14714,11 +14714,13 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/antonia-racer-tank.html", - "locator": "document.querySelector('.product.info.detailed').outerText", + "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", "required_contents": { "must_include": [ - "This is in regular rotation at the gym. Its colorful and looks kinda cute under my exercise tanks." 
+ "This is in regular rotation at the gym", + "Its colorful and looks kinda cute under my exercise tanks", + "it's very stylish for yoga or something else low impact" ] } } @@ -15640,19 +15642,14 @@ "require_reset": false, "eval": { "eval_types": [ - "program_html" + "string_match" ], - "reference_answers": null, + "reference_answers": { + "exact_match": "N/A" + }, "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299/", - "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", - "required_contents": { - "exact_match": "the order is ready to be shipped soon!" - } - } - ] + "program_html": [], + "string_note": "System message: We cannot add order history." }, "intent_template_id": 280 }, @@ -16804,8 +16801,7 @@ "AndroidAsync", "Pytorch-GAN", "administrate", - "keycloak", - "openapi-generator" + "keycloak" ] } } @@ -17545,12 +17541,14 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/bella-tank.html", - "locator": "document.querySelector('.product.info.detailed').outerText", + "url": "__SHOPPING_ADMIN__/../bella-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", "required_contents": { "must_include": [ - "Good choice for working out and stylin' enough to wear when I'm hanging with friends on hot days. Also washes really well!", - "Always a sweet n sporty look for the gym! Keeps me cool and the seams don't rub up against me like some of my other tanks." 
+ "Good choice for working out and stylin' enough to wear when I'm hanging with friends on hot days", + "Also washes really well", + "Always a sweet n sporty look for the gym", + "Keeps me cool and the seams don't rub up against me like some of my other tanks" ] } } @@ -17581,14 +17579,15 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/selene-yoga-hoodie.html", - "locator": "document.querySelector('.product.info.detailed').outerText", + "url": "__SHOPPING_ADMIN__/../selene-yoga-hoodie.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", "required_contents": { "must_include": [ "I was super cold and it did the job.", "The sleeves are definitely thicker than you realize, which is a good thing", "really quite substantial", - "m planning on buying another one of these in another color. the best hoodie ive ever owned." + "planning on buying another one of these in another color", + "the best hoodie ive ever owned" ] } } @@ -17619,15 +17618,16 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/radiant-tee.html", - "locator": "document.querySelector('.product.info.detailed').outerText", + "url": "__SHOPPING_ADMIN__/../radiant-tee.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", "required_contents": { "must_include": [ "What I rally love here is that it does the job of keeping me cool and dry", - "I'm a big guy and sweat A LOT! Even after a day of gulf, I'm still dry and comfortable", - "What a versatile shirt!", - "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised.", - "I like going out after my game for drinks so I look good then too and don't need to change into something fresh." 
+ "I'm a big guy and sweat A LOT", + "Even after a day of gulf, I'm still dry and comfortable", + "What a versatile shirt", + "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised", + "I like going out after my game for drinks so I look good then too and don't need to change into something fresh" ] } } @@ -17658,11 +17658,11 @@ "reference_url": "", "program_html": [ { - "url": "__SHOPPING_ADMIN__/affirm-water-bottle.html", - "locator": "document.querySelector('.product.info.detailed').outerText", + "url": "__SHOPPING_ADMIN__/../affirm-water-bottle.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", "required_contents": { "must_include": [ - "Wide mouth opening makes it easy to clean!" + "Wide mouth opening makes it easy to clean" ] } } @@ -19107,7 +19107,7 @@ "program_html": [ { "url": "__REDDIT__/f/sci_fi/edit", - "locator": "document.querySelector(\"#forum_description\").outerText", + "locator": "document.querySelector(\"#forum_description\").value", "required_contents": { "must_include": [ "A wild place for sci-fi enthusiasts" @@ -19116,7 +19116,7 @@ }, { "url": "__REDDIT__/f/sci_fi/edit", - "locator": "document.querySelector(\"#forum_sidebar\").outerText", + "locator": "document.querySelector(\"#forum_sidebar\").value", "required_contents": { "must_include": [ "New", @@ -19161,7 +19161,7 @@ "program_html": [ { "url": "__REDDIT__/f/cmu_lti/edit", - "locator": "document.querySelector(\"#forum_description\").outerText", + "locator": "document.querySelector(\"#forum_description\").value", "required_contents": { "must_include": [ "Language Technologies Institute at Carnegie Mellon University" @@ -19170,7 +19170,7 @@ }, { "url": "__REDDIT__/f/cmu_lti/edit", - "locator": "document.querySelector(\"#forum_sidebar\").outerText", + "locator": "document.querySelector(\"#forum_sidebar\").value", 
"required_contents": { "must_include": [ "announcement", @@ -19214,7 +19214,7 @@ "program_html": [ { "url": "__REDDIT__/f/Cyberpunk/edit", - "locator": "document.querySelector(\"#forum_description\").outerText", + "locator": "document.querySelector(\"#forum_description\").value", "required_contents": { "must_include": [ "Welcome to the future" @@ -19223,7 +19223,7 @@ }, { "url": "__REDDIT__/f/Cyberpunk/edit", - "locator": "document.querySelector(\"#forum_sidebar\").outerText", + "locator": "document.querySelector(\"#forum_sidebar\").value", "required_contents": { "must_include": [ "Games", @@ -19268,7 +19268,7 @@ "program_html": [ { "url": "__REDDIT__/f/PlantsForCatParents/edit", - "locator": "document.querySelector(\"#forum_description\").outerText", + "locator": "document.querySelector(\"#forum_description\").value", "required_contents": { "must_include": [ "Cat parents & plan lovers" @@ -19277,7 +19277,7 @@ }, { "url": "__REDDIT__/f/PlantsForCatParents/edit", - "locator": "document.querySelector(\"#forum_sidebar\").outerText", + "locator": "document.querySelector(\"#forum_sidebar\").value", "required_contents": { "must_include": [ "Cat friendly", @@ -19320,7 +19320,7 @@ "program_html": [ { "url": "__REDDIT__/f/Karaoke", - "locator": "document.querySelector(\"#forum_description\").outerText", + "locator": "document.querySelector(\"#forum_description\").value", "required_contents": { "must_include": [ "Place for Karaoke lovers" @@ -19329,7 +19329,7 @@ }, { "url": "__REDDIT__/f/Karaoke", - "locator": "document.querySelector(\"#forum_sidebar\").outerText", + "locator": "document.querySelector(\"#forum_sidebar\").value", "required_contents": { "must_include": [ "devices", @@ -20032,7 +20032,7 @@ "locator": "document.querySelector(\"#sidebar > section\").outerText", "required_contents": { "must_include": [ - "machine learning" + "MachineLearning" ] } } @@ -21681,11 +21681,11 @@ "geolocation": null, "intent_template": "Post a notice on a virtual meetup for {{interest}} 
enthusiasts on {{date}} in the {{subreddit}} subreddit", "instantiation_dict": { - "interest": "Harry Poter", + "interest": "Harry Potter", "date": "July 8th", "subreddit": "books" }, - "intent": "Post a notice on a virtual meetup for Harry Poter enthusiasts on July 8th in the books subreddit", + "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit", "require_reset": false, "eval": { "eval_types": [ @@ -21700,7 +21700,7 @@ "locator": "document.querySelector('.submission__inner').outerText", "required_contents": { "must_include": [ - "Harry Poter", + "Harry Potter", "July 8th", "virtual meetup" ] @@ -22154,7 +22154,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -22169,7 +22169,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 153 }, @@ -22195,7 +22195,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -22210,7 +22210,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 153 }, @@ -22236,7 +22236,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -22251,7 +22251,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 153 }, @@ -22277,7 +22277,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -22292,7 +22292,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 153 }, @@ -22318,7 +22318,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + 
"reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -22333,7 +22333,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 153 }, @@ -23068,9 +23068,10 @@ "required_contents": { "must_include": [ "Unable to set neutral steering", - "Doesn\u2019t work with PC.", - "Crazy problems in automatic mode; then pedals stopped working", - "Only works with certain games." + "Doesn\u2019t work with PC", + "Crazy problems in automatic mode", + "pedals stopped working", + "Only works with certain games" ] } } @@ -23698,7 +23699,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -23711,7 +23712,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 163 }, @@ -23736,7 +23737,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -23749,7 +23750,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 163 }, @@ -23774,7 +23775,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -23787,7 +23788,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 163 }, @@ -23812,7 +23813,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -23825,7 +23826,7 @@ } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 163 }, @@ -23850,7 +23851,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING__/contact/", + "reference_url": "__SHOPPING__/contact", "program_html": [ { "url": "last", @@ -23863,7 +23864,7 @@ } } 
], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 163 }, @@ -23892,7 +23893,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product/edit/id", + "reference_url": "__SHOPPING_ADMIN__/catalog/product", "program_html": [ { "url": "last", @@ -23903,7 +23904,7 @@ }, { "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').outerText", + "locator": "document.querySelector('[name=\"product[name]\"').value", "required_contents": { "must_include": [ "Energy-Bulk Women Shirt" @@ -23979,7 +23980,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product/edit/id", + "reference_url": "__SHOPPING_ADMIN__/catalog/product", "program_html": [ { "url": "last", @@ -23990,7 +23991,7 @@ }, { "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').outerText", + "locator": "document.querySelector('[name=\"product[name]\"').value", "required_contents": { "must_include": [ "Energy-Bulk Man Yoga Pant" @@ -24066,11 +24067,11 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product/edit/id", + "reference_url": "__SHOPPING_ADMIN__/catalog/product", "program_html": [ { "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').outerText", + "locator": "document.querySelector('[name=\"product[name]\"').value", "required_contents": { "must_include": [ "FancyBoy Man Causal Jeans" @@ -24153,11 +24154,11 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product/edit/id", + "reference_url": "__SHOPPING_ADMIN__/catalog/product", "program_html": [ { "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').outerText", + "locator": "document.querySelector('[name=\"product[name]\"').value", "required_contents": { "must_include": [ "Swaatch Smart Watch" @@ -24233,11 +24234,11 @@ "program_html" ], 
"reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product/edit/id", + "reference_url": "__SHOPPING_ADMIN__/catalog/product", "program_html": [ { "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').outerText", + "locator": "document.querySelector('[name=\"product[name]\"').value", "required_contents": { "must_include": [ "Lelelumon Yoga Mat" @@ -24310,7 +24311,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/new/", + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", "program_html": [ { "url": "last", @@ -24338,6 +24339,7 @@ { "url": "last", "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "by_percent" } @@ -24345,12 +24347,13 @@ { "url": "last", "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "20" } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 258 }, @@ -24376,7 +24379,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/new/", + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", "program_html": [ { "url": "last", @@ -24404,6 +24407,7 @@ { "url": "last", "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "cart_fixed" } @@ -24411,12 +24415,13 @@ { "url": "last", "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": 
["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "10" } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 258 }, @@ -24442,7 +24447,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/new/", + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", "program_html": [ { "url": "last", @@ -24470,6 +24475,7 @@ { "url": "last", "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "cart_fixed" } @@ -24477,12 +24483,13 @@ { "url": "last", "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "15" } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 258 }, @@ -24508,7 +24515,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/new/", + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", "program_html": [ { "url": "last", @@ -24536,6 +24543,7 @@ { "url": "last", "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "by_percent" } @@ -24543,12 +24551,13 @@ { "url": "last", "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "45" } } ], - "url_note": "EXACT" + "url_note": "GOLD in 
PRED" }, "intent_template_id": 258 }, @@ -24574,7 +24583,7 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/new/", + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", "program_html": [ { "url": "last", @@ -24602,6 +24611,7 @@ { "url": "last", "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "cart_fixed" } @@ -24609,12 +24619,13 @@ { "url": "last", "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], "required_contents": { "exact_match": "40" } } ], - "url_note": "EXACT" + "url_note": "GOLD in PRED" }, "intent_template_id": 258 }, @@ -24646,14 +24657,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "02/1/2023" + "exact_match": "2/1/23" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "02/28/2023" + "exact_match": "2/28/23" } } ], @@ -24689,14 +24700,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "01/29/2023" + "exact_match": "1/29/23" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "03/15/2023" + "exact_match": "3/15/23" } } ], @@ -24726,20 +24737,20 @@ "program_html" ], "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded/", + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded", "program_html": [ { "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", 
"required_contents": { - "exact_match": "01/1/2023" + "exact_match": "1/1/23" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "03/31/2023" + "exact_match": "3/31/23" } } ], @@ -24775,7 +24786,7 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "01/1/2022" + "exact_match": "1/1/2022" } }, { @@ -24818,7 +24829,7 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "01/1/2023" + "exact_match": "1/1/2023" } }, { @@ -24862,14 +24873,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "05/1/2021" + "exact_match": "5/1/2021" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "03/31/2022" + "exact_match": "3/31/2022" } } ], @@ -24906,14 +24917,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "08/5/2022" + "exact_match": "8/5/22" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "03/1/2023" + "exact_match": "3/1/23" } } ], @@ -24950,14 +24961,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "07/5/2021" + "exact_match": "7/5/21" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "05/31/2023" + "exact_match": "5/31/23" } } ], @@ -24994,14 +25005,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "05/1/2021" + "exact_match": "5/1/21" } }, { "url": "last", "locator": 
"document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "05/15/2023" + "exact_match": "5/15/23" } } ], @@ -25038,14 +25049,14 @@ "url": "last", "locator": "document.querySelector('[id=\"sales_report_from\"').value", "required_contents": { - "exact_match": "05/1/2022" + "exact_match": "5/1/22" } }, { "url": "last", "locator": "document.querySelector('[id=\"sales_report_to\"').value", "required_contents": { - "exact_match": "05/31/2023" + "exact_match": "5/31/23" } } ], @@ -25077,8 +25088,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/gadgets/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25113,8 +25124,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/history/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/history/84338/the-scientist-who-discovered-sperm-was-so-grossed-out-he", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25122,8 +25133,8 @@ } }, { - "url": "__REDDIT__/f/history/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/history/105990/4-500-year-old-sumerian-temple-dedicated-to-mighty-thunder", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25158,8 +25169,8 @@ 
"reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/books/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25167,8 +25178,8 @@ } }, { - "url": "__REDDIT__/f/books/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25176,8 +25187,8 @@ } }, { - "url": "__REDDIT__/f/books/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/books/59447/appalachian-prison-book-project-seeks-notebook-donations-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25212,8 +25223,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/movies/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/movies/86174/who-will-win-the-oscar-for-actress-in-a-supporting-role", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25221,8 +25232,8 @@ } }, { - "url": "__REDDIT__/f/movies/top?t=all", - "locator": 
"document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/movies/86029/who-will-win-the-oscar-for-film-editing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25230,8 +25241,8 @@ } }, { - "url": "__REDDIT__/f/movies/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/movies/86055/cindy-williams-dies-laverne-amp-shirley-star-who-appeared-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25239,8 +25250,8 @@ } }, { - "url": "__REDDIT__/f/movies/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/movies/42682/michelle-yeoh-to-receive-palm-springs-film-festival-s", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25275,8 +25286,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/f/technology/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/technology/48670/brain-cancer-vaccine-succeeds-at-prolonging-survival-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25284,8 +25295,8 @@ } }, { - "url": "__REDDIT__/f/technology/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": 
"__REDDIT__/f/technology/134696/india-cuts-internet-for-27-million-people-amid-search-for", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25293,8 +25304,8 @@ } }, { - "url": "__REDDIT__/f/technology/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/technology/48785/us-judge-orders-amazon-to-cease-and-desist-anti-union", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25302,8 +25313,8 @@ } }, { - "url": "__REDDIT__/f/technology/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/technology/70354/activision-s-boston-studio-workers-announce-unionization", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25311,8 +25322,8 @@ } }, { - "url": "__REDDIT__/f/technology/top?t=all", - "locator": "document.querySelectorAll('div.submission__vote')[4].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/technology/70233/social-media-influencers-are-charged-with-feeding-followers", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25347,8 +25358,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/ThetaGang_wsb/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/wallstreetbets/29478/how-will-airbnb-close-following-their-earnings-report-on", + 
"locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25356,8 +25367,8 @@ } }, { - "url": "__REDDIT__/user/ThetaGang_wsb/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/wallstreetbets/29458/how-much-will-the-federal-reserve-raise-interest-rates-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25392,8 +25403,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/CameronKelsey/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25401,8 +25412,8 @@ } }, { - "url": "__REDDIT__/user/CameronKelsey/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25410,8 +25421,8 @@ } }, { - "url": "__REDDIT__/user/CameronKelsey/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25446,8 +25457,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/119742/hi-i-m-vienne-a-doctoral-student-at-the-university-of-bath-i", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25455,8 +25466,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/119719/hello-reddit-i-m-nazia-mehrban-a-lecturer-in-biotechnology", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25464,8 +25475,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/119714/i-m-ellie-jarvis-she-her-a-2nd-year-phd-student-in-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25473,8 +25484,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/55155/hi-i-m-dr-lucy-maddox-from-bath-university-uk-i-m-a-clinical", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25482,8 +25493,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[4].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/55142/we-re-sadeka-nujhat-hannah-leese-and-sandhya-moise-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25491,8 +25502,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[5].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/34032/we-re-sandhya-moise-david-phillips-and-chan-lee-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25500,8 +25511,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[6].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/13175/hi-i-m-kit-yates-i-m-a-mathematical-biologist-at-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25509,8 +25520,8 @@ } }, { - "url": "__REDDIT__/user/UniversityofBath/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[7].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/IAmA/13170/hello-i-m-dr-sara-fontani-from-the-university-of", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ 
"vote vote--user-upvoted" @@ -25545,8 +25556,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/Don_Gato1/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/nyc/44650/fox-news-hosts-cast-new-york-as-crime-ridden-and-chaotic", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25612,8 +25623,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25621,8 +25632,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25630,8 +25641,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25639,8 +25650,8 @@ } }, { - "url": 
"__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25648,8 +25659,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[4].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25657,8 +25668,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[5].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25666,8 +25677,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[7].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25675,8 +25686,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[8].querySelector('form').getAttribute('class')", + 
"url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25684,8 +25695,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[9].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25693,8 +25704,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[10].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-upvoted" @@ -25729,8 +25740,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/RickyDontLoseThat/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/massachusetts/84954/the-last-of-lincoln", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25795,8 +25806,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/PatientBuilder499/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[7].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/videos/115139/hundreds-of-civilian-turkish-volunteers-waiting-to-be-sent", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25831,8 +25842,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/sirbarani/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/sports/48303/iran-football-legend-daei-will-not-attend-world-cup-amid", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25867,8 +25878,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[7].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25903,8 +25914,8 @@ "reference_url": "", "program_html": [ { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[0].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25912,8 +25923,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[1].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25921,8 +25932,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[2].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25930,8 +25941,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[3].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25939,8 +25950,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[4].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25948,8 +25959,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[5].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ 
"vote vote--user-downvoted" @@ -25957,8 +25968,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[7].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25966,8 +25977,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[8].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25975,8 +25986,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[9].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -25984,8 +25995,8 @@ } }, { - "url": "__REDDIT__/user/Hrekires/submissions", - "locator": "document.querySelectorAll('div.submission__vote')[10].querySelector('form').getAttribute('class')", + "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", "required_contents": { "must_include": [ "vote vote--user-downvoted" @@ -26481,7 +26492,7 @@ "locator": "document.querySelector('[name=\"route_to\"').value", "required_contents": { "must_include": [ - "150, Causeway 
Street", + "TD Garden", "Boston", "Massachusetts" ] @@ -26796,11 +26807,11 @@ "geolocation": null, "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", "instantiation_dict": { - "project_name": "web_agent_android", + "project_name": "web_agent_android_xl", "template": "Android", "account_list": "primer, convexegg, abishek" }, - "intent": "Start a private project web_agent_android with Android template and add primer, convexegg, abishek as members", + "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", "require_reset": false, "eval": { "eval_types": [ @@ -26810,7 +26821,7 @@ "reference_url": "", "program_html": [ { - "url": "__GITLAB__/byteblaze/web_agent_android", + "url": "__GITLAB__/byteblaze/web_agent_android_xl", "locator": "document.querySelector('.visibility-icon').getAttribute('title')", "required_contents": { "must_include": [ @@ -26819,7 +26830,7 @@ } }, { - "url": "__GITLAB__/byteblaze/web_agent_android/-/commits", + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits", "locator": "", "required_contents": { "must_include": [ @@ -26828,7 +26839,7 @@ } }, { - "url": "__GITLAB__/byteblaze/web_agent_android/-/project_members", + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/project_members", "locator": "", "required_contents": { "must_include": [ @@ -26933,7 +26944,7 @@ } }, { - "url": "__GITLAB__/byteblaze/agi_index/-/commits", + "url": "__GITLAB__/byteblaze/agi_index", "locator": "document.querySelector('.home-panel-description-markdown').outerText", "required_contents": { "must_include": [ @@ -26988,7 +26999,7 @@ } }, { - "url": "__GITLAB__/byteblaze/AGISite/-/commits", + "url": "__GITLAB__/byteblaze/AGISite", "locator": "document.querySelector('.home-panel-description-markdown').outerText", "required_contents": { "must_include": [ @@ -27066,10 +27077,10 @@ "geolocation": null, "intent_template": 
"Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", "instantiation_dict": { - "project_name": "web_agent_android", + "project_name": "web_agent_android_xs", "template": "Android" }, - "intent": "Create a private Android repository called \"web_agent_android\" using the right template to speed up development.", + "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", "require_reset": false, "eval": { "eval_types": [ @@ -27079,7 +27090,7 @@ "reference_url": "", "program_html": [ { - "url": "__GITLAB__/byteblaze/web_agent_android", + "url": "__GITLAB__/byteblaze/web_agent_android_xs", "locator": "document.querySelector('.visibility-icon').getAttribute('title')", "required_contents": { "must_include": [ @@ -27088,7 +27099,7 @@ } }, { - "url": "__GITLAB__/byteblaze/web_agent_android/-/commits", + "url": "__GITLAB__/byteblaze/web_agent_android_xs/-/commits", "locator": "", "required_contents": { "must_include": [ @@ -27178,7 +27189,7 @@ } }, { - "url": "__GITLAB__/byteblaze/agi_index/-/commits", + "url": "__GITLAB__/byteblaze/web_agent_index", "locator": "document.querySelector('.home-panel-description-markdown').outerText", "required_contents": { "must_include": [ @@ -27223,7 +27234,7 @@ } }, { - "url": "__GITLAB__/byteblaze/AGISite/-/commits", + "url": "__GITLAB__/byteblaze/11711_gitlab", "locator": "document.querySelector('.home-panel-description-markdown').outerText", "required_contents": { "must_include": [ @@ -27540,7 +27551,7 @@ "required_contents": { "must_include": [ "Carnegie Hall", - "West 56th Street", + "West 57th Street", "Manhattan", "New York" ] @@ -28275,7 +28286,7 @@ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/123/", "locator": "document.querySelector('[name=\"product[price]\"').value", "required_contents": { - "exact_match": "47" + "exact_match": "47.00" } } ] diff --git 
a/evaluation_harness/evaluators.py b/evaluation_harness/evaluators.py index 069c55b..df20431 100644 --- a/evaluation_harness/evaluators.py +++ b/evaluation_harness/evaluators.py @@ -1,5 +1,7 @@ """base class for evaluation""" # answer string match +import collections +import html import importlib import json import time @@ -7,15 +9,14 @@ from pathlib import Path from typing import Any, Tuple, Union -import evaluate # type: ignore[import] from beartype import beartype -from beartype.door import is_bearable from nltk.tokenize import word_tokenize # type: ignore from playwright.sync_api import CDPSession, Page from browser_env.actions import Action from browser_env.utils import StateInfo from evaluation_harness.helper_functions import ( + PseudoPage, gitlab_get_project_memeber_role, llm_fuzzy_match, reddit_get_post_url, @@ -36,7 +37,7 @@ def __call__( self, trajectory: Trajectory, config_file: Path | str, - page: Page, + page: Page | PseudoPage, client: CDPSession, ) -> float: raise NotImplementedError @@ -76,6 +77,7 @@ class StringEvaluator(Evaluator): @staticmethod @beartype def clean_answer(answer: str) -> str: + answer = answer.strip() if answer.startswith("'") and answer.endswith("'"): answer = answer[1:-1] elif answer.startswith('"') and answer.endswith('"'): @@ -92,12 +94,16 @@ def exact_match(ref: str, pred: str) -> float: @staticmethod @beartype - def must_include(ref: str, pred: str) -> float: + def must_include(ref: str, pred: str, tokenize: bool = False) -> float: clean_ref = StringEvaluator.clean_answer(ref) clean_pred = StringEvaluator.clean_answer(pred) # tokenize the answer if the ref is a single word # prevent false positive (e.g, 0) - if len(word_tokenize(clean_ref)) == 1: + if ( + tokenize + and len(clean_ref) == 1 + and len(word_tokenize(clean_ref)) == 1 + ): tok_pred = word_tokenize(clean_pred) return float(clean_ref in tok_pred) else: @@ -112,7 +118,7 @@ def __call__( self, trajectory: Trajectory, config_file: Path | str, - page: Page | None = 
None, + page: Page | PseudoPage | None = None, client: CDPSession | None = None, ) -> float: with open(config_file, "r") as f: @@ -129,7 +135,11 @@ def __call__( case "must_include": assert isinstance(value, list) for must_value in value: - score *= self.must_include(ref=must_value, pred=pred) + score *= self.must_include( + ref=must_value, + pred=pred, + tokenize=(len(value) == 1), + ) case "fuzzy_match": intent = configs["intent"] assert isinstance(value, list) @@ -140,38 +150,15 @@ def __call__( return score -class StringSoftEvaluator(Evaluator): - """Use text generation metrics such as BLEU, ROUGE, etc. to evaluate the answer""" - - @beartype - def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | None = None, - client: CDPSession | None = None, - ) -> float: - with open(config_file, "r") as f: - configs = json.load(f) - - last_action = self.get_last_action(trajectory) - pred = last_action["answer"] - ref = configs["eval"]["reference_answers"] - # rouge - m = evaluate.load("rouge") - rouge = m.compute(predictions=[pred], references=[ref]) - return float(rouge["rouge1"]) - - -class URLExactEvaluator(Evaluator): - """Check whether the URL is exactly the same as of the reference URLs""" +class URLEvaluator(Evaluator): + """Check URL matching""" @beartype def __call__( self, trajectory: Trajectory, config_file: Path | str, - page: Page, + page: Page | PseudoPage, client: CDPSession | None = None, ) -> float: with open(config_file, "r") as f: @@ -179,29 +166,62 @@ def __call__( def clean_url(url: str) -> str: url = str(url) - if url.endswith("/"): - url = url[:-1] + url = url.rstrip("/") return url + def parse_url(url: str) -> tuple[str, dict[str, list[str]]]: + """Parse a URL into its base, path, and query components.""" + parsed_url = urllib.parse.urlparse(url) + base_path = parsed_url.netloc + parsed_url.path + query = urllib.parse.parse_qs(parsed_url.query) + return base_path, query + + def parse_urls( + urls: list[str], + ) 
-> tuple[list[str], dict[str, set[str]]]: + """Parse a list of URLs.""" + base_paths = [] + queries = collections.defaultdict(set) + for url in urls: + base_path, query = parse_url(url) + base_paths.append(base_path) + for k, v in query.items(): + queries[k].update(v) + return base_paths, queries + pred = clean_url(page.url) ref_urls = configs["eval"]["reference_url"].split(" |OR| ") ref_urls = [clean_url(url) for url in ref_urls] - matching_rule = configs["eval"].get("url_note", "EXACT") - if matching_rule == "EXACT": - if pred in ref_urls: - return 1.0 - else: - return 0.0 - elif matching_rule == "GOLD in PRED": - if any([ref in pred for ref in ref_urls]): - return 1.0 - else: - return 0.0 + matching_rule = configs["eval"].get("url_note", "GOLD in PRED") + if matching_rule == "GOLD in PRED": + ref_base_paths, ref_queries = parse_urls(ref_urls) + pred_base_paths, pred_query = parse_url(pred) + + base_score = float( + any( + [ + ref_base_path in pred_base_paths + for ref_base_path in ref_base_paths + ] + ) + ) + query_score = 1.0 + for k, possible_values in ref_queries.items(): + query_score *= float( + any( + possible_ref_value in pred_query.get(k, []) + for possible_ref_value in possible_values + ) + ) + score = base_score * query_score + else: raise ValueError(f"Unknown matching rule: {matching_rule}") + return score -class HTMLContentExactEvaluator(Evaluator): + +class HTMLContentEvaluator(Evaluator): """Check whether the contents appear in the page""" @beartype @@ -209,7 +229,7 @@ def __call__( self, trajectory: Trajectory, config_file: Path | str, - page: Page, + page: Page | PseudoPage, client: CDPSession | None = None, ) -> float: with open(config_file, "r") as f: @@ -236,12 +256,19 @@ def __call__( if not locator.strip(): selected_element = page.content() # use JS to select the element - elif locator.startswith("document."): + elif locator.startswith("document.") or locator.startswith( + "[...document." 
+ ): + if "prep_actions" in target: + try: + for prep_action in target["prep_actions"]: + page.evaluate(f"() => {prep_action}") + except Exception: + pass try: - selected_element = page.evaluate(f"() => {locator}") + selected_element = str(page.evaluate(f"() => {locator}")) if not selected_element: selected_element = "" - selected_element = str(selected_element) except Exception: # the page is wrong, return empty selected_element = "" @@ -253,96 +280,36 @@ def __call__( else: raise ValueError(f"Unknown locator: {locator}") + selected_element = html.unescape(selected_element) + if "exact_match" in target["required_contents"]: required_contents = target["required_contents"]["exact_match"] - score *= StringEvaluator.exact_match( + cur_score = StringEvaluator.exact_match( ref=required_contents, pred=selected_element ) + score *= float(cur_score) + # print(f"[exact match] {cur_score}, selected element: {selected_element}, required contents: {required_contents}") elif "must_include" in target["required_contents"]: required_contents = target["required_contents"]["must_include"] assert isinstance(required_contents, list) for content in required_contents: content_or = content.split(" |OR| ") - score *= any( + cur_score = any( [ StringEvaluator.must_include( - ref=content, pred=selected_element + ref=content, + pred=selected_element, + tokenize=False, ) for content in content_or ] ) + score *= float(cur_score) + # print(f"[must include] {cur_score}, selected element: {selected_element}, required contents: {content_or}") else: raise ValueError( f"Unknown required_contents: {target['required_contents'].keys()}" ) - - return score - - -###### -# soft matches. -# mainly for partial scores -# !!under development!! 
-# TODO[shuyanzh] -###### - - -class EvaluatorPartial(Evaluator): - def __init__(self) -> None: - raise NotImplementedError - - def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page, - client: CDPSession, - ) -> float: - raise NotImplementedError - - -class URLSoftEvaluator(EvaluatorPartial): - """Parse the URL and compare the domain and parameters""" - - def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page, - client: CDPSession, - ) -> float: - with open(config_file, "r") as f: - configs = json.load(f) - - last_state = self.get_last_state(trajectory) - pred = last_state["info"]["page"].url - ref = configs["eval"]["reference_url"] - - # parse url to get domain, parameters, etc. - parsed_pred = urllib.parse.urlparse(pred) - parsed_ref = urllib.parse.urlparse(ref) - - # check domain - domain_match = int(parsed_pred.netloc == parsed_ref.netloc) - - def get_param_set(query: dict[str, list[str]]) -> set[str]: - param_set = set() - for k, v in query.items(): - for vv in v: - param_set.add(f"{k}={vv}") - return param_set - - # calculate parameter f1 - param_set_ref = get_param_set(urllib.parse.parse_qs(parsed_ref.query)) - param_set_pred = get_param_set( - urllib.parse.parse_qs(parsed_pred.query) - ) - r = len(param_set_ref & param_set_pred) / len(param_set_ref) - p = len(param_set_ref & param_set_pred) / len(param_set_pred) - f1 = 2 * r * p / (r + p) if r + p > 0 else 1.0 - - score = domain_match * f1 # domain match is a must - return score @@ -355,7 +322,7 @@ def __call__( self, trajectory: Trajectory, config_file: Path | str, - page: Page, + page: Page | PseudoPage, client: CDPSession, ) -> float: @@ -363,7 +330,6 @@ def __call__( for evaluator in self.evaluators: cur_score = evaluator(trajectory, config_file, page, client) score *= cur_score - return score @@ -374,15 +340,15 @@ def evaluator_router(config_file: Path | str) -> EvaluatorComb: configs = json.load(f) eval_types = 
configs["eval"]["eval_types"] - evaluators: list[Evaluator | EvaluatorPartial] = [] + evaluators: list[Evaluator] = [] for eval_type in eval_types: match eval_type: case "string_match": evaluators.append(StringEvaluator()) case "url_match": - evaluators.append(URLExactEvaluator()) + evaluators.append(URLEvaluator()) case "program_html": - evaluators.append(HTMLContentExactEvaluator()) + evaluators.append(HTMLContentEvaluator()) case _: raise ValueError(f"eval_type {eval_type} is not supported") diff --git a/evaluation_harness/helper_functions.py b/evaluation_harness/helper_functions.py index 915ef1f..3906240 100644 --- a/evaluation_harness/helper_functions.py +++ b/evaluation_harness/helper_functions.py @@ -146,27 +146,41 @@ def gitlab_get_project_memeber_role(page: Page, account_name: str) -> str: def llm_fuzzy_match(pred: str, reference: str, question: str) -> float: """Check whether the prediction matches the reference with GPT-3.5""" messages: list[dict[str, Any]] = [] - messages.append( - {"role": "system", "content": "You are a helpful assistant"} - ) - - messages.append( - { - "role": "user", - "content": f'Given the statement "{pred}", would it be correct to infer "{reference}"? Yes or No', - } - ) + # construct the question to ask + message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n" + message += f"question: {question}\n" + message += f"reference answer: {reference}\n" + message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n" + message += f"student answer: {pred}\n" + message += "Conclude the judgement by correct/incorrect/partially correct." 
+ messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] response = generate_from_openai_chat_completion( + model="gpt-4", messages=messages, - model="gpt-3.5-turbo", temperature=0, - top_p=1, + max_tokens=768, + top_p=1.0, context_length=0, - max_tokens=16, - stop_token=None, - ) - if "Yes" in response: - return 1.0 - else: + ).lower() + if "partially correct" in response or "incorrect" in response: return 0.0 + else: + assert "correct" in response + return 1.0 + + +class PseudoPage: + def __init__(self, original_page: Page, url: str): + self.url = url + self.original_page = original_page + + def __getattr__(self, attr: str) -> Any: + # Delegate attribute access to the original page object + if attr not in ["url"]: + return getattr(self.original_page, attr) + else: + return getattr(self, attr) diff --git a/llms/__init__.py b/llms/__init__.py index 8dd1547..7a8c942 100644 --- a/llms/__init__.py +++ b/llms/__init__.py @@ -1 +1,14 @@ """This module is adapt from https://github.com/zeno-ml/zeno-build""" +from .providers.hf_utils import generate_from_huggingface_completion +from .providers.openai_utils import ( + generate_from_openai_chat_completion, + generate_from_openai_completion, +) +from .utils import call_llm + +__all__ = [ + "generate_from_openai_completion", + "generate_from_openai_chat_completion", + "generate_from_huggingface_completion", + "call_llm", +] diff --git a/llms/lm_config.py b/llms/lm_config.py index 6d67579..2156ef9 100644 --- a/llms/lm_config.py +++ b/llms/lm_config.py @@ -2,6 +2,7 @@ from __future__ import annotations +import argparse import dataclasses from dataclasses import dataclass from typing import Any @@ -27,3 +28,30 @@ class LMConfig: tokenizer_cls: type | None = None mode: str | None = None gen_config: dict[str, Any] = dataclasses.field(default_factory=dict) + + +def construct_llm_config(args: argparse.Namespace) -> LMConfig: + llm_config = LMConfig( + 
provider=args.provider, model=args.model, mode=args.mode + ) + if args.provider == "openai": + llm_config.gen_config["temperature"] = args.temperature + llm_config.gen_config["top_p"] = args.top_p + llm_config.gen_config["context_length"] = args.context_length + llm_config.gen_config["max_tokens"] = args.max_tokens + llm_config.gen_config["stop_token"] = args.stop_token + llm_config.gen_config["max_obs_length"] = args.max_obs_length + llm_config.gen_config["max_retry"] = args.max_retry + elif args.provider == "huggingface": + llm_config.gen_config["temperature"] = args.temperature + llm_config.gen_config["top_p"] = args.top_p + llm_config.gen_config["max_new_tokens"] = args.max_tokens + llm_config.gen_config["stop_sequences"] = ( + [args.stop_token] if args.stop_token else None + ) + llm_config.gen_config["max_obs_length"] = args.max_obs_length + llm_config.gen_config["model_endpoint"] = args.model_endpoint + llm_config.gen_config["max_retry"] = args.max_retry + else: + raise NotImplementedError(f"provider {args.provider} not implemented") + return llm_config diff --git a/llms/providers/hf_utils.py b/llms/providers/hf_utils.py new file mode 100644 index 0000000..b5e8987 --- /dev/null +++ b/llms/providers/hf_utils.py @@ -0,0 +1,21 @@ +from text_generation import Client # type: ignore + + +def generate_from_huggingface_completion( + prompt: str, + model_endpoint: str, + temperature: float, + top_p: float, + max_new_tokens: int, + stop_sequences: list[str] | None = None, +) -> str: + client = Client(model_endpoint, timeout=60) + generation: str = client.generate( + prompt=prompt, + temperature=temperature, + top_p=top_p, + max_new_tokens=max_new_tokens, + stop_sequences=stop_sequences, + ).generated_text + + return generation diff --git a/llms/providers/openai_utils.py b/llms/providers/openai_utils.py index 75d03ee..4dcdad2 100644 --- a/llms/providers/openai_utils.py +++ b/llms/providers/openai_utils.py @@ -19,7 +19,7 @@ def retry_with_exponential_backoff( # type: 
ignore initial_delay: float = 1, exponential_base: float = 2, jitter: bool = True, - max_retries: int = 10, + max_retries: int = 3, errors: tuple[Any] = (openai.error.RateLimitError,), ): """Retry a function with exponential backoff.""" @@ -32,9 +32,7 @@ def wrapper(*args, **kwargs): # type: ignore # Loop until a successful response or max_retries is hit or an exception is raised while True: try: - return func(*args, **kwargs) - # Retry on specified errors except errors as e: # Increment retries @@ -48,7 +46,7 @@ def wrapper(*args, **kwargs): # type: ignore # Increment the delay delay *= exponential_base * (1 + jitter * random.random()) - + print(f"Retrying in {delay} seconds.") # Sleep for the delay time.sleep(delay) @@ -115,6 +113,7 @@ async def agenerate_from_openai_completion( "OPENAI_API_KEY environment variable must be set when using OpenAI API." ) openai.api_key = os.environ["OPENAI_API_KEY"] + openai.organization = os.environ.get("OPENAI_ORGANIZATION", "") limiter = aiolimiter.AsyncLimiter(requests_per_minute) async_responses = [ @@ -147,6 +146,7 @@ def generate_from_openai_completion( "OPENAI_API_KEY environment variable must be set when using OpenAI API." ) openai.api_key = os.environ["OPENAI_API_KEY"] + openai.organization = os.environ.get("OPENAI_ORGANIZATION", "") response = openai.Completion.create( # type: ignore prompt=prompt, engine=engine, @@ -218,6 +218,7 @@ async def agenerate_from_openai_chat_completion( "OPENAI_API_KEY environment variable must be set when using OpenAI API." ) openai.api_key = os.environ["OPENAI_API_KEY"] + openai.organization = os.environ.get("OPENAI_ORGANIZATION", "") limiter = aiolimiter.AsyncLimiter(requests_per_minute) async_responses = [ @@ -250,6 +251,7 @@ def generate_from_openai_chat_completion( "OPENAI_API_KEY environment variable must be set when using OpenAI API." 
) openai.api_key = os.environ["OPENAI_API_KEY"] + openai.organization = os.environ.get("OPENAI_ORGANIZATION", "") response = openai.ChatCompletion.create( # type: ignore model=model, @@ -279,5 +281,6 @@ def fake_generate_from_openai_chat_completion( "OPENAI_API_KEY environment variable must be set when using OpenAI API." ) openai.api_key = os.environ["OPENAI_API_KEY"] + openai.organization = os.environ.get("OPENAI_ORGANIZATION", "") answer = "Let's think step-by-step. This page shows a list of links and buttons. There is a search box with the label 'Search query'. I will click on the search box to type the query. So the action I will perform is \"click [60]\"." return answer diff --git a/llms/tokenizers.py b/llms/tokenizers.py index 24763a6..8e45ccf 100644 --- a/llms/tokenizers.py +++ b/llms/tokenizers.py @@ -1,14 +1,27 @@ from typing import Any import tiktoken +from transformers import LlamaTokenizer # type: ignore class Tokenizer(object): - def __init__(self, model_name: str) -> None: - if model_name in ["gpt-4", "gpt-turbo-3.5"]: + def __init__(self, provider: str, model_name: str) -> None: + if provider == "openai": self.tokenizer = tiktoken.encoding_for_model(model_name) + elif provider == "huggingface": + self.tokenizer = LlamaTokenizer.from_pretrained(model_name) + # turn off adding special tokens automatically + self.tokenizer.add_special_tokens = False # type: ignore[attr-defined] + self.tokenizer.add_bos_token = False # type: ignore[attr-defined] + self.tokenizer.add_eos_token = False # type: ignore[attr-defined] else: raise NotImplementedError + def encode(self, text: str) -> list[int]: + return self.tokenizer.encode(text) + + def decode(self, ids: list[int]) -> str: + return self.tokenizer.decode(ids) + def __call__(self, text: str) -> list[int]: return self.tokenizer.encode(text) diff --git a/llms/utils.py b/llms/utils.py new file mode 100644 index 0000000..ea91a10 --- /dev/null +++ b/llms/utils.py @@ -0,0 +1,60 @@ +import argparse +from typing import 
Any + +from llms import ( + generate_from_huggingface_completion, + generate_from_openai_chat_completion, + generate_from_openai_completion, + lm_config, +) + +APIInput = str | list[Any] | dict[str, Any] + + +def call_llm( + lm_config: lm_config.LMConfig, + prompt: APIInput, +) -> str: + response: str + if lm_config.provider == "openai": + if lm_config.mode == "chat": + assert isinstance(prompt, list) + response = generate_from_openai_chat_completion( + messages=prompt, + model=lm_config.model, + temperature=lm_config.gen_config["temperature"], + top_p=lm_config.gen_config["top_p"], + context_length=lm_config.gen_config["context_length"], + max_tokens=lm_config.gen_config["max_tokens"], + stop_token=None, + ) + elif lm_config.mode == "completion": + assert isinstance(prompt, str) + response = generate_from_openai_completion( + prompt=prompt, + engine=lm_config.model, + temperature=lm_config.gen_config["temperature"], + max_tokens=lm_config.gen_config["max_tokens"], + top_p=lm_config.gen_config["top_p"], + stop_token=lm_config.gen_config["stop_token"], + ) + else: + raise ValueError( + f"OpenAI models do not support mode {lm_config.mode}" + ) + elif lm_config.provider == "huggingface": + assert isinstance(prompt, str) + response = generate_from_huggingface_completion( + prompt=prompt, + model_endpoint=lm_config.gen_config["model_endpoint"], + temperature=lm_config.gen_config["temperature"], + top_p=lm_config.gen_config["top_p"], + stop_sequences=lm_config.gen_config["stop_sequences"], + max_new_tokens=lm_config.gen_config["max_new_tokens"], + ) + else: + raise NotImplementedError( + f"Provider {lm_config.provider} not implemented" + ) + + return response diff --git a/parallel_run.sh b/parallel_run.sh new file mode 100644 index 0000000..fb56cc3 --- /dev/null +++ b/parallel_run.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +result_dir="cache/919_gpt35_16k_cot_na" +model="gpt-3.5-turbo-16k-0613" +instruction_path="agent/prompts/jsons/p_cot_id_actree_2s.json" + +SERVER="" 
+OPENAI_API_KEY="" +OPENAI_ORGANIZATION="" +CONDA_ENV_NAME="webarena" +ENV_VARIABLES="export SHOPPING='http://${SERVER}:7770';export SHOPPING_ADMIN='http://${SERVER}:7780/admin';export REDDIT='http://${SERVER}:9999';export GITLAB='http://${SERVER}:8023';export MAP='http://miniserver1875.asuscomm.com:3000';export WIKIPEDIA='http://${SERVER}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:4399';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION}" + +# get the number of tmux panes +num_panes=$(tmux list-panes | wc -l) + +# calculate how many panes need to be created +let "panes_to_create = 5 - num_panes" + +# array of tmux commands to create each pane +tmux_commands=( + 'tmux split-window -h' + 'tmux split-window -v' + 'tmux select-pane -t 0; tmux split-window -v' + 'tmux split-window -v' + 'tmux select-pane -t 3; tmux split-window -v' +) + +# create panes up to 5 +for ((i=0; i<$panes_to_create; i++)); do + eval ${tmux_commands[$i]} +done + +#!/bin/bash + +# Function to run a job +run_job() { + tmux select-pane -t $1 + tmux send-keys "conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until python run.py --test_start_idx $2 --test_end_idx $3 --model ${model} --instruction_path ${instruction_path} --result_dir ${result_dir}; do echo 'crashed' >&2; sleep 1; done" C-m + sleep 3 +} + +TOLERANCE=2 +run_batch() { + args=("$@") # save all arguments in an array + num_jobs=${#args[@]} # get number of arguments + + for ((i=1; i<$num_jobs; i++)); do + run_job $i ${args[i-1]} ${args[i]} + done + + # Wait for all jobs to finish + while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do + sleep 100 # wait for 10 seconds before checking again + done + + # Run checker + while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do + echo "Check failed, rerunning jobs..." 
+ for ((i=1; i<$num_jobs; i++)); do + run_job $i ${args[i-1]} ${args[i]} + done + + # Wait for all jobs to finish + while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do + sleep 100 # wait for 10 seconds before checking again + done + done + +} + +run_batch 0 100 200 300 380 +run_batch 380 480 580 680 770 +run_batch 770 812 diff --git a/requirements.txt b/requirements.txt index 2567aa5..b2f109b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ aiolimiter beartype==0.12.0 flask nltk +text-generation +transformers diff --git a/run.py b/run.py index c4781c2..cee3c98 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,8 @@ import logging import os import random +import subprocess +import tempfile import time from pathlib import Path @@ -26,6 +28,7 @@ create_stop_action, ) from browser_env.actions import is_equivalent +from browser_env.auto_login import get_site_comb_from_filepath from browser_env.helper_functions import ( RenderHelper, get_action_description, @@ -116,12 +119,24 @@ def config() -> argparse.Namespace: parser.add_argument("--context_length", type=int, default=0) parser.add_argument("--max_tokens", type=int, default=384) parser.add_argument("--stop_token", type=str, default=None) + parser.add_argument( + "--max_retry", + type=int, + help="max retry times to perform generations when parsing fails", + default=1, + ) parser.add_argument( "--max_obs_length", type=int, help="when not zero, will truncate the observation to this length before feeding to the model", default=1920, ) + parser.add_argument( + "--model_endpoint", + help="huggingface model endpoint", + type=str, + default="", + ) # example config parser.add_argument("--test_start_idx", type=int, default=0) @@ -236,6 +251,28 @@ def test( _c = json.load(f) intent = _c["intent"] task_id = _c["task_id"] + # automatically login + if _c["storage_state"]: + cookie_file_name = os.path.basename(_c["storage_state"]) + comb = get_site_comb_from_filepath(cookie_file_name) + 
temp_dir = tempfile.mkdtemp() + # subprocess to renew the cookie + subprocess.run( + [ + "python", + "browser_env/auto_login.py", + "--auth_folder", + temp_dir, + "--site_list", + *comb, + ] + ) + _c["storage_state"] = f"{temp_dir}/{cookie_file_name}" + assert os.path.exists(_c["storage_state"]) + # update the config file + config_file = f"{temp_dir}/{os.path.basename(config_file)}" + with open(config_file, "w") as f: + json.dump(_c, f) logger.info(f"[Config file]: {config_file}") logger.info(f"[Intent]: {intent}") @@ -376,7 +413,7 @@ def dump_config(args: argparse.Namespace) -> None: if __name__ == "__main__": args = config() - args.sleep_after_execution = 2.5 + args.sleep_after_execution = 2.0 prepare(args) test_file_list = [] @@ -384,14 +421,19 @@ def dump_config(args: argparse.Namespace) -> None: ed_idx = args.test_end_idx for i in range(st_idx, ed_idx): test_file_list.append(f"config_files/{i}.json") - test_file_list = get_unfinished(test_file_list, args.result_dir) - print(f"Total {len(test_file_list)} tasks left") - args.render = True - args.render_screenshot = True - args.save_trace_enabled = True + if "debug" not in args.result_dir: + test_file_list = get_unfinished(test_file_list, args.result_dir) + + if len(test_file_list) == 0: + logger.info("No task left to run") + else: + print(f"Total {len(test_file_list)} tasks left") + args.render = False + args.render_screenshot = True + args.save_trace_enabled = True - args.current_viewport_only = True - dump_config(args) + args.current_viewport_only = True + dump_config(args) - agent = construct_agent(args) - test(args, agent, test_file_list) + agent = construct_agent(args) + test(args, agent, test_file_list) diff --git a/scripts/check_error_runs.py b/scripts/check_error_runs.py new file mode 100644 index 0000000..0039b56 --- /dev/null +++ b/scripts/check_error_runs.py @@ -0,0 +1,157 @@ +"""Some executions may failed. +This script checks the recordings, print the task ids. 
+It deletes the recordings if needed.""" +import argparse +import glob +import os +import shutil +import sys + + +def merge_logs(result_folder: str, args: argparse.Namespace) -> str: + if not os.path.exists(f"{result_folder}/log_files.txt"): + sys.exit(1) + + with open(f"{result_folder}/log_files.txt", "r") as f: + log_files = f.readlines() + + merged_results = {} + for file in log_files: + with open(file.strip(), "r") as f: + lines = f.readlines() + + cur_log: list[str] = [] + index = None + for line in lines: + if "[Config file]" in line: + if ( + cur_log + and index + and os.path.exists(f"{result_folder}/render_{index}.html") + and len(cur_log) >= 3 + ): + merged_results[index] = cur_log + # update index and log + index = line.split("/")[-1].split(".")[0] + cur_log = [line] + else: + cur_log.append(line) + + if ( + cur_log + and index + and os.path.exists(f"{result_folder}/render_{index}.html") + and len(cur_log) >= 3 + ): + + merged_results[index] = cur_log + + # sort by the key + merged_results = dict( + sorted(merged_results.items(), key=lambda x: int(x[0])) + ) + + merged_log_path = f"{result_folder}/tmp_merged_log.txt" + with open(merged_log_path, "w") as f: + for k, v in merged_results.items(): + for line in v: + f.write(line) + print(f"Number of examples: {len(merged_results)}") + + unlog_examples = [] + for i in range(812): + if ( + os.path.exists(f"{result_folder}/render_{i}.html") + and str(i) not in merged_results + ): + unlog_examples.append(i) + + print(f"Number of unlogged examples: {len(unlog_examples)}") + print(unlog_examples) + if ( + args.delete_errors + or input("Do you want to delete these examples? 
(y/n)") == "y" + ): + for idx in unlog_examples: + os.remove(f"{args.result_folder}/render_{idx}.html") + + unifinished_examples = [ + i for i in range(0, 812) if str(i) not in merged_results + ] + print(f"Number of unfinished examples: {len(unifinished_examples)}") + print(unifinished_examples) + + return merged_log_path + + +def check_unhandled_errors(args: argparse.Namespace) -> int: + log_path = merge_logs(args.result_folder, args) + with open(log_path, "r") as f: + logs = f.read() + + error_examples = [] + for line in logs.split("\n"): + if "[Config file]" in line: + example_idx = line.split("/")[-1].split(".")[0] + if "[Unhandled Error]" in line or "[OpenAI Error]" in line: + error_examples.append(int(example_idx)) + + num_errors = len(error_examples) + print(f"Number of unhandled errors: {len(error_examples)}") + print(error_examples) + if ( + args.delete_errors + or input("Do you want to delete these examples? (y/n)") == "y" + ): + for idx in error_examples: + if os.path.exists(f"{args.result_folder}/render_{idx}.html"): + os.remove(f"{args.result_folder}/render_{idx}.html") + return num_errors + + +def check_unexpected_logout(args: argparse.Namespace) -> int: + target_strings = set( + [ + "Creating an account has many benefits: check out faster", + "Welcome, please sign in", + "Username or email", + "Keep me logged in", + ] + ) + + error_examples = [] + for render_file in glob.glob(f"{args.result_folder}/render_*.html"): + with open(render_file, "r") as f: + contents = f.read() + if any([s in contents for s in target_strings]): + task_id = int( + render_file.split("/")[-1].split(".")[0].split("_")[-1] + ) + error_examples.append(task_id) + print(f"Number of unexpected logout: {len(error_examples)}") + print(error_examples) + num_errors = len(error_examples) + if ( + args.delete_errors + or input("Do you want to delete these examples? 
(y/n)") == "y" + ): + for idx in error_examples: + if os.path.exists(f"{args.result_folder}/render_{idx}.html"): + os.remove(f"{args.result_folder}/render_{idx}.html") + + return num_errors + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("result_folder", type=str) + parser.add_argument("--delete_errors", action="store_true") + parser.add_argument("--tolerance", type=int, default=0) + + args = parser.parse_args() + n1 = check_unhandled_errors(args) + n2 = check_unexpected_logout(args) + if n1 + n2 > args.tolerance: + sys.exit(1) + else: + sys.exit(0) diff --git a/tests/test_evaluation_harness/configs/func_url_func_1.json b/tests/test_evaluation_harness/configs/func_url_func_1.json index 7dbd8a2..993a246 100644 --- a/tests/test_evaluation_harness/configs/func_url_func_1.json +++ b/tests/test_evaluation_harness/configs/func_url_func_1.json @@ -17,7 +17,7 @@ { "url": "func:reddit_get_post_url('__last_url__')", "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": {"must_include": ["​"]} + "required_contents": {"must_include": ["How will SPY close on Monday 11/28"]} } ] } diff --git a/tests/test_evaluation_harness/test_exact_evaluators.py b/tests/test_evaluation_harness/test_evaluators.py similarity index 94% rename from tests/test_evaluation_harness/test_exact_evaluators.py rename to tests/test_evaluation_harness/test_evaluators.py index 9715ccf..bef0db6 100644 --- a/tests/test_evaluation_harness/test_exact_evaluators.py +++ b/tests/test_evaluation_harness/test_evaluators.py @@ -12,9 +12,9 @@ from browser_env import ActionTypes, ScriptBrowserEnv from browser_env.env_config import * from evaluation_harness import ( - HTMLContentExactEvaluator, + HTMLContentEvaluator, StringEvaluator, - URLExactEvaluator, + URLEvaluator, ) from evaluation_harness.evaluators import EvaluatorComb @@ -99,7 +99,7 @@ def test_url_exact_match_success(script_browser_env: ScriptBrowserEnv) -> None: trajectory = 
tf_roll_out(agent, env, config_file) - evalutor = URLExactEvaluator() + evalutor = URLEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -119,7 +119,7 @@ def test_url_exact_match_fail(script_browser_env: ScriptBrowserEnv) -> None: trajectory = tf_roll_out(agent, env, config_file) - evalutor = URLExactEvaluator() + evalutor = URLEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -143,7 +143,7 @@ def test_html_content_match_success( trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -164,7 +164,7 @@ def test_html_content_match_fail(script_browser_env: ScriptBrowserEnv) -> None: trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -189,7 +189,7 @@ def test_html_content_element_match_success( trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -214,7 +214,7 @@ def test_html_content_element_match_fail( trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -239,9 +239,7 @@ def test_html_content_url_comb_success( trajectory = tf_roll_out(agent, env, config_file) - evaluators = EvaluatorComb( - [URLExactEvaluator(), HTMLContentExactEvaluator()] - ) + evaluators = EvaluatorComb([URLEvaluator(), HTMLContentEvaluator()]) score = evaluators( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -264,7 +262,7 @@ def 
test_func_success( env = script_browser_env trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -287,7 +285,7 @@ def test_func_fail( env = script_browser_env trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -308,7 +306,7 @@ def test_func_url_func_last_success( env = script_browser_env trajectory = tf_roll_out(agent, env, config_file) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, config_file, env.page, env.get_page_client(env.page) ) @@ -341,7 +339,7 @@ def test_func_url_func_page_success( env = script_browser_env trajectory = tf_roll_out(agent, env, tmp_config) - evalutor = HTMLContentExactEvaluator() + evalutor = HTMLContentEvaluator() score = evalutor( trajectory, tmp_config, env.page, env.get_page_client(env.page) )