diff --git a/miniwob/action.py b/miniwob/action.py index 63aff422..52ac7d43 100644 --- a/miniwob/action.py +++ b/miniwob/action.py @@ -1,156 +1,178 @@ """MiniWoB action space.""" -import logging -from enum import IntEnum -from typing import Any, Dict +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, Optional, Sequence, Set, Tuple, Union import numpy as np from gymnasium import spaces from selenium.webdriver import Chrome as ChromeDriver -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.common.by import By -from miniwob.constants import ASCII_CHARSET, MAX_REF, TYPING_MAX_LENGTH +from miniwob import selenium_actions +from miniwob.constants import ( + ASCII_CHARSET, + DEFAULT_ALLOWED_KEYS, + MAX_FIELDS, + MAX_REF, + TYPING_MAX_LENGTH, +) Action = Dict[str, Any] -class ActionTypes(IntEnum): +class ActionTypes(str, Enum): """Valid action types for MiniWoB environments.""" - NONE = 0 - COORD_CLICK = 1 - ELEMENT_CLICK = 2 - TYPE = 3 - FOCUS_AND_TYPE = 4 - - -def get_action_space(screen_width: int, screen_height: int) -> spaces.Space: - """Return the space of serialized actions.""" - space = spaces.Dict( - { - "action_type": spaces.Discrete(len(ActionTypes)), - # coords (left, top) is used for COORD_CLICK - "coords": spaces.Box( - np.array([0.0, 0.0], dtype=np.float32), - np.array([screen_width, screen_height], dtype=np.float32), - ), - # ref (element ref ID) is used for ELEMENT_CLICK and FOCUS_AND_TYPE - "ref": spaces.Discrete(MAX_REF, start=1), - # text is only used for TYPE and FOCUS_AND_TYPE - "text": spaces.Text(TYPING_MAX_LENGTH, charset=ASCII_CHARSET), - } - ) - return space - - -def create_none_action() -> Action: - """Return a valid action object that does nothing.""" - return { - "action_type": ActionTypes.NONE, - "coords": np.zeros(2, dtype=np.float32), - "ref": 1, - "text": " ", - } - - -def create_coord_click_action(left: float, top: float) -> Action: - """Return a valid action object with type COORD_CLICK.""" - action = create_none_action() - action.update( - { - "action_type": ActionTypes.COORD_CLICK, - "coords": np.array([left, top], dtype=np.float32), - } - ) - return action - - -def create_element_click_action(ref: int) -> Action: - """Return a valid action object with type ELEMENT_CLICK.""" - action = create_none_action() - action.update( - { - "action_type": ActionTypes.ELEMENT_CLICK, - "ref": ref, - } - ) - return action - - -def create_type_action(text: str) -> Action: - """Return a valid action object with type TYPE.""" - action = create_none_action() - action.update( - { - "action_type": ActionTypes.TYPE, - "text": text, - } - ) - return action - - -def create_focus_and_type_action(ref: int, text: str) -> Action: - """Return a valid action object with type FOCUS_AND_TYPE.""" - action = create_none_action() - action.update( - { - "action_type": ActionTypes.FOCUS_AND_TYPE, - "ref": ref, - "text": text, - } - ) - return action - - -def execute_coord_click(left: float, top: float, driver: ChromeDriver): - """Click at coordinates (left, top).""" - body = driver.find_element(By.TAG_NAME, "body") - # The offset is from the center, not top-left. - x = -body.size["width"] / 2 + left - y = -body.size["height"] / 2 + top - chain = ActionChains(driver) - chain.move_to_element_with_offset(body, x, y).click().perform() - - -def execute_element_click(ref: int, driver: ChromeDriver): - """Click on the DOM element specified by a ref ID.""" - # TODO: Handle correctly. + result = driver.execute_script(f"return core.elementClick({ref});") + if result is not True: + logging.warning("Clicking %s failed: %s", ref, result) + + +def execute_type(text: str, driver: ChromeDriver): + """Send keystrokes to the focused element.""" + chain = ActionChains(driver) + chain.send_keys(text) + chain.perform() + + +def execute_focus_and_type(ref: int, text: str, driver: ChromeDriver): + """Click the specified DOM element and then send keystrokes.""" + execute_element_click(ref, driver) + execute_type(text, driver) diff --git a/pyproject.toml b/pyproject.toml index ae64d69c..9f7ae664 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,7 @@ include = [ "miniwob/**", + "tests/**", ] exclude = [ diff --git a/tests/test_action.py b/tests/test_action.py index 54a3e4be..f17b103f 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -2,14 +2,10 @@ from typing import Mapping, Tuple import gymnasium +import numpy as np import pytest -from miniwob.action import ( - create_coord_click_action, - create_element_click_action, - create_focus_and_type_action, - create_type_action, -) +from miniwob.action import ActionSpaceConfig, ActionTypes class RepeatedTester: @@ -23,14 +19,25 @@ class RepeatedTester: MAX_STEPS = 1 # Fragile tasks need longer wait time and single instance FRAGILE = False + # Supported actions in the action space + SUPPORTED_ACTIONS = [ + ActionTypes.NONE, + ActionTypes.CLICK_COORDS, + ActionTypes.CLICK_ELEMENT, + ActionTypes.TYPE_TEXT, + ActionTypes.FOCUS_ELEMENT_AND_TYPE_TEXT, + ] @pytest.fixture def env(self): """Yield an environment for the task.""" + action_space_config = ActionSpaceConfig(action_types=self.SUPPORTED_ACTIONS) if self.FRAGILE: - env = gymnasium.make(self.ENV_NAME, wait_ms=300) + env = gymnasium.make( + self.ENV_NAME, action_space_config=action_space_config, wait_ms=300 + ) else: - env = gymnasium.make(self.ENV_NAME) + env = gymnasium.make(self.ENV_NAME, action_space_config=action_space_config) yield env env.close() @@ -41,7 +48,7 @@ def test_run(self, env): obs, info = env.reset() reward = -1 for step in range(self.MAX_STEPS): - action = self._get_action(obs, info, step) + action = self._get_action(env, obs, info, step) obs, reward, terminated, _, _ = env.step(action) assert reward >= 0 if terminated: @@ -50,22 +57,54 @@ def test_run(self, env): assert False, f"Number of steps exceeded {self.MAX_STEPS}" assert reward >= 0 - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): """Return a MiniWoBAction that clicks the right thing.""" raise NotImplementedError - def click_button(self, obs, text): + def create_click_element_action(self, env, element): + """Create an action that clicks in the specified element.""" + action = env.action_space.sample() + action["action_type"] = self.SUPPORTED_ACTIONS.index(ActionTypes.CLICK_ELEMENT) + action["ref"] = element["ref"] + return action + + def create_click_button_action(self, env, obs, button_text): """Create an action that clicks on the button with the specified text.""" for element in obs["dom_elements"]: - if element["tag"] == "button" and element["text"] == text: - return create_element_click_action(element["ref"]) - assert False, "Submit button not found" + if element["tag"] == "button" and element["text"] == button_text: + return self.create_click_element_action(env, element) + assert False, f"{button_text} button not found" + + def create_click_coords_action(self, env, left, top): + """Create an action that clicks on the specified coordinates.""" + action = env.action_space.sample() + action["action_type"] = self.SUPPORTED_ACTIONS.index(ActionTypes.CLICK_COORDS) + action["coords"] = np.array([left, top], dtype=np.float32) + return action - def create_coord_click_action(self, element): - """Create an action that clicks on the element using CoordClick.""" + def create_click_element_center_action(self, env, element): + """Create an action that clicks the element's center.""" left, top = element["pos"].tolist() width, height = element["size"].tolist() - action = create_coord_click_action(left + (width / 2), top + (height / 2)) + return self.create_click_coords_action( + env, left + (width / 2), top + (height / 2) + ) + + def create_type_action(self, env, text): + """Create an action that types text.""" + action = env.action_space.sample() + action["action_type"] = self.SUPPORTED_ACTIONS.index(ActionTypes.TYPE_TEXT) + action["text"] = text + return action + + def create_focus_and_type_action(self, env, element, text): + """Create an action that focuses on the element and types text.""" + action = env.action_space.sample() + action["action_type"] = self.SUPPORTED_ACTIONS.index( + ActionTypes.FOCUS_ELEMENT_AND_TYPE_TEXT + ) + action["ref"] = element["ref"] + action["text"] = text return action @@ -78,10 +117,10 @@ class TestClickTest2(RepeatedTester): ENV_NAME = "miniwob/click-test-2-v1" - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): for element in obs["dom_elements"]: if element["tag"] == "button" and element["text"] == "ONE": - return create_element_click_action(element["ref"]) + return self.create_click_element_action(env, element) # No button is found, which is weird assert False, 'Button "ONE" not found' @@ -91,11 +130,11 @@ class TestClickButton(RepeatedTester): ENV_NAME = "miniwob/click-button-v1" - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): target = info["fields"]["target"] for element in obs["dom_elements"]: if element["tag"] == "button" and element["text"] == target: - return self.create_coord_click_action(element) + return self.create_click_element_center_action(env, element) # No button is found, which is weird assert False, f'Button "{target}" not found' @@ -105,10 +144,10 @@ class TestFocusText(RepeatedTester): ENV_NAME = "miniwob/focus-text-v1" - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): for element in obs["dom_elements"]: if element["tag"] == "input_text": - return self.create_coord_click_action(element) + return self.create_click_element_center_action(env, element) # No input is found, which is weird assert False, "Input box not found" @@ -118,11 +157,11 @@ class TestIdentifyShape(RepeatedTester): ENV_NAME = "miniwob/identify-shape-v1" - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): shape = self._identify_shape(obs) for element in obs["dom_elements"]: if element["tag"] == "button" and element["text"] == shape: - return create_element_click_action(element["ref"]) + return self.create_click_element_action(env, element) # No button is found, which is weird assert False, f'Button "{shape}" not found' @@ -150,13 +189,13 @@ class TestClickDialog2(RepeatedTester): ENV_NAME = "miniwob/click-dialog-2-v1" - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): target = info["fields"]["target"] if target == "x": target = "" for element in obs["dom_elements"]: if element["tag"] == "button" and element["text"] == target: - return create_element_click_action(element["ref"]) + return self.create_click_element_action(env, element) # No button is found, which is weird assert False, f'Button "{target}" not found' @@ -171,13 +210,13 @@ class TestEnterText(RepeatedTester): ENV_NAME = "miniwob/enter-text-v1" MAX_STEPS = 3 - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): if step == 0: # Click on the textbox for element in obs["dom_elements"]: if element["tag"] == "input_text": assert not element["flags"][0].item() - return create_element_click_action(element["ref"]) + return self.create_click_element_action(env, element) assert False, "Input text not found" elif step == 1: # Assert that the input is focused @@ -192,10 +231,10 @@ def _get_action(self, obs, info, step): if len(target) > 2: # Hmm... Let's try the LEFT arrow key target = target[:-2] + target[-1] + "\ue012" + target[-2] - return create_type_action(target) + return self.create_type_action(env, target) elif step == 2: # Click submit - return self.click_button(obs, "Submit") + return self.create_click_button_action(env, obs, "Submit") class TestEnterTextFocusAndType(RepeatedTester): @@ -204,17 +243,17 @@ class TestEnterTextFocusAndType(RepeatedTester): ENV_NAME = "miniwob/enter-text-v1" MAX_STEPS = 2 - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): if step == 0: # Type into the textbox target = info["fields"]["target"] for element in obs["dom_elements"]: if element["tag"] == "input_text": - return create_focus_and_type_action(element["ref"], target) + return self.create_focus_and_type_action(env, element, target) assert False, "Input text not found" elif step == 1: # Click submit - return self.click_button(obs, "Submit") + return self.create_click_button_action(env, obs, "Submit") class TestClickCheckboxes(RepeatedTester): @@ -223,7 +262,7 @@ class TestClickCheckboxes(RepeatedTester): ENV_NAME = "miniwob/click-checkboxes-v1" MAX_STEPS = 7 - def _get_action(self, obs, info, step): + def _get_action(self, env, obs, info, step): if not obs: return # print obs.dom.visualize() @@ -240,10 +279,10 @@ def _get_action(self, obs, info, step): continue elif text["text"] in things_to_click: # Click on