Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions environments/online_mind2web/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
gcp.json
test.ipynb
36 changes: 36 additions & 0 deletions environments/online_mind2web/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use our HUD base browser image with Playwright and uv pre-installed
# NOTE(review): `latest` is a floating tag; pin a specific version if
# reproducible builds are required.
FROM hudpython/base-browser:latest

# Create app-specific working directory
WORKDIR /app

# Copy project files
COPY pyproject.toml ./
COPY src/ ./src/

# Install the package using the existing venv at /opt/venv
# The --python flag tells uv to use this specific Python instead of creating a new venv
RUN uv pip install --python /opt/venv -e .

# Create directories for logs and data
RUN mkdir -p /app/logs /app/data

# Display dimensions exported to the container environment
# (presumably consumed by the browser/computer tools - confirm in the controller code)
ENV DISPLAY_WIDTH=1448
ENV DISPLAY_HEIGHT=944

# PYTHONUNBUFFERED: flush stdout/stderr immediately so logs are not delayed.
# PYTHONDONTWRITEBYTECODE: do not write .pyc files inside the container.
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
# Note: Environment variables for browser providers should be set at runtime:
# - BROWSER_PROVIDER: anchorbrowser, steel, browserbase, hyperbrowser, kernel
# - Provider-specific API keys: ANCHOR_API_KEY, STEEL_API_KEY, etc.
# - GCP_CREDENTIALS_JSON: For Google Sheets functionality (if needed)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: remove this line, not relevant in OM2W


# Run remote browser with persistent context
# The context server is launched in the background (its stdout redirected to
# stderr), then the MCP server replaces the shell via `exec` and runs in the
# foreground.
# NOTE(review): `sleep 2` is a fixed delay, not a readiness check - the MCP
# server may start before the context server is ready on a slow host.
CMD ["sh", "-c", "\
# Start context server in background \
python3 -m hud_controller.context >&2 & \
# Wait a bit for context server to start \
sleep 2 && \
# Run MCP server in foreground with exec \
exec python3 -m hud_controller.server \
"]
36 changes: 36 additions & 0 deletions environments/online_mind2web/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# HUD Online Mind2Web Taskset

Based on the HUD remote-browser environment, this MCP server provides an environment for Online-Mind2Web task execution and evaluation.

## Running with Docker

The Docker image supports both production and development modes using the same Dockerfile.

### Building the Image

```bash
# Production build (default)
docker build -t hud-om2w:latest .
```

### Running the Test Task
```bash
hud eval ./test_task.json
```

### Running Whole Online-Mind2Web Dataset From HuggingFace
```bash
hud eval Genteki/Online-Mind2Web --full --max-concurrent=5
```

### Different Evaluation Method

To choose a different evaluation method, change the `task["evaluate_tool"]["evaluate"]["name"]` value in the task JSON file. The supported evaluation methods are:

| Evaluation Method | Final Screenshot | Screenshot History | Action History |
|:---|:---:|:---:| :---: |
| `autonomous` | ✔ | ✗ | ✔ |
| `webjudge` | ✔ | ✔ | ✔ |
| `overall_judge`[^1] | - | - | - |

[^1]: `overall_judge` executes all of the evaluation methods above and returns the average of their rewards.
22 changes: 22 additions & 0 deletions environments/online_mind2web/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[project]
name = "hud-om2w"
version = "0.1.0"
description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
requires-python = ">=3.11,<3.13"
dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]

[build-system]
requires = [ "hatchling",]
build-backend = "hatchling.build"

[project.scripts]
hud-om2w = "hud_controller.__main__:main"

[tool.hud]
image = "hud-om2w:dev"

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = [ "src/hud_controller",]
3 changes: 3 additions & 0 deletions environments/online_mind2web/src/hud_controller/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Online Mind2Web environment, adapted from ../remote-browser."""

__version__ = "0.1.0"
139 changes: 139 additions & 0 deletions environments/online_mind2web/src/hud_controller/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""
Context server for remote browser environment that persists state across hot-reloads.

Run this as a separate process to maintain browser session state during development:
python -m hud_controller.context
"""

import asyncio
import logging
from datetime import datetime
from typing import Dict, Any, Optional
from hud.server.context import run_context_server

logger = logging.getLogger(__name__)


class RemoteBrowserContext:
    """Context that holds remote browser state across reloads.

    The object is plain in-memory state: a browser provider, its
    configuration and launch options, a playwright tool, and the telemetry
    dictionary reported for the running browser. All access goes through
    explicit getter/setter methods (see the note above them).
    """

    def __init__(self):
        """Initialize the remote browser context with empty state."""
        self.browser_provider = None
        # Defined up front so the attribute always exists, even before a
        # provider has been set (previously it was only created lazily in
        # set_browser_provider, so early reads raised AttributeError).
        self.provider_name: Optional[str] = None
        self.is_initialized = False
        self.provider_config: Optional[Dict[str, Any]] = None
        self.launch_options: Optional[Dict[str, Any]] = None
        self._startup_complete = False
        self.playwright_tool = None  # Store the playwright tool
        self._telemetry: Optional[Dict[str, Any]] = None  # Store full telemetry data

        logger.info("[RemoteBrowserContext] Created new remote browser context")

    def startup(self):
        """Run the one-time startup step; subsequent calls are no-ops."""
        if self._startup_complete:
            logger.info("[RemoteBrowserContext] Startup already complete, skipping")
            return

        logger.info("[RemoteBrowserContext] Performing one-time startup")
        self._startup_complete = True

    # === Proxy-friendly methods for multiprocessing.Manager ===
    # Note: These are needed because direct attribute access doesn't always
    # work correctly through the multiprocessing proxy

    def get_browser_provider(self):
        """Get the browser provider instance (or None if unset)."""
        return self.browser_provider

    def set_browser_provider(self, provider) -> None:
        """Set the browser provider instance.

        ``provider_name`` is derived from the provider's class name with the
        ``Provider`` substring removed and lower-cased. Passing a falsy
        provider clears the name so it never describes a provider that is no
        longer attached.
        """
        self.browser_provider = provider
        if provider:
            self.provider_name = provider.__class__.__name__.replace("Provider", "").lower()
            logger.info("[RemoteBrowserContext] Set browser provider: %s", self.provider_name)
        else:
            self.provider_name = None

    def get_cdp_url(self) -> Optional[str]:
        """Get the ``cdp_url`` entry from telemetry, or None if unavailable."""
        return self._telemetry.get("cdp_url") if self._telemetry else None

    def get_is_initialized(self) -> bool:
        """Check if environment is initialized."""
        return self.is_initialized

    def set_initialized(self, value: bool) -> None:
        """Set initialization status."""
        self.is_initialized = value
        logger.info("[RemoteBrowserContext] Initialization status: %s", value)

    def get_provider_config(self) -> Optional[Dict[str, Any]]:
        """Get provider configuration."""
        return self.provider_config

    def set_provider_config(self, config: Dict[str, Any]) -> None:
        """Set provider configuration."""
        self.provider_config = config
        logger.info("[RemoteBrowserContext] Set provider config")

    def get_launch_options(self) -> Optional[Dict[str, Any]]:
        """Get launch options."""
        return self.launch_options

    def set_launch_options(self, options: Dict[str, Any]) -> None:
        """Set launch options."""
        self.launch_options = options
        logger.info("[RemoteBrowserContext] Set launch options")

    def get_playwright_tool(self):
        """Get the playwright tool instance (or None if unset)."""
        return self.playwright_tool

    def set_playwright_tool(self, tool) -> None:
        """Set the playwright tool instance."""
        self.playwright_tool = tool
        logger.info("[RemoteBrowserContext] Set playwright tool")

    def set_telemetry(self, telemetry: Dict[str, Any]) -> None:
        """Set the full telemetry data."""
        self._telemetry = telemetry
        logger.info("[RemoteBrowserContext] Set telemetry: %s", telemetry)

    def get_state_summary(self) -> Dict[str, Any]:
        """Get a summary of the current state.

        Note that ``provider_name`` in the summary is read from the stored
        telemetry (its ``provider`` key), not from the attached provider.
        """
        return {
            "is_initialized": self.is_initialized,
            "startup_complete": self._startup_complete,
            "provider_name": self._telemetry.get("provider") if self._telemetry else None,
            "has_cdp_url": self.get_cdp_url() is not None,
            "has_browser_provider": self.browser_provider is not None,
            "has_playwright_tool": self.playwright_tool is not None,
        }

    def get_telemetry(self) -> Dict[str, Any]:
        """Get the stored telemetry, or a placeholder when none is stored.

        The placeholder marks the provider as ``unknown`` / ``not_initialized``
        and carries the current local time as an ISO-8601 timestamp.
        """
        # If we have stored telemetry, return it
        if self._telemetry:
            return self._telemetry

        # Otherwise return basic telemetry data
        return {
            "provider": "unknown",
            "status": "not_initialized",
            "live_url": None,
            "cdp_url": None,
            "instance_id": None,
            "timestamp": datetime.now().isoformat(),
        }


if __name__ == "__main__":
    # Build the context object and perform its one-time startup before
    # handing it to the context server.
    ctx = RemoteBrowserContext()
    ctx.startup()

    # Record what the server is starting with.
    logger.info("[Context] Starting remote browser context server")
    initial_state = ctx.get_state_summary()
    logger.info(f"[Context] Initial state: {initial_state}")

    # Serve the context on the socket path passed to run_context_server.
    socket_path = "/tmp/hud_remote_browser_ctx.sock"
    asyncio.run(run_context_server(ctx, socket_path))
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Evaluation layer for remote browser environment."""

from __future__ import annotations

from hud.tools.base import BaseHub

# Hub object named "evaluate"; it is the only name this package exports.
evaluate = BaseHub("evaluate")

# Imported after `evaluate` is defined and not referenced afterwards, so the
# import is for its side effects only - presumably each module registers its
# evaluator on the hub above (TODO confirm against the submodules).
from . import autonomous_eval, webjudge, overall_judge

__all__ = ["evaluate"]
Loading
Loading