diff --git a/.agent/AGENTS.md b/.agent/AGENTS.md index f90515657..184c34f70 100644 --- a/.agent/AGENTS.md +++ b/.agent/AGENTS.md @@ -380,7 +380,7 @@ Subagents provide specialized capabilities. Read them when tasks require domain | `tools/build-mcp/` | MCP development - creating Model Context Protocol servers and tools | build-mcp, api-wrapper, server-patterns, transports, deployment | | `tools/ai-assistants/` | AI tool integration - configuring assistants, CAPTCHA solving, multi-modal agents | agno, capsolver, windsurf, configuration, status | | `tools/ai-orchestration/` | AI orchestration frameworks - visual builders, multi-agent teams, workflow automation, DSL orchestration | overview, langflow, crewai, autogen, openprose, packaging | -| `tools/browser/` | Browser automation - web scraping, testing, screenshots, form filling | stagehand, playwright, playwriter, crawl4ai, dev-browser, pagespeed, chrome-devtools | +| `tools/browser/` | Browser automation - web scraping, testing, screenshots, form filling | agent-browser, stagehand, playwright, playwriter, crawl4ai, dev-browser, pagespeed, chrome-devtools | | `tools/ui/` | UI components - component libraries, design systems, interface constraints | shadcn, ui-skills | | `tools/code-review/` | Code quality - linting, security scanning, style enforcement, PR reviews | code-standards, code-simplifier, codacy, coderabbit, qlty, snyk, secretlint, auditing | | `tools/context/` | Context optimization - semantic search, codebase indexing, token efficiency | osgrep, augment-context-engine, context-builder, context7, toon, dspy, llm-tldr | diff --git a/.agent/scripts/agent-browser-helper.sh b/.agent/scripts/agent-browser-helper.sh new file mode 100755 index 000000000..6c23553d2 --- /dev/null +++ b/.agent/scripts/agent-browser-helper.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# shellcheck 
disable=SC2034,SC2155,SC2317,SC2329,SC2016,SC2181,SC1091,SC2154,SC2015,SC2086,SC2129,SC2030,SC2031,SC2119,SC2120,SC2001,SC2162,SC2088,SC2089,SC2090,SC2029,SC2006,SC2153 + +# Agent Browser Helper - Headless Browser Automation CLI for AI Agents +# Part of AI DevOps Framework +# Provides setup and management of agent-browser CLI + +# Source shared constants and functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/shared-constants.sh" + +# Colors for output +readonly BLUE='\033[0;34m' +readonly GREEN='\033[0;32m' +readonly YELLOW='\033[1;33m' +readonly RED='\033[0;31m' +readonly NC='\033[0m' + +# Print functions +print_info() { + local msg="$1" + echo -e "${BLUE}[INFO]${NC} $msg" + return 0 +} + +print_success() { + local msg="$1" + echo -e "${GREEN}[SUCCESS]${NC} $msg" + return 0 +} + +print_warning() { + local msg="$1" + echo -e "${YELLOW}[WARNING]${NC} $msg" + return 0 +} + +print_error() { + local msg="$1" + echo -e "${RED}[ERROR]${NC} $msg" >&2 + return 0 +} + +# Check if agent-browser is installed +check_installed() { + if command -v agent-browser &> /dev/null; then + local version + version=$(agent-browser --version 2>/dev/null || echo "unknown") + print_success "agent-browser is installed: $version" + return 0 + else + print_warning "agent-browser is not installed" + return 1 + fi +} + +# Check if Chromium is installed +check_chromium() { + if agent-browser install --check 2>/dev/null; then + print_success "Chromium browser is installed" + return 0 + else + print_warning "Chromium browser not found" + return 1 + fi +} + +# Install agent-browser globally +install_agent_browser() { + print_info "Installing agent-browser CLI..." + + # Check for npm + if ! command -v npm &> /dev/null; then + print_error "npm is required. Please install Node.js first." 
+ return 1 + fi + + # Install globally + if npm install -g agent-browser; then + print_success "agent-browser installed successfully" + else + print_error "Failed to install agent-browser" + return 1 + fi + + return 0 +} + +# Install Chromium browser +install_chromium() { + print_info "Installing Chromium browser..." + + if ! check_installed; then + print_error "agent-browser must be installed first. Run: $0 install" + return 1 + fi + + # Detect platform for deps + local with_deps="" + if [[ "$(uname)" == "Linux" ]]; then + print_info "Linux detected - installing with system dependencies" + with_deps="--with-deps" + fi + + if agent-browser install $with_deps; then + print_success "Chromium installed successfully" + else + print_error "Failed to install Chromium" + return 1 + fi + + return 0 +} + +# Full setup +setup() { + print_info "Setting up agent-browser..." + + if ! check_installed; then + install_agent_browser || return 1 + fi + + if ! check_chromium 2>/dev/null; then + install_chromium || return 1 + fi + + print_success "agent-browser setup complete!" + echo "" + print_info "Quick start:" + echo " agent-browser open example.com" + echo " agent-browser snapshot" + echo " agent-browser click @e1" + echo " agent-browser close" + + return 0 +} + +# Show status +status() { + echo "=== Agent Browser Status ===" + echo "" + + # Check installation + if check_installed; then + echo "" + fi + + # Check Chromium + check_chromium 2>/dev/null || true + echo "" + + # Check active sessions + print_info "Active sessions:" + agent-browser session list 2>/dev/null || echo " (none or daemon not running)" + + return 0 +} + +# List active sessions +sessions() { + print_info "Active browser sessions:" + agent-browser session list 2>/dev/null || echo "No active sessions" + return 0 +} + +# Close all sessions +close_all() { + print_info "Closing all browser sessions..." 
+ + # Get list of sessions and close each + local sessions + sessions=$(agent-browser session list 2>/dev/null | grep -E '^\s*\w+' | awk '{print $1}') + + if [[ -z "$sessions" ]]; then + print_info "No active sessions to close" + return 0 + fi + + for session in $sessions; do + print_info "Closing session: $session" + AGENT_BROWSER_SESSION="$session" agent-browser close 2>/dev/null || true + done + + print_success "All sessions closed" + return 0 +} + +# Run a quick demo +demo() { + print_info "Running agent-browser demo..." + + if ! check_installed; then + print_error "agent-browser not installed. Run: $0 setup" + return 1 + fi + + echo "" + print_info "1. Opening example.com..." + agent-browser open https://example.com + + echo "" + print_info "2. Getting snapshot (accessibility tree)..." + agent-browser snapshot -i + + echo "" + print_info "3. Getting page title..." + agent-browser get title + + echo "" + print_info "4. Taking screenshot..." + local screenshot_path="/tmp/agent-browser-demo.png" + agent-browser screenshot "$screenshot_path" + print_success "Screenshot saved to: $screenshot_path" + + echo "" + print_info "5. Closing browser..." + agent-browser close + + print_success "Demo complete!" 
+ return 0 +} + +# Show help +show_help() { + cat << 'EOF' +Agent Browser Helper - Headless Browser Automation CLI for AI Agents + +Usage: agent-browser-helper.sh + +Commands: + setup Full setup (install CLI + Chromium) + install Install agent-browser CLI only + chromium Install Chromium browser only + status Show installation and session status + sessions List active browser sessions + close-all Close all active sessions + demo Run a quick demonstration + help Show this help message + +Examples: + # First-time setup + agent-browser-helper.sh setup + + # Check status + agent-browser-helper.sh status + + # Run demo + agent-browser-helper.sh demo + + # Close all sessions + agent-browser-helper.sh close-all + +Direct CLI Usage: + agent-browser open example.com # Navigate to URL + agent-browser snapshot # Get accessibility tree with refs + agent-browser click @e2 # Click by ref from snapshot + agent-browser fill @e3 "text" # Fill input by ref + agent-browser screenshot page.png # Take screenshot + agent-browser close # Close browser + +Multi-Session: + agent-browser --session s1 open site-a.com + agent-browser --session s2 open site-b.com + agent-browser session list + +For full documentation, see: + ~/.aidevops/agents/tools/browser/agent-browser.md + https://github.com/vercel-labs/agent-browser +EOF + return 0 +} + +# Main entry point +main() { + local command="${1:-help}" + + case "$command" in + setup) + setup + ;; + install) + install_agent_browser + ;; + chromium) + install_chromium + ;; + status) + status + ;; + sessions) + sessions + ;; + close-all) + close_all + ;; + demo) + demo + ;; + help|--help|-h) + show_help + ;; + *) + print_error "Unknown command: $command" + echo "" + show_help + return 1 + ;; + esac +} + +main "$@" diff --git a/.agent/tools/browser/agent-browser.md b/.agent/tools/browser/agent-browser.md new file mode 100644 index 000000000..6a6a05e99 --- /dev/null +++ b/.agent/tools/browser/agent-browser.md @@ -0,0 +1,464 @@ +--- +description: Agent 
Browser - headless browser automation CLI for AI agents with Rust CLI and Node.js fallback +mode: subagent +tools: + read: true + write: false + edit: false + bash: true + glob: true + grep: true + webfetch: true + task: true +--- + +# Agent Browser - Headless Browser Automation CLI + + + +## Quick Reference + +- **Purpose**: Headless browser automation CLI optimized for AI agents +- **Install**: `npm install -g agent-browser && agent-browser install` +- **Architecture**: Fast Rust CLI with Node.js fallback, Playwright-based daemon +- **GitHub**: https://github.com/vercel-labs/agent-browser + +**Core Workflow** (optimal for AI): + +```bash +agent-browser open example.com +agent-browser snapshot # Get accessibility tree with refs +agent-browser click @e2 # Click by ref from snapshot +agent-browser fill @e3 "test@example.com" # Fill by ref +agent-browser get text @e1 # Get text by ref +agent-browser screenshot page.png +agent-browser close +``` + +**Key Advantages**: +- **Ref-based selection**: Deterministic element targeting from snapshots +- **AI-optimized**: `--json` output for machine parsing +- **Session isolation**: Multiple browser instances with `--session` +- **Fast**: Native Rust CLI, persistent daemon between commands + + + +## Installation + +### npm (recommended) + +```bash +npm install -g agent-browser +agent-browser install # Download Chromium +``` + +### Linux Dependencies + +```bash +agent-browser install --with-deps +# or manually: npx playwright install-deps chromium +``` + +### From Source + +```bash +git clone https://github.com/vercel-labs/agent-browser +cd agent-browser +pnpm install +pnpm build +agent-browser install +``` + +## AI-Optimized Workflow + +### The Snapshot + Ref Pattern + +This is the **recommended workflow for AI agents**: + +```bash +# 1. Navigate and get snapshot +agent-browser open example.com +agent-browser snapshot -i --json # AI parses tree and refs + +# 2. 
AI identifies target refs from snapshot +# Output includes refs like: +# - heading "Example Domain" [ref=e1] [level=1] +# - button "Submit" [ref=e2] +# - textbox "Email" [ref=e3] + +# 3. Execute actions using refs +agent-browser click @e2 +agent-browser fill @e3 "input text" + +# 4. Get new snapshot if page changed +agent-browser snapshot -i --json +``` + +**Why use refs?** +- **Deterministic**: Ref points to exact element from snapshot +- **Fast**: No DOM re-query needed +- **AI-friendly**: Snapshot + ref workflow is optimal for LLMs + +### Snapshot Options + +```bash +agent-browser snapshot # Full accessibility tree +agent-browser snapshot -i # Interactive elements only +agent-browser snapshot -c # Compact (remove empty structural) +agent-browser snapshot -d 3 # Limit depth to 3 levels +agent-browser snapshot -s "#main" # Scope to CSS selector +agent-browser snapshot -i -c -d 5 # Combine options +``` + +| Option | Description | +|--------|-------------| +| `-i, --interactive` | Only show interactive elements (buttons, links, inputs) | +| `-c, --compact` | Remove empty structural elements | +| `-d, --depth <n>` | Limit tree depth | +| `-s, --selector <sel>` | Scope to CSS selector | + +## Core Commands + +### Navigation + +```bash +agent-browser open <url> # Navigate to URL +agent-browser back # Go back +agent-browser forward # Go forward +agent-browser reload # Reload page +``` + +### Interaction + +```bash +agent-browser click <sel> # Click element +agent-browser dblclick <sel> # Double-click element +agent-browser focus <sel> # Focus element +agent-browser type <sel> <text> # Type into element +agent-browser fill <sel> <text> # Clear and fill +agent-browser press <key> # Press key (Enter, Tab, Control+a) +agent-browser hover <sel> # Hover element +agent-browser select <sel> <val> # Select dropdown option +agent-browser check <sel> # Check checkbox +agent-browser uncheck <sel> # Uncheck checkbox +agent-browser scroll <dir> [px] # Scroll (up/down/left/right) +agent-browser scrollintoview <sel> # Scroll element into view +agent-browser drag <src> <tgt> # Drag and drop 
+agent-browser upload <sel> <files> # Upload files +``` + +### Get Info + +```bash +agent-browser get text <sel> # Get text content +agent-browser get html <sel> # Get innerHTML +agent-browser get value <sel> # Get input value +agent-browser get attr <sel> <attr> # Get attribute +agent-browser get title # Get page title +agent-browser get url # Get current URL +agent-browser get count <sel> # Count matching elements +agent-browser get box <sel> # Get bounding box +``` + +### Check State + +```bash +agent-browser is visible <sel> # Check if visible +agent-browser is enabled <sel> # Check if enabled +agent-browser is checked <sel> # Check if checked +``` + +### Screenshots & Output + +```bash +agent-browser screenshot [path] # Take screenshot (--full for full page) +agent-browser pdf <path> # Save as PDF +agent-browser snapshot # Accessibility tree with refs +agent-browser eval <js> # Run JavaScript +agent-browser close # Close browser +``` + +## Selectors + +### Refs (Recommended for AI) + +```bash +# From snapshot output: +# - button "Submit" [ref=e2] +# - textbox "Email" [ref=e3] + +agent-browser click @e2 # Click the button +agent-browser fill @e3 "test@example.com" # Fill the textbox +``` + +### CSS Selectors + +```bash +agent-browser click "#id" +agent-browser click ".class" +agent-browser click "div > button" +``` + +### Text & XPath + +```bash +agent-browser click "text=Submit" +agent-browser click "xpath=//button" +``` + +### Semantic Locators + +```bash +agent-browser find role button click --name "Submit" +agent-browser find text "Sign In" click +agent-browser find label "Email" fill "test@test.com" +agent-browser find first ".item" click +agent-browser find nth 2 "a" text +``` + +**Actions**: `click`, `fill`, `check`, `hover`, `text` + +## Sessions + +Run multiple isolated browser instances: + +```bash +# Different sessions +agent-browser --session agent1 open site-a.com +agent-browser --session agent2 open site-b.com + +# Or via environment variable +AGENT_BROWSER_SESSION=agent1 agent-browser click "#btn" + +# List active sessions 
+agent-browser session list + +# Show current session +agent-browser session +``` + +Each session has its own: +- Browser instance +- Cookies and storage +- Navigation history +- Authentication state + +## Wait Commands + +```bash +agent-browser wait <selector> # Wait for element +agent-browser wait <ms> # Wait for time +agent-browser wait --text "Welcome" # Wait for text +agent-browser wait --url "**/dash" # Wait for URL pattern +agent-browser wait --load networkidle # Wait for load state +agent-browser wait --fn "window.ready === true" # Wait for JS condition +``` + +**Load states**: `load`, `domcontentloaded`, `networkidle` + +## Cookies & Storage + +```bash +agent-browser cookies # Get all cookies +agent-browser cookies set <name> <val> # Set cookie +agent-browser cookies clear # Clear cookies + +agent-browser storage local # Get all localStorage +agent-browser storage local <key> # Get specific key +agent-browser storage local set <k> <v> # Set value +agent-browser storage local clear # Clear all + +agent-browser storage session # Same for sessionStorage +``` + +## Network + +```bash +agent-browser network route <url> # Intercept requests +agent-browser network route <url> --abort # Block requests +agent-browser network route <url> --body <json> # Mock response +agent-browser network unroute [url] # Remove routes +agent-browser network requests # View tracked requests +agent-browser network requests --filter api # Filter requests +``` + +## Tabs & Windows + +```bash +agent-browser tab # List tabs +agent-browser tab new [url] # New tab (optionally with URL) +agent-browser tab <n> # Switch to tab n +agent-browser tab close [n] # Close tab +agent-browser window new # New window +``` + +## Frames + +```bash +agent-browser frame <sel> # Switch to iframe +agent-browser frame main # Back to main frame +``` + +## Dialogs + +```bash +agent-browser dialog accept [text] # Accept (with optional prompt text) +agent-browser dialog dismiss # Dismiss +``` + +## Debug + +```bash +agent-browser trace start [path] # Start recording trace +agent-browser 
trace stop [path] # Stop and save trace +agent-browser console # View console messages +agent-browser console --clear # Clear console +agent-browser errors # View page errors +agent-browser errors --clear # Clear errors +agent-browser highlight <sel> # Highlight element +agent-browser state save <path> # Save auth state +agent-browser state load <path> # Load auth state +``` + +## Browser Settings + +```bash +agent-browser set viewport <w> <h> # Set viewport size +agent-browser set device <name> # Emulate device ("iPhone 14") +agent-browser set geo <lat> <lng> # Set geolocation +agent-browser set offline [on|off] # Toggle offline mode +agent-browser set headers <json> # Extra HTTP headers +agent-browser set credentials <username> <password> # HTTP basic auth +agent-browser set media [dark|light] # Emulate color scheme +``` + +## Mouse Control + +```bash +agent-browser mouse move <x> <y> # Move mouse +agent-browser mouse down [button] # Press button (left/right/middle) +agent-browser mouse up [button] # Release button +agent-browser mouse wheel <dy> [dx] # Scroll wheel +``` + +## Agent Mode (JSON Output) + +Use `--json` for machine-readable output: + +```bash +agent-browser snapshot --json +# Returns: {"success":true,"data":{"snapshot":"...","refs":{"e1":{"role":"heading","name":"Title"},...}}} + +agent-browser get text @e1 --json +agent-browser is visible @e2 --json +``` + +## Headed Mode + +Show the browser window for debugging: + +```bash +agent-browser open example.com --headed +``` + +## Architecture + +agent-browser uses a client-daemon architecture: + +1. **Rust CLI** (fast native binary) - Parses commands, communicates with daemon +2. **Node.js Daemon** - Manages Playwright browser instance +3. **Fallback** - If native binary unavailable, uses Node.js directly + +The daemon starts automatically on first command and persists between commands for fast subsequent operations. 
+ +## Platform Support + +| Platform | Binary | Fallback | +|----------|--------|----------| +| macOS ARM64 | Native Rust | Node.js | +| macOS x64 | Native Rust | Node.js | +| Linux ARM64 | Native Rust | Node.js | +| Linux x64 | Native Rust | Node.js | +| Windows | - | Node.js | + +## Comparison with Other Tools + +| Feature | agent-browser | dev-browser | Playwriter | Stagehand | +|---------|---------------|-------------|------------|-----------| +| Interface | CLI | TypeScript API | MCP | SDK | +| Selection | Refs + CSS | CSS + ARIA | Playwright API | Natural language | +| Sessions | Built-in | Manual | Extension tabs | Per-instance | +| AI-optimized | Snapshot + refs | ARIA snapshots | Execute tool | act/extract | +| Architecture | Rust + Node daemon | Bun + Playwright | Chrome extension | Browserbase | + +### When to Use agent-browser + +- **CLI-first workflows** - Shell scripts, CI/CD pipelines +- **Multi-session automation** - Parallel browser instances +- **AI agent integration** - Snapshot + ref pattern for LLMs +- **Cross-platform** - Native binaries for all major platforms + +### When to Use Other Tools + +- **dev-browser** - TypeScript/JavaScript projects, stateful pages +- **Playwriter** - Existing browser sessions, bypass detection +- **Stagehand** - Natural language automation, self-healing selectors +- **Crawl4AI** - Web scraping and content extraction + +## Common Patterns + +### Login Flow + +```bash +agent-browser open https://app.example.com/login +agent-browser snapshot -i +# Identify refs from snapshot +agent-browser fill @e3 "user@example.com" +agent-browser fill @e4 "password" +agent-browser click @e5 +agent-browser wait --url "**/dashboard" +agent-browser state save auth.json +``` + +### Form Submission + +```bash +agent-browser open https://example.com/form +agent-browser snapshot -i +agent-browser fill @e1 "John Doe" +agent-browser fill @e2 "john@example.com" +agent-browser select @e3 "US" +agent-browser check @e4 +agent-browser click @e5 
+agent-browser wait --text "Success" +``` + +### Data Extraction + +```bash +agent-browser open https://example.com/products +agent-browser snapshot --json > products.json +# Parse JSON to extract product data +``` + +### Multi-Session Parallel + +```bash +# Session 1: Login to site A +agent-browser --session s1 open https://site-a.com +agent-browser --session s1 state load auth-a.json + +# Session 2: Login to site B +agent-browser --session s2 open https://site-b.com +agent-browser --session s2 state load auth-b.json + +# Work in parallel +agent-browser --session s1 snapshot -i +agent-browser --session s2 snapshot -i +``` + +## Resources + +- **GitHub**: https://github.com/vercel-labs/agent-browser +- **License**: Apache-2.0 +- **Languages**: TypeScript (74%), Rust (22%) diff --git a/.agent/tools/browser/browser-automation.md b/.agent/tools/browser/browser-automation.md index 0f47e6506..be861a2a3 100644 --- a/.agent/tools/browser/browser-automation.md +++ b/.agent/tools/browser/browser-automation.md @@ -43,6 +43,8 @@ Need browser automation? │ │ │ └─► NO ──► Start it: dev-browser-helper.sh start │ + ├─► Need CLI-first / shell scripts? ──► Use agent-browser + │ ├─► Need existing browser session/cookies? ──► Use Playwriter │ ├─► Need natural language control? ──► Use Stagehand @@ -55,12 +57,13 @@ Need browser automation? | Tool | Best For | Setup | |------|----------|-------| | **dev-browser** (DEFAULT) | Dev testing, multi-step workflows | `dev-browser-helper.sh setup` | +| **agent-browser** | CLI-first, shell scripts, CI/CD, multi-session | `npm i -g agent-browser` | | **playwriter** | Existing sessions, bypass detection | Chrome extension + MCP | | **stagehand** | Natural language automation | `stagehand-helper.sh setup` | | **crawl4ai** | Web scraping, content extraction | `crawl4ai-helper.sh setup` | | **playwright** | Cross-browser testing | MCP integration | -**Full docs**: `tools/browser/dev-browser.md` (default), `tools/browser/playwriter.md`, etc. 
+**Full docs**: `tools/browser/dev-browser.md` (default), `tools/browser/agent-browser.md`, `tools/browser/playwriter.md`, etc. **Ethical Rules**: Respect ToS, rate limit (2-5s delays), no spam, legitimate use only