diff --git a/automated_testing/.env.example b/automated_testing/.env.example new file mode 100644 index 0000000000..d6d52337c9 --- /dev/null +++ b/automated_testing/.env.example @@ -0,0 +1,41 @@ +# ============================================================================= +# Gleec QA Automation — Environment Configuration +# ============================================================================= +# Copy this file to .env and fill in values for your environment. +# +# PLATFORM NOTES: +# Linux: Ollama runs natively. Use http://host.docker.internal:11434 +# Windows: Run Ollama natively on Windows (best GPU perf). +# Run this runner + Docker inside WSL2. +# Ollama URL from WSL2/Docker: http://host.docker.internal:11434 +# (Docker Desktop shares ports between Windows and WSL2 automatically) +# ============================================================================= + +# --- Skyvern + Ollama integration --- +ENV=local +ENABLE_OLLAMA=true +LLM_KEY=OLLAMA +OLLAMA_SERVER_URL=http://host.docker.internal:11434 +OLLAMA_MODEL=qwen2.5-vl:32b +OLLAMA_SUPPORTS_VISION=true + +# --- Database (managed by Docker Compose) --- +DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern + +# --- Browser --- +BROWSER_TYPE=chromium-headful +VIDEO_PATH=/app/videos +BROWSER_ACTION_TIMEOUT_MS=10000 +MAX_STEPS_PER_RUN=50 + +# --- Skyvern server --- +LOG_LEVEL=INFO +PORT=8000 + +# --- Runner configuration (read by runner.py, not by Docker) --- +# APP_BASE_URL=https://app.gleecwallet.com +# OLLAMA_URL=http://localhost:11434 +# SKYVERN_URL=http://localhost:8000 +# VRAM_MIN_GB=15 +# DEFAULT_RETRIES=3 +# CRITICAL_RETRIES=5 diff --git a/docs/GLEEC_WALLET_MANUAL_TEST_CASES.md b/automated_testing/GLEEC_WALLET_MANUAL_TEST_CASES.md similarity index 100% rename from docs/GLEEC_WALLET_MANUAL_TEST_CASES.md rename to automated_testing/GLEEC_WALLET_MANUAL_TEST_CASES.md diff --git a/automated_testing/ci-pipeline.sh b/automated_testing/ci-pipeline.sh new file mode 100755 index 
0000000000..4f02243411
--- /dev/null
+++ b/automated_testing/ci-pipeline.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# =============================================================================
+# Gleec QA Automation — CI Pipeline
+# =============================================================================
+# Exit codes:
+#   0 = all tests passed
+#   1 = test failures or errors
+#   2 = pre-flight / infrastructure failure
+#   3 = all passed but some flaky
+# =============================================================================
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+MATRIX="${MATRIX:-test_matrix.yaml}"
+ARTIFACTS_DIR="${CI_ARTIFACTS_DIR:-results}"
+
+echo "=== Gleec QA CI Pipeline ==="
+
+# ---------------------------------------------------------------------------
+# Infrastructure
+# ---------------------------------------------------------------------------
+echo "[infra] Verifying Ollama..."
+if ! curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+  echo "[infra] Starting Ollama..."
+  ollama serve &
+  sleep 5
+fi
+
+echo "[infra] Starting Docker stack..."
+docker compose up -d
+sleep 10
+
+# ---------------------------------------------------------------------------
+# Smoke gate (fast, blocks deployment on failure)
+# ---------------------------------------------------------------------------
+echo "[smoke] Running smoke gate..."
+SMOKE_EXIT=0  # capture status via || so a failing runner doesn't trip set -e
+python -m runner.runner --matrix "$MATRIX" --tag smoke --single || SMOKE_EXIT=$?
+
+if [ "$SMOKE_EXIT" -eq 2 ]; then
+  echo "[smoke] INFRASTRUCTURE FAILURE — aborting pipeline"
+  exit 2
+fi
+
+if [ "$SMOKE_EXIT" -eq 1 ]; then
+  echo "[smoke] SMOKE GATE FAILED — blocking deployment"
+  # Copy whatever reports exist (artifacts dir may not exist yet on this path)
+  mkdir -p "$ARTIFACTS_DIR" && cp results/run_*/report.html "$ARTIFACTS_DIR/" 2>/dev/null || true
+  exit 1
+fi
+
+echo "[smoke] Smoke gate passed (exit=$SMOKE_EXIT)"
+
+# ---------------------------------------------------------------------------
+# Full suite (with retries and majority vote)
+# ---------------------------------------------------------------------------
+echo "[full] Running full automated suite..."
+FULL_EXIT=0  # capture status via || so a failing runner doesn't trip set -e
+python -m runner.runner --matrix "$MATRIX" || FULL_EXIT=$?
+
+# ---------------------------------------------------------------------------
+# Collect artifacts
+# ---------------------------------------------------------------------------
+echo "[artifacts] Collecting reports..."
+mkdir -p "$ARTIFACTS_DIR"
+
+# || true: an empty results/ would otherwise fail the pipeline under pipefail
+LATEST_RUN=$(ls -td results/run_* 2>/dev/null | head -1) || true
+if [ -n "$LATEST_RUN" ]; then
+  cp "$LATEST_RUN/report.html" "$ARTIFACTS_DIR/" 2>/dev/null || true
+  cp "$LATEST_RUN/results.json" "$ARTIFACTS_DIR/" 2>/dev/null || true
+fi
+
+echo "[done] Pipeline complete (exit=$FULL_EXIT)"
+exit "$FULL_EXIT"
diff --git a/automated_testing/docker-compose.yml b/automated_testing/docker-compose.yml
new file mode 100644
index 0000000000..e68bc2b6fe
--- /dev/null
+++ b/automated_testing/docker-compose.yml
@@ -0,0 +1,33 @@
+services:
+  postgres:
+    image: postgres:15
+    environment:
+      POSTGRES_USER: skyvern
+      POSTGRES_PASSWORD: skyvern
+      POSTGRES_DB: skyvern
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+    ports:
+      - "5432:5432"
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U skyvern"]
+      interval: 5s
+      retries: 5
+
+  skyvern:
+    image: ghcr.io/skyvern-ai/skyvern:latest
+    depends_on:
+      postgres:
+        condition: service_healthy
+    ports:
+      - "8000:8000"
+    env_file:
+      - .env
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./results/videos:/app/videos
+ - ./results/screenshots:/app/artifacts + +volumes: + pgdata: diff --git a/automated_testing/gleec-qa-architecture.md b/automated_testing/gleec-qa-architecture.md new file mode 100644 index 0000000000..b9dfa7da2a --- /dev/null +++ b/automated_testing/gleec-qa-architecture.md @@ -0,0 +1,1137 @@ +# Gleec Wallet QA Automation Architecture + +> **Skyvern + Ollama Vision-Based Testing — Consolidated Technical Reference** +> +> Komodo Platform · March 2026 · Version 1.0 + +--- + +## Table of Contents + +1. [Executive Summary](#1-executive-summary) +2. [Architecture Overview](#2-architecture-overview) +3. [Component Breakdown](#3-component-breakdown) +4. [Infrastructure Setup](#4-infrastructure-setup) +5. [Robustness Hardening](#5-robustness-hardening) +6. [Test Case Evaluation](#6-test-case-evaluation) +7. [Automated Test Matrix](#7-automated-test-matrix) +8. [Manual Test Companion](#8-manual-test-companion) +9. [Implementation Artifacts](#9-implementation-artifacts) +10. [Execution Strategy](#10-execution-strategy) +11. [Performance Expectations](#11-performance-expectations) +12. [Risks and Limitations](#12-risks-and-limitations) + +--- + +## 1. Executive Summary + +This document is the consolidated technical reference for automating QA testing of the Gleec Wallet, a Flutter web application within the Komodo Platform ecosystem. It covers the complete architecture from infrastructure through test case design to execution strategy. + +### Problem + +Flutter web applications render their entire UI to an HTML canvas element, which makes traditional DOM-based testing tools (Selenium, Cypress, Playwright selectors) non-functional. The Gleec Wallet has 85+ manual test cases across 26 feature areas and 6 platforms, requiring approximately 52 hours of manual execution time per full regression cycle. 
+ +### Solution + +A vision-based testing architecture using Skyvern (browser automation orchestrator) backed by Ollama running a local vision-language model (qwen2.5-vl:32b) on an RTX 5090 GPU. The system takes screenshots of the Flutter canvas, sends them to the vision model for analysis and action planning, and executes actions through Playwright. Tests are defined as natural-language prompts in a YAML matrix file. + +### Key Outcomes + +| Metric | Value | +| --------------------------------- | --------------------- | +| Manual test cases evaluated | 85 | +| Fully automatable (Grade A) | 40 (47%) | +| Partially automatable (Grade B) | 18 (21%) | +| Manual only (Grade C) | 27 (32%) | +| Automated tests in Phase 1 matrix | 43 | +| Manual companion checklist items | 36 | +| Estimated automated run time | 30–60 minutes | +| Target pass-rate stability | 90–95% | +| Hardware requirement | RTX 5090 (32 GB VRAM) | + +--- + +## 2. Architecture Overview + +The architecture is a three-layer stack designed for local GPU-accelerated execution with no cloud API dependencies. + +### 2.1 Three-Layer Design + +**Layer 1 — Ollama (native on host):** Runs the qwen2.5-vl:32b vision-language model directly on the host machine with full NVIDIA GPU access. Serves a local HTTP API on port 11434. Not containerised, to avoid Docker GPU passthrough complexity. + +**Layer 2 — Skyvern + PostgreSQL (Docker Compose):** Skyvern is the browser automation orchestrator that manages Chromium sessions via Playwright, captures screenshots, sends them to Ollama for analysis, receives action plans, and executes them. PostgreSQL stores task state and run history. Both run inside Docker with host network access to reach Ollama. 
+ +**Layer 3 — Python Test Runner (host):** A standalone Python script that reads the test matrix YAML, iterates test cases, calls the Skyvern SDK programmatically, applies robustness hardening (retries, majority vote, checkpoints, timeout guards), and generates JSON + HTML reports. + +### 2.2 Data Flow + +The test execution flow follows this sequence: + +1. Runner reads `test_matrix.yaml` and parses test cases with their prompts, expected results, and extraction schemas. +2. Pre-flight checks validate that Ollama, Skyvern, PostgreSQL, and the Flutter app are all healthy before any tests run. +3. For each test case, the runner creates a fresh browser session and calls Skyvern's `run_task()` with the natural-language prompt. +4. Skyvern enters its vision loop: screenshot the page → send to Ollama → receive action plan → execute via Playwright → repeat until COMPLETE or step limit reached. +5. At task completion, Skyvern extracts structured data using the `extraction_schema` and returns it alongside the task status. +6. The runner applies majority vote across multiple attempts (3–5 per test) to determine the final pass/fail/flaky verdict. +7. Results are written to `results.json` and `report.html` in a timestamped run directory. + +### 2.3 System Diagram + +``` +tests/test_matrix.yaml + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ HARDENED TEST RUNNER │ +│ │ +│ ┌──────────────┐ ┌────────────────┐ ┌────────────┐ │ +│ │ Pre-flight │ │ Test Executor │ │ Post-run │ │ +│ │ Checks │ │ (per test) │ │ Analysis │ │ +│ │ │ │ │ │ │ │ +│ │ • Ollama up? │ │ • Fresh session │ │ • Majority │ │ +│ │ • VRAM ok? │ │ • Retry loop │ │ vote │ │ +│ │ • Skyvern up? │ │ • Checkpoints │ │ • Flaky │ │ +│ │ • App up? 
│ │ • Screenshots │ │ detect │ │ +│ │ • Model loads?│ │ • Timeout guard │ │ • Report │ │ +│ └──────┬───────┘ └───────┬─────────┘ └─────┬──────┘ │ +│ │ │ │ │ +└─────────┼───────────────────┼────────────────────┼────────┘ + ▼ ▼ ▼ + abort if fail Skyvern SDK results.json + + browser sessions report.html +``` + +### 2.4 Network Topology + +| Component | Host | Port | Protocol | +| --------------------- | -------------------------- | ----- | ----------------- | +| Ollama | Host machine (native) | 11434 | HTTP REST | +| Skyvern Server | Docker container | 8000 | HTTP REST | +| PostgreSQL | Docker container | 5432 | TCP | +| Chromium (Playwright) | Docker (inside Skyvern) | — | CDP | +| Flutter Web App | Staging server / localhost | 3000 | HTTPS/HTTP | +| Python Runner | Host machine | — | Calls Skyvern SDK | + +Docker containers reach Ollama on the host via the `host.docker.internal` alias (configured with `extra_hosts: host-gateway`). The runner communicates with Skyvern through its published port 8000 on localhost. + +--- + +## 3. Component Breakdown + +### 3.1 Ollama (Vision Model Server) + +| Setting | Value | +| ----------------- | ------------------------------------------------------------------------ | +| Primary model | qwen2.5-vl:32b (Q4 quantised) | +| Fallback model | gemma3:27b (faster, less accurate) | +| Lightweight model | qwen2.5-vl:7b (for rapid iteration) | +| VRAM usage | ~20 GB (32B Q4) / ~16 GB (27B) / ~5 GB (7B) | +| Host | http://localhost:11434 | +| Role | Vision analysis, action planning, checkpoint validation, data extraction | +| Installation | Native on host via `curl -fsSL https://ollama.com/install.sh \| sh` | + +Ollama runs outside Docker to get direct NVIDIA GPU access without container GPU passthrough complexity. The qwen2.5-vl:32b model is the primary choice because it provides the strongest vision accuracy for Flutter's canvas-rendered UI. The RTX 5090's 32 GB VRAM comfortably holds the Q4-quantised 32B model with room for KV cache. 
+ +### 3.2 Skyvern (Browser Automation Orchestrator) + +| Setting | Value | +| ---------------------- | ------------------------------------------------------------- | +| Image | ghcr.io/skyvern-ai/skyvern:latest | +| Port | 8000 | +| Engine options | skyvern-1.0 (simple tasks) / skyvern-2.0 (complex multi-step) | +| Browser | Chromium via Playwright (headful for video, headless for CI) | +| LLM backend | Ollama via ENABLE_OLLAMA=true | +| Connection to Ollama | http://host.docker.internal:11434 | +| Max steps per run | 50 (configurable per test) | +| Browser action timeout | 10000ms | + +Skyvern orchestrates the vision-action loop: it takes a screenshot of the current browser state, sends it to the LLM with the prompt context, receives an action plan (click coordinates, text to type, scroll direction), executes the action via Playwright, and repeats. Each iteration is one "step." Tasks complete when the LLM determines the goal is met, an error is detected, or the step limit is reached. + +### 3.3 PostgreSQL (Task State Store) + +PostgreSQL 15 runs alongside Skyvern in Docker Compose. It stores task history, step-by-step screenshots, extracted data, and run metadata. No manual interaction is needed; it is managed entirely by Skyvern's internal ORM. Data is persisted in a named Docker volume (`pgdata`) across restarts. + +### 3.4 Python Test Runner + +The runner is the single orchestration point that ties everything together. It is a standalone Python 3.11+ script that uses the Skyvern Python SDK to create tasks programmatically. It reads the YAML test matrix, applies robustness hardening (pre-flight checks, retries, majority vote, timeout guards, Ollama monitoring), and writes structured results. + +--- + +## 4. 
Infrastructure Setup + +### 4.1 Docker Compose Configuration + +```yaml +# docker-compose.yml +services: + postgres: + image: postgres:15 + environment: + POSTGRES_USER: skyvern + POSTGRES_PASSWORD: skyvern + POSTGRES_DB: skyvern + volumes: + - pgdata:/var/lib/postgresql/data + ports: + - "5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U skyvern"] + interval: 5s + retries: 5 + + skyvern: + image: ghcr.io/skyvern-ai/skyvern:latest + depends_on: + postgres: + condition: service_healthy + ports: + - "8000:8000" + environment: + - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern + - BROWSER_TYPE=chromium-headful + - VIDEO_PATH=/app/videos + - BROWSER_ACTION_TIMEOUT_MS=10000 + - MAX_STEPS_PER_RUN=50 + - ENABLE_OLLAMA=true + - OLLAMA_SERVER_URL=http://host.docker.internal:11434 + - OLLAMA_MODEL=qwen2.5-vl:32b + - OLLAMA_SUPPORTS_VISION=true + - ENV=local + - LOG_LEVEL=INFO + - PORT=8000 + extra_hosts: + - "host.docker.internal:host-gateway" + volumes: + - ./results/videos:/app/videos + - ./results/screenshots:/app/artifacts + +volumes: + pgdata: +``` + +### 4.2 Environment File + +```bash +# .env +ENV=local +ENABLE_OLLAMA=true +OLLAMA_SERVER_URL=http://host.docker.internal:11434 +OLLAMA_MODEL=qwen2.5-vl:32b +OLLAMA_SUPPORTS_VISION=true +DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern +BROWSER_TYPE=chromium-headful +VIDEO_PATH=/app/videos +BROWSER_ACTION_TIMEOUT_MS=10000 +MAX_STEPS_PER_RUN=50 +LOG_LEVEL=INFO +PORT=8000 +``` + +### 4.3 Setup Script + +```bash +#!/usr/bin/env bash +set -euo pipefail + +echo "=== Komodo QA Automation Setup ===" + +# 1. Install Ollama +if ! command -v ollama &> /dev/null; then + curl -fsSL https://ollama.com/install.sh | sh +fi + +# 2. Pull vision model +ollama pull qwen2.5-vl:32b + +# 3. Start Ollama server +if ! curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then + ollama serve & + sleep 3 +fi + +# 4. 
Create project structure +mkdir -p komodo-qa-automation/{tests,runner,results} + +# 5. Start Docker stack +cd komodo-qa-automation +docker compose up -d + +echo "Setup complete. Ollama: :11434 Skyvern: :8000" +``` + +### 4.4 Directory Structure + +``` +komodo-qa-automation/ +├── docker-compose.yml +├── .env +├── setup.sh +├── tests/ +│ ├── test_matrix.yaml # 43 automated test cases +│ └── manual_companion.yaml # 36 manual-only checklist items +├── runner/ +│ ├── __init__.py +│ ├── runner.py # Hardened main runner +│ ├── models.py # Pydantic data models +│ ├── reporter.py # HTML report generator +│ ├── preflight.py # Pre-flight health checks +│ ├── prompt_builder.py # Flutter-hardened prompt assembly +│ ├── retry.py # Majority vote logic +│ ├── guards.py # Timeout and deadlock guards +│ └── ollama_monitor.py # Background GPU/VRAM monitor +└── results/ + └── run_/ + ├── results.json + ├── report.html + └── screenshots/ +``` + +### 4.5 Python Dependencies + +```text +# requirements.txt +skyvern>=1.0.0 +pyyaml>=6.0 +pydantic>=2.0 +httpx>=0.27.0 +``` + +--- + +## 5. Robustness Hardening + +Vision-based testing with LLMs is inherently non-deterministic. The following ten strategies address the primary failure modes. + +### 5.1 Known Fragility Points + +| # | Problem | Severity | Root Cause | +| --- | --------------------------- | ------------ | ----------------------------------------------------------------------------------------------------------------- | +| 1 | Vision model hallucination | **Critical** | LLM identifies UI elements that don't exist, or clicks wrong ones. No DOM fallback on Flutter canvas. | +| 2 | Non-deterministic outputs | **High** | Same prompt + same page produces different actions across runs due to LLM stochasticity. | +| 3 | Flutter async rendering | **High** | Skyvern screenshots mid-render, acts on incomplete frame while Flutter rebuilds widgets. 
| +| 4 | Multi-step state corruption | **Critical** | Step N fails silently (wrong click), but subsequent steps execute against wrong state, producing misleading PASS. | +| 5 | No test isolation | **High** | Test B inherits leftover state from Test A (open modals, changed settings, navigation position). | +| 6 | Login session expiry | **Medium** | Setup logs in, but by test 15 the session has timed out. | +| 7 | Ambiguous completion | **High** | Skyvern cannot distinguish successful completion from dead-end abandonment. | +| 8 | Flaky pass/fail | **High** | Test passes 3/5 times. Single-run result is unreliable. | +| 9 | No visual baseline | **Medium** | Assertions rely on LLM judgment, not pixel-level comparison with known-good screenshots. | +| 10 | Silent Ollama failures | **Medium** | Ollama OOMs, truncates responses, or times out. Skyvern may not surface this cleanly. | + +### 5.2 Mitigation Strategies + +**1. Pre-flight Health Checks:** Before running any tests, the runner validates: Ollama is responding (HTTP + actual inference test), VRAM has >15 GB free, Skyvern server responds on :8000, and the Flutter app is reachable. If any check fails, the run aborts with exit code 2. This prevents wasting time on tests that would all fail due to infrastructure issues. 
+ +```python +# runner/preflight.py + +async def check_ollama(url: str = "http://localhost:11434") -> bool: + """Verify Ollama is running and the model is loaded.""" + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{url}/api/tags") + models = resp.json().get("models", []) + return len(models) > 0 + +async def check_ollama_inference(url: str = "http://localhost:11434", + model: str = "qwen2.5-vl:32b") -> bool: + """Actually run a trivial inference to confirm GPU works.""" + async with httpx.AsyncClient(timeout=60) as client: + resp = await client.post(f"{url}/api/generate", json={ + "model": model, + "prompt": "Reply with only the word OK.", + "stream": False, + }) + output = resp.json().get("response", "").strip() + return "ok" in output.lower() + +async def check_vram() -> bool: + """Verify sufficient VRAM is free (>15 GB for 32B Q4 model).""" + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5, + ) + free_gb = int(result.stdout.strip().split("\n")[0]) / 1024 + return free_gb > 15 + +async def check_skyvern(url: str = "http://localhost:8000") -> bool: + """Verify Skyvern server responds.""" + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{url}/api/v1/heartbeat") + return resp.status_code == 200 + +async def check_app(url: str) -> bool: + """Verify the Flutter app is reachable.""" + async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client: + resp = await client.get(url) + return resp.status_code < 500 + +async def run_preflight(config: dict) -> bool: + """Run all pre-flight checks. Returns False if any critical check fails.""" + results = await asyncio.gather( + check_ollama(), check_vram(), check_skyvern(), check_app(config["base_url"]), + ) + all_ok = all(results) + if all_ok: + all_ok = await check_ollama_inference() + return all_ok +``` + +**2. 
Test Isolation via Fresh Browser Sessions:** Each test case gets its own fresh browser session with no shared cookies, storage, or navigation history. The setup prompt (login) is embedded directly into each test's prompt so login and test execute in the same session, eliminating the session-handoff problem. + +```python +# In runner.py — each run_task() creates an isolated browser: + +async def run_isolated_test(skyvern, test, config, setup_config): + full_prompt = "" + if setup_config: + full_prompt += f"PHASE 1 — SETUP:\n{setup_config['prompt']}\n\n" + full_prompt += "After setup is complete, proceed immediately to Phase 2.\n\n" + full_prompt += "PHASE 2 — TEST:\n" + full_prompt += test.prompt + + task = await skyvern.run_task( + url=config["base_url"], + prompt=full_prompt, + engine=config.get("default_engine", "skyvern-2.0"), + max_steps=test.max_steps or 25, + data_extraction_schema=test.extraction_schema, + wait_for_completion=True, + # Each run_task gets its own browser — no shared state + ) + return task +``` + +**3. Retry with Majority Vote:** Every test runs 3 times (5 times for critical-tagged tests). The final verdict is determined by majority vote across attempts. If 2/3 pass, the test is PASS. If results are split (1 pass, 1 fail, 1 error), the test is marked FLAKY. This is the single most important robustness measure for dealing with LLM non-determinism. 
+ +```python +# runner/retry.py + +@dataclass +class VotedResult: + test_id: str + final_status: str # PASS | FAIL | FLAKY | ERROR + vote_counts: dict # {"PASS": 2, "FAIL": 1} + confidence: float # 0.0–1.0 + attempts: list + +def majority_vote(attempts: list, test) -> VotedResult: + """ + Rules: + - If ALL attempts agree → that status, confidence 1.0 + - If majority agrees → that status, confidence = majority/total + - If no majority → FLAKY, confidence = max_count/total + - If all ERROR → ERROR + """ + statuses = [a.status for a in attempts] + counts = Counter(statuses) + total = len(attempts) + most_common_status, most_common_count = counts.most_common(1)[0] + + if most_common_count > total / 2: + final_status = most_common_status + confidence = most_common_count / total + else: + final_status = "FLAKY" + confidence = most_common_count / total + + # If "winner" is ERROR but there are PASS/FAIL, prefer those + if final_status == "ERROR": + non_errors = [s for s in statuses if s != "ERROR"] + if non_errors: + final_status = "FLAKY" + + return VotedResult( + test_id=test.id, + final_status=final_status, + vote_counts=dict(counts), + confidence=round(confidence, 2), + attempts=attempts, + ) +``` + +**4. Flutter Render Wait Strategy:** A Flutter preamble is automatically prepended to every prompt instructing the vision model to wait 2–3 seconds for canvas rendering, look for loading spinners before acting, and handle blank-screen initialization delays. This prevents acting on incomplete frames. + +```python +# runner/prompt_builder.py + +FLUTTER_PREAMBLE = """IMPORTANT CONTEXT: +This is a Flutter web application rendered entirely on an HTML canvas element. +You cannot use DOM selectors — you must identify all elements visually. + +Before taking any action on each new screen: +1. Wait 2 seconds for the page to fully render (Flutter animations to complete). +2. 
If you see a loading spinner, circular progress indicator, or skeleton + placeholders, wait until they disappear before proceeding. +3. If the screen appears blank or only shows a solid color, wait 3 more + seconds — Flutter may still be initialising. + +If you are unsure whether an element is a button or just text, look for +visual cues: rounded corners, drop shadows, background color contrast, +or iconography that suggests interactivity. +""" + +COMPLETION_SUFFIX = """ +After completing the task, clearly state whether you succeeded or encountered +an error. If you see an error message, snackbar, or alert dialog on screen, +report its exact text in your response.""" + +def build_prompt(test_prompt: str, setup_prompt: str | None = None) -> str: + parts = [FLUTTER_PREAMBLE] + if setup_prompt: + parts.append(f"PHASE 1 — SETUP:\n{setup_prompt}\n") + parts.append("After setup is complete, proceed to Phase 2.\n") + parts.append(f"PHASE 2 — TEST:\n{test_prompt}") + else: + parts.append(test_prompt) + parts.append(COMPLETION_SUFFIX) + return "\n".join(parts) +``` + +**5. Checkpoint Assertions (Mid-Flow Validation):** Complex multi-step tests include checkpoint verification between steps. Each checkpoint is a visual assertion ("the send form is visible with recipient and amount fields"). If a checkpoint fails, the test aborts immediately instead of continuing against wrong state, preventing cascading false results. + +```python +# runner/prompt_builder.py (addition) + +def build_stepped_prompt(steps: list[dict]) -> str: + """Convert checkpoint-based steps into a single sequential prompt.""" + lines = [] + for i, step in enumerate(steps, 1): + lines.append(f"STEP {i}: {step['action']}") + if step.get("checkpoint"): + lines.append( + f" → BEFORE proceeding to step {i+1}, verify: {step['checkpoint']}") + lines.append( + f" → If this verification FAILS, STOP and report which step failed and why.") + lines.append("") + return "\n".join(lines) +``` + +**6. 
Timeout and Deadlock Guards:** Every Skyvern task call is wrapped in an `asyncio.wait_for()` with a configurable timeout (default 180 seconds). If the vision model hangs, the browser deadlocks, or Ollama stalls, the test is terminated and marked ERROR with a diagnostic message. + +```python +# runner/guards.py + +class TestTimeoutError(Exception): + pass + +async def run_with_timeout(coro, seconds: int, test_id: str): + """Run a coroutine with a hard timeout.""" + try: + return await asyncio.wait_for(coro, timeout=seconds) + except asyncio.TimeoutError: + raise TestTimeoutError( + f"Test {test_id} timed out after {seconds}s." + ) +``` + +**7. Background Ollama Health Monitor:** A background asyncio task polls `nvidia-smi` and Ollama's HTTP endpoint every 10 seconds during the run. It checks VRAM free (abort if <500 MB), GPU temperature (warn if >90°C), and Ollama API responsiveness. If Ollama becomes unhealthy mid-run, subsequent tests are immediately marked ERROR with the specific failure reason. 
+ +```python +# runner/ollama_monitor.py + +class OllamaMonitor: + def __init__(self, ollama_url: str = "http://localhost:11434"): + self.url = ollama_url + self._running = False + self.last_error = None + + async def start(self): + self._running = True + self._task = asyncio.create_task(self._monitor_loop()) + + @property + def healthy(self) -> bool: + return self.last_error is None + + async def _monitor_loop(self): + while self._running: + try: + async with httpx.AsyncClient(timeout=5) as client: + resp = await client.get(f"{self.url}/api/tags") + if resp.status_code != 200: + self.last_error = f"Ollama returned HTTP {resp.status_code}" + else: + self.last_error = None + + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.free,memory.used,temperature.gpu", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5, + ) + parts = result.stdout.strip().split(", ") + if len(parts) >= 3: + free_mb, used_mb, temp_c = int(parts[0]), int(parts[1]), int(parts[2]) + if free_mb < 500: + self.last_error = f"VRAM critically low: {free_mb}MB free" + elif temp_c > 90: + self.last_error = f"GPU temperature critical: {temp_c}°C" + except Exception as e: + self.last_error = f"Monitor error: {e}" + + await asyncio.sleep(10) +``` + +**8. Hardened Prompt Construction:** All prompts are built through a `prompt_builder` module that automatically adds the Flutter preamble, structures multi-phase prompts (setup + test), injects checkpoint verification language, and appends a completion suffix requesting explicit success/error reporting. + +**9. Early Exit Optimisation:** If the first 2 attempts both pass, remaining retries are skipped. If all attempts so far are ERROR (infrastructure issue), retries stop early. This reduces total run time by 30–40% for stable tests while preserving full retry coverage for flaky ones. 
+ +```python +# Inside run_test_with_retries(): + +attempts = [] +for i in range(num_attempts): + result = await execute_single_attempt(skyvern, test, config, setup_config, monitor) + attempts.append(result) + + # Early exit: if first 2 attempts both pass, skip remaining + if len(attempts) >= 2: + pass_count = sum(1 for a in attempts if a.status == "PASS") + if pass_count >= 2: + break + + # Early exit: if all attempts so far are ERROR (infra issue), stop + if all(a.status == "ERROR" for a in attempts) and len(attempts) >= 2: + break +``` + +**10. Structured Exit Codes for CI:** Exit code 0 = all passed. Exit code 1 = failures or errors. Exit code 2 = pre-flight failure (infrastructure). Exit code 3 = all tests passed but some were flaky. This enables CI pipelines to distinguish between test failures, infra failures, and instability. + +| Code | Meaning | +| ---- | ------------------------------------------------------------------ | +| `0` | All tests passed | +| `1` | One or more tests failed or errored | +| `2` | Pre-flight checks failed (infrastructure issue) | +| `3` | All tests passed but some were flaky (inconsistent across retries) | + +--- + +## 6. Test Case Evaluation + +All 85 test cases from `GLEEC_WALLET_MANUAL_TEST_CASES.md` were evaluated for automation suitability with the Skyvern + Ollama stack. 
+ +### 6.1 Classification Framework + +| Grade | Meaning | Count | % | Action | +| ----- | -------------------------------------------------------------------- | ----- | --- | -------------------------------------------- | +| **A** | Fully automatable — pure UI interaction within a web browser | 40 | 47% | Convert to Skyvern prompt | +| **B** | Partially automatable — some steps need human/external action | 18 | 21% | Split: automate UI, flag manual verification | +| **C** | Manual only — requires hardware, OS actions, network, cross-platform | 27 | 32% | Keep in manual checklist | + +### 6.2 Full Classification Table + +| Test ID | Module | Title | Grade | Reason | +| --------- | ------------------- | ------------------------------------- | ----- | ------------------------------------------------------------------------------------------- | +| AUTH-001 | Auth | Create wallet with seed backup | **A** | UI-only flow: tap, enter password, navigate seed screens | +| AUTH-002 | Auth | Login/logout with remember-session | **B** | Login/logout automatable; "close and relaunch app" requires session restart outside Skyvern | +| AUTH-003 | Auth | Import wallet from seed | **A** | UI-only: enter seed, set password, verify balances | +| AUTH-004 | Auth | Invalid password attempts + lockout | **A** | UI-only: enter wrong passwords, observe lockout messages | +| AUTH-005 | Auth | Trezor hardware wallet | **C** | Requires physical Trezor device connected via USB | +| WAL-001 | Wallet Manager | Create/rename/switch wallets | **A** | Pure UI interactions within wallet management | +| WAL-002 | Wallet Manager | Delete wallet with confirmation | **A** | UI dialog flow | +| WAL-003 | Wallet Manager | Selection persistence after restart | **C** | Requires app restart | +| COIN-001 | Coin Manager | Enable DOC/MARTY test coins | **A** | Toggle coins in settings | +| COIN-002 | Coin Manager | Search and activate coins | **A** | Search UI, toggle activation | +| COIN-003 | Coin Manager | 
Deactivate coin with balance | **A** | Balance warning dialog, deactivation flow | +| DASH-001 | Dashboard | Hide balances / zero balance toggles | **A** | Toggle switches, verify UI changes | +| DASH-002 | Dashboard | Offline indicator | **C** | Requires network disconnection at OS level | +| DASH-003 | Dashboard | Dashboard persistence after restart | **C** | Requires app restart | +| SEND-001 | Send | Faucet funding | **A** | Navigate to faucet, request funds, verify balance | +| SEND-002 | Send | Faucet cooldown | **B** | Faucet automatable; network error fallback requires network toggle | +| SEND-003 | Send | Send DOC happy path | **A** | Fill send form, confirm, verify status | +| SEND-004 | Send | Address validation | **A** | Enter invalid addresses, observe error messages | +| SEND-005 | Send | Amount boundary testing | **A** | Enter boundary amounts, observe validation | +| SEND-006 | Send | Interrupted send (network kill) | **C** | Requires network interruption mid-transaction | +| DEX-001 | DEX | Create maker order | **A** | Fill order form, submit, verify | +| DEX-002 | DEX | Taker order | **B** | Depends on market liquidity availability | +| DEX-003 | DEX | Input validation | **A** | Enter invalid values, observe errors | +| DEX-004 | DEX | Partial fill behaviour | **B** | Depends on market conditions | +| DEX-005 | DEX | History export | **B** | UI automatable; file verification requires filesystem access | +| DEX-006 | DEX | Recovery after closure + network | **C** | Requires app closure and network manipulation | +| BRDG-001 | Bridge | Bridge transfer happy path | **A** | Fill bridge form, submit, verify | +| BRDG-002 | Bridge | Unsupported pair handling | **A** | Select unsupported pair, observe error | +| BRDG-003 | Bridge | Amount boundaries | **A** | Enter boundary amounts, verify validation | +| BRDG-004 | Bridge | Bridge failure (network) | **C** | Requires network interruption | +| NFT-001 | NFT | List and detail view | **A** | Navigate NFT 
section, browse items | +| NFT-002 | NFT | Send NFT | **A** | Fill send form, confirm transfer | +| NFT-003 | NFT | Send failure handling | **A** | Trigger failure, observe error UI | +| SET-001 | Settings | Persistence after restart | **C** | Requires app restart | +| SET-002 | Settings | Privacy toggles | **A** | Toggle settings, verify UI changes | +| SET-003 | Settings | Test coin toggle impact | **A** | Toggle, verify coin visibility | +| SET-004 | Settings | Settings persistence (logout/restart) | **C** | Requires logout and restart | +| BOT-001 | Bot | Create and start market maker | **A** | Fill bot config, start, verify running | +| BOT-002 | Bot | Bot validation (invalid params) | **A** | Enter invalid params, observe errors | +| NAV-001 | Navigation | Route integrity | **A** | Navigate all routes, verify loading | +| NAV-002 | Navigation | Deep link while logged out | **C** | Requires direct URL manipulation | +| NAV-003 | Navigation | Unsaved changes warning | **A** | Make changes, attempt navigation, verify warning | +| RESP-001 | Responsive | Breakpoint behaviour | **C** | Requires controlled window resizing | +| RESP-002 | Responsive | Orientation change | **C** | Requires device rotation | +| XPLAT-001 | Cross-platform | Feature parity | **C** | Requires Android/iOS/macOS/Linux/Windows | +| XPLAT-002 | Cross-platform | Permission dialogs | **C** | Requires OS-level permission dialogs | +| A11Y-001 | Accessibility | Keyboard navigation | **C** | Requires focus state inspection | +| A11Y-002 | Accessibility | Screen reader | **C** | Requires screen reader output analysis | +| A11Y-003 | Accessibility | Contrast and scaling | **C** | Requires pixel-level measurement | +| SEC-001 | Security | Seed phrase reveal | **B** | Reveal automatable; screenshot masking verification is manual | +| SEC-002 | Security | Auto-lock timeout | **C** | Requires idle timeout + app-switcher | +| SEC-003 | Security | Clipboard clearing | **C** | Requires clipboard 
monitoring outside browser | +| ERR-001 | Error Handling | Network outage recovery | **C** | Requires network toggling | +| ERR-002 | Error Handling | Partial failure | **C** | Requires selective network failure | +| ERR-003 | Error Handling | Stale state after closure | **C** | Requires app closure | +| L10N-001 | Localization | Translation completeness | **A** | Switch locale, verify text rendering | +| L10N-002 | Localization | Long string overflow | **B** | Visual clipping judgment is low-confidence for LLM | +| L10N-003 | Localization | Locale-specific formats | **A** | Switch locale, verify date/number formats | +| FIAT-001 | Fiat | Fiat menu access | **A** | Navigate to fiat section | +| FIAT-002 | Fiat | Form validation | **A** | Enter invalid data, observe errors | +| FIAT-003 | Fiat | Provider checkout | **B** | Provider webview may cross domain boundaries | +| FIAT-004 | Fiat | Checkout closed/cancelled | **B** | Manual closure detection | +| FIAT-005 | Fiat | Fiat after logout/login | **C** | Requires logout and re-login | +| SUP-001 | Support | Support page access | **A** | Navigate to support section | +| FEED-001 | Feedback | Feedback entry | **A** | Open feedback form, submit | +| SECX-001 | Security (Extended) | Private key export | **B** | Export automatable; download/share may cross browser boundary | +| SECX-002 | Security (Extended) | Seed backup verification | **A** | View seed, confirm backup flow | +| SECX-003 | Security (Extended) | Unban pubkeys | **A** | Navigate to pubkey management, unban | +| SECX-004 | Security (Extended) | Change password | **A** | Enter old/new password, confirm | +| SETX-001 | Advanced Settings | Weak password toggle | **A** | Toggle setting, verify effect | +| SETX-002 | Advanced Settings | Bot toggles | **B** | Stop-on-disable verification depends on running bot | +| SETX-003 | Advanced Settings | Export/import JSON | **C** | Filesystem operation | +| SETX-004 | Advanced Settings | Show swap data | **B** | Export 
is filesystem operation | +| SETX-005 | Advanced Settings | Import swaps JSON | **C** | Requires paste from external source | +| SETX-006 | Advanced Settings | Download logs | **C** | Filesystem download | +| SETX-007 | Advanced Settings | Reset coins to default | **A** | Trigger reset, verify coin list | +| WALX-001 | Wallet (Extended) | Overview cards | **A** | Verify wallet cards display | +| WALX-002 | Wallet (Extended) | Tabs (logged-out fallback) | **B** | Logged-out fallback needs logout | +| WADDR-001 | Wallet Addresses | Multi-address display | **A** | View address list | +| WADDR-002 | Wallet Addresses | Create new address | **A** | Generate address, verify display | +| CTOK-001 | Custom Token | Import ERC-20 token | **A** | Enter contract address, import | +| CTOK-002 | Custom Token | Invalid contract handling | **A** | Enter invalid contract, observe error | +| CTOK-003 | Custom Token | Back/cancel from import | **A** | Navigate away, verify no side effects | +| GATE-001 | Feature Gating | Trading-disabled tooltips | **A** | Verify disabled state indicators | +| GATE-002 | Feature Gating | Hardware wallet restrictions | **C** | Requires connected hardware wallet | +| GATE-003 | Feature Gating | NFT disabled state | **A** | Verify NFT section disabled UI | +| CDET-001 | Coin Detail | Address display | **B** | Display automatable; clipboard/explorer verification manual | +| CDET-002 | Coin Detail | Transaction list | **B** | List automatable; pending→confirmed needs real chain time | +| CDET-003 | Coin Detail | Price chart | **B** | Chart automatable; offline fallback needs network toggle | +| RWD-001 | Rewards | Rewards claim | **B** | Claim depends on reward availability | +| BREF-001 | Bitrefill | Bitrefill widget | **B** | Widget crosses domain boundaries | +| ZHTL-001 | ZHTLC | ZHTLC activation | **B** | Logout-during-activation is manual | +| QLOG-001 | Quick Login | Remember-me persistence | **C** | Requires app relaunch | +| WARN-001 | Warnings | 
Clock warning banner | **C** | Requires system clock manipulation |
+
+### 6.3 Why Grade C Tests Cannot Be Automated
+
+The Grade C tests fall into the following categories, none of which Skyvern can handle (the app-lifecycle list below also counts AUTH-002b, the manual half of Grade B AUTH-002):
+
+**Hardware wallet (2 tests):** GW-MAN-AUTH-005, GW-MAN-GATE-002 require a physical Trezor device connected via USB. The vision model cannot interact with external hardware.
+
+**Network manipulation (7 tests):** DASH-002, SEND-006, DEX-006, BRDG-004, ERR-001/002/003 require disabling/re-enabling the network at the OS level, which is outside the browser sandbox.
+
+**App lifecycle (7 tests):** AUTH-002b, WAL-003, DASH-003, SET-004, FIAT-005, QLOG-001, NAV-002 require closing and relaunching the application, which destroys the Skyvern browser session.
+
+**Cross-platform + responsive (4 tests):** XPLAT-001/002, RESP-001/002 require execution on Android, iOS, macOS, Linux, and Windows native apps, or controlled window resizing that Skyvern cannot reliably perform.
+
+**Accessibility (3 tests):** A11Y-001/002/003 require keyboard-only navigation focus inspection, screen reader output analysis, and pixel-level contrast measurement.
+
+**Security/privacy (3 tests):** SEC-002/003, WARN-001 require app-switcher snapshot inspection, clipboard monitoring, and system clock manipulation.
+
+**Filesystem operations (3 tests):** SETX-003/005/006 require file export/import and log downloading outside the browser context.
+
+### 6.4 Structural Gaps in the Manual Document
+
+Beyond per-case suitability, these structural issues in the original document prevent direct conversion to automation:
+
+**Compound test cases:** A single manual test case often covers 4–6 distinct scenarios. For example, AUTH-001 tests creation, password entry, seed skip attempt, seed confirmation, and onboarding completion. For vision-based agents, this must be split into 2–3 atomic tasks to prevent state corruption at one step from cascading through the rest. 
+ +**No visual element descriptions:** Every manual case says "Open DEX" or "Enter amount" without describing what the element looks like. Skyvern needs "Click the input field labeled Amount below the recipient address, with a coin ticker next to it." + +**Abstract expected results:** "Validation blocks invalid orders with guidance" is not machine-evaluable. The automation needs: "A red error message appears containing the word invalid, insufficient, or minimum." + +**No inline test data:** Cases reference AS-01, AM-03, WP-02 by code. The automation prompt must contain the actual address string, amount value, and seed phrase inline. + +**Missing dependency graph:** Many tests assume DOC/MARTY are funded (from SEND-001) without declaring this dependency. The automation needs explicit execution ordering. + +--- + +## 7. Automated Test Matrix + +43 test cases converted to Skyvern-compatible prompts with visual descriptions, checkpoint assertions, extraction schemas, and inline test data. The full YAML is provided as a companion file (`test_matrix.yaml`); this section summarises the structure and execution phases. + +### 7.1 Execution Phases (Dependency Order) + +| Phase | Tests | Purpose | Tags | +| ------------------ | -------------------------------------------------------------- | -------------------------------------------------------------- | -------------------- | +| 1. Auth + Wallet | AUTH-001a/b, AUTH-003, AUTH-004, WAL-001, WAL-002 | Establish wallet creation, import, login, wallet management | auth, critical, p0 | +| 2. Coin Management | COIN-001, COIN-002, DASH-001 | Enable DOC/MARTY, verify dashboard toggles | coin, p1 | +| 3. Faucet Funding | SEND-001, SEND-002a | Fund wallets with test coins via in-app faucet | prerequisite, p0 | +| 4. Send/Withdraw | SEND-003, SEND-004, SEND-005 | DOC send happy path, address validation, amount boundaries | send, critical, p0 | +| 5. 
DEX | DEX-001, DEX-003 | Maker order creation, input validation | dex, critical, p0 | +| 6. Bridge | BRDG-001, BRDG-002, BRDG-003 | Bridge transfer, unsupported pairs, boundaries | bridge, critical, p0 | +| 7. NFT | NFT-001 | NFT list/detail view (if enabled) | nft, p1 | +| 8. Settings | SET-002, SET-003, NAV-001, NAV-003 | Privacy toggles, test coin impact, navigation, unsaved changes | settings, p1 | +| 9. Bot | BOT-001, BOT-002 | Market maker bot creation, validation | bot, p1 | +| 10. Fiat | FIAT-001, FIAT-002 | Fiat access, form validation | fiat, p0 | +| 11. Security | SECX-002, SECX-003, SECX-004 | Seed backup, unban pubkeys, password change | security, p0 | +| 12. Custom Token | CTOK-001, CTOK-002 | Token import, error handling | custom_token, p1 | +| 13. Localization | L10N-001 | Translation completeness check | l10n, p2 | +| 14. Feature Gating | GATE-001, GATE-003 | Disabled feature tooltips, NFT gate | gating, p1 | +| 15. Support/Misc | SUP-001, FEED-001, SETX-001, SETX-007, WALX-001, WADDR-001/002 | Support, feedback, advanced settings, addresses | p2 | + +### 7.2 Test Case Format + +Each automated test case in `test_matrix.yaml` follows this structure: + +**id:** Unique identifier prefixed `GW-AUTO-` to distinguish from manual IDs. + +**source_manual_id:** Maps back to the original manual test case ID for traceability. + +**tags:** Array of tags for filtering (smoke, critical, p0/p1/p2, module name). + +**steps:** Ordered list of action + checkpoint pairs. Each action describes what to do visually; each checkpoint describes what must be true before proceeding. + +**prompt:** Alternative to steps for simpler tests — a single natural-language prompt. + +**expected_result:** Human-readable expected outcome for the report. + +**extraction_schema:** JSON Schema defining structured data to extract from the final screen state. Used by Skyvern to return machine-comparable fields. 
+ +**max_steps / timeout:** Safety limits per test case (default 30 steps, 180 seconds). + +### 7.3 Example Test Case + +```yaml +- id: GW-AUTO-SEND-003 + name: "Send DOC happy path" + source_manual_id: GW-MAN-SEND-003 + tags: [send, critical, p0, smoke] + timeout: 240 + steps: + - action: > + Navigate to the wallet or coin that holds DOC. Look for 'DOC' or + 'Document' in your wallet/coin list and click on it. + checkpoint: "The DOC coin detail screen is visible with a balance > 0." + + - action: > + Click the 'Send' or 'Withdraw' button. It may appear as an icon + with an upward arrow or the word 'Send'. + checkpoint: "A send form is visible with fields for recipient address and amount." + + - action: > + Enter the recipient address 'RReplaceMeWithValidDOCAddress' into + the address/recipient field. Enter '0.001' into the amount field. + checkpoint: "Both fields are filled. No error messages are shown." + + - action: > + Click the 'Send', 'Confirm', or 'Submit' button to initiate the + transaction. If a confirmation dialog appears, confirm it. + checkpoint: "A success message, pending indicator, or transaction hash is displayed." + + expected_result: "Transaction submitted; fee and amount match; status is Pending or Confirmed." + extraction_schema: + type: object + properties: + transaction_submitted: + type: boolean + success_or_pending_message: + type: string + fee_displayed: + type: string + transaction_hash: + type: string +``` + +### 7.4 Regression Pack Filtering + +```bash +# Smoke pack (fastest gate check) +python runner/runner.py --tag smoke + +# Critical money-movement tests +python runner/runner.py --tag critical + +# P0 only (highest priority) +python runner/runner.py --tag p0 + +# Full automated suite +python runner/runner.py +``` + +--- + +## 8. Manual Test Companion + +36 test items that must remain manual. Provided as `manual_companion.yaml` — a structured pass/fail checklist that runs alongside the automated suite for full coverage. 
+ +### 8.1 Categories + +| Category | Count | Examples | +| ------------------------------ | ----- | ---------------------------------------------------------------- | +| Hardware wallet (Trezor) | 2 | Connect/sign, restricted modules | +| Network manipulation | 7 | Offline indicators, interrupted transactions, recovery | +| App lifecycle/restart | 7 | Session persistence, settings retention, quick-login | +| Cross-platform + responsive | 4 | Multi-platform parity, breakpoint behaviour, orientation | +| Accessibility | 3 | Keyboard nav, screen reader, contrast/scaling | +| Security/privacy (OS-level) | 3 | Auto-lock, app-switcher, clipboard | +| Filesystem operations | 3 | Export/import JSON, download logs | +| Deep link / clock manipulation | 2 | Auth gating on deep links, clock warning banner | +| Grade-B manual verification | 5 | Clipboard checks, explorer links, provider webview, export files | + +Together, the 43 automated tests and 36 manual checklist items cover the full scope of the original 85 manual test cases with no gaps. + +--- + +## 9. Implementation Artifacts + +The complete runner consists of 7 Python modules plus the YAML test matrix. 
+ +### 9.1 Data Models (`models.py`) + +```python +from pydantic import BaseModel +from typing import Optional + +class TestCase(BaseModel): + id: str + name: str + tags: list[str] = [] + prompt: str = "" + steps: Optional[list[dict]] = None + expected_result: str + extraction_schema: Optional[dict] = None + max_steps: Optional[int] = None + timeout: Optional[int] = None + source_manual_id: Optional[str] = None + +class TestResult(BaseModel): + test_id: str + test_name: str + tags: list[str] + status: str # PASS | FAIL | ERROR | SKIP + skyvern_status: str + expected: str + extracted_data: Optional[dict | str] = None + duration_seconds: float + run_id: Optional[str] = None + error: Optional[str] = None + +class TestRun(BaseModel): + timestamp: str + base_url: str + total: int + passed: int + failed: int + errors: int + skipped: int + flaky: int = 0 + results: list[TestResult] + voted_results: list[dict] = [] +``` + +### 9.2 Pre-flight Checks (`preflight.py`) + +Validates Ollama responsiveness and inference, VRAM availability (>15 GB free), Skyvern HTTP health, and Flutter app reachability. Returns `False` on any failure, causing the runner to abort with exit code 2. Full implementation in Section 5.2. + +### 9.3 Prompt Builder (`prompt_builder.py`) + +Automatically prepends the Flutter render-wait preamble to every prompt, converts checkpoint-based step lists into sequential prompts with verification gates, and appends a completion suffix requesting explicit success/error reporting. Full implementation in Section 5.2. + +### 9.4 Majority Vote (`retry.py`) + +Runs each test N times and determines the final verdict. All-agree = that status at 100% confidence. Majority-agree = that status at majority/total confidence. No majority = FLAKY. All-ERROR with some non-error = FLAKY. Includes early exit: skip remaining retries if first 2 pass, or stop early if all attempts are ERROR. Full implementation in Section 5.2. 
+ +### 9.5 Timeout Guards (`guards.py`) + +Wraps every Skyvern task call in an `asyncio.wait_for()` with a configurable timeout (default 180 seconds). Raises `TestTimeoutError` with a diagnostic message including the test ID and timeout value, allowing the runner to log the issue and continue to the next test. Full implementation in Section 5.2. + +### 9.6 Ollama Monitor (`ollama_monitor.py`) + +Background asyncio task polling every 10 seconds: checks Ollama HTTP endpoint, nvidia-smi VRAM free/used/temperature. Flags unhealthy if VRAM < 500 MB free, temperature > 90°C, or Ollama stops responding. The runner checks `monitor.healthy` before each test attempt. Full implementation in Section 5.2. + +### 9.7 Hardened Runner (`runner.py`) + +The main orchestration script. Loads the YAML matrix, applies tag filtering, runs pre-flight checks, starts the Ollama monitor, iterates tests with retry+majority-vote, applies early exit optimisation, and writes results. + +```python +# runner/runner.py (hardened version) — key structure + +async def main(matrix_path: str, tag_filter: str = None, single: bool = False): + matrix = load_matrix(matrix_path) + config = matrix["config"] + + # Pre-flight + if not await run_preflight(config): + sys.exit(2) + + # Filter tests + tests = [TestCase(**t) for t in matrix["tests"]] + if tag_filter: + tests = [t for t in tests if tag_filter in t.tags] + + # Start infrastructure + skyvern = Skyvern(base_url="http://localhost:8000", api_key="local") + monitor = OllamaMonitor() + await monitor.start() + + # Execute with retries + voted_results = [] + for test in tests: + voted = await run_test_with_retries(skyvern, test, config, setup, monitor) + voted_results.append(voted) + + await monitor.stop() + + # Write results + run_dir = Path(f"results/run_{timestamp}") + run_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "results.json").write_text(json.dumps(results, indent=2)) + generate_html_report(run, run_dir / "report.html") + + # Exit codes + if 
failed > 0 or errors > 0: + sys.exit(1) + elif flaky > 0: + sys.exit(3) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--matrix", default="tests/test_matrix.yaml") + parser.add_argument("--tag", default=None) + parser.add_argument("--single", action="store_true") + args = parser.parse_args() + asyncio.run(main(args.matrix, args.tag, args.single)) +``` + +### 9.8 HTML Reporter (`reporter.py`) + +Generates a styled dark-theme HTML report with summary statistics (total, passed, failed, errors, flaky, pass rate) and a results table showing test ID, name, tags, status with colour coding, duration, extracted data preview, and error messages. Saved alongside `results.json` in each timestamped run directory. + +--- + +## 10. Execution Strategy + +### 10.1 Phased Rollout + +**Week 1–2 (Infrastructure):** Run `setup.sh`. Validate Ollama + Skyvern connectivity. Execute a single trivial test (navigate to dashboard, verify it loads) to confirm the vision loop works end-to-end. + +**Week 3–4 (Smoke Suite):** Run the smoke-tagged subset (7–8 tests). Tune prompts and timeouts based on actual Skyvern + Ollama behaviour. Establish baseline pass rate. Target: >90% stability before expanding. + +**Week 5–6 (Critical Suite):** Add critical-tagged tests (money movement: send, DEX, bridge). These are the highest-value automated checks. Tune retry counts and checkpoint language. + +**Week 7–8 (Full Suite):** Enable all 43 automated tests. Run in CI on every staging deployment. Measure flaky rate and triage unstable tests. + +**Ongoing:** Expand manual companion tests to automation as Flutter rendering stabilises and Skyvern capabilities evolve. Target: increase Grade A percentage from 47% to 60%+ over 6 months. + +### 10.2 CI Integration + +```bash +#!/usr/bin/env bash +# ci-pipeline.sh + +# Start infrastructure +ollama serve & +docker compose up -d +sleep 10 + +# Run smoke gate +python runner/runner.py --tag smoke --single +SMOKE_EXIT=$? 
+
+if [ "$SMOKE_EXIT" -ne 0 ]; then
+  echo "SMOKE GATE FAILED — blocking deployment"
+  exit 1
+fi
+
+# Run full suite with retries
+python runner/runner.py --matrix tests/test_matrix.yaml
+FULL_EXIT=$?
+
+# Upload report as artifact
+cp results/run_*/report.html "$CI_ARTIFACTS_DIR"/
+
+exit $FULL_EXIT
+```
+
+### 10.3 Tag Filtering Strategy
+
+| Scenario | Command | Tests | Time |
+| ---------------------- | ---------------------- | ----- | --------- |
+| Pre-merge gate | `--tag smoke` | ~8 | 5–10 min |
+| Nightly regression | `--tag critical` | ~20 | 15–30 min |
+| Full weekly regression | (no filter) | 43 | 30–60 min |
+| Quick infra check | `--tag smoke --single` | ~8 | 3–5 min |
+
+### 10.4 Test Data Population
+
+Before running the suite, the `test_data` section in `test_matrix.yaml` must be populated with actual QA environment values:
+
+| Key | Description | Example |
+| ------------------------- | ---------------------------- | --------------------------- |
+| `wallet_password` | QA environment password | `TestPass123!` |
+| `import_seed_12` | Valid testnet 12-word seed | `abandon abandon ... about` |
+| `doc_recipient_address` | Valid DOC testnet address | `R9o9xTocqr6...` |
+| `marty_recipient_address` | Valid MARTY testnet address | `R4kL2xPqm7...` |
+| `evm_token_contract` | Test ERC-20 contract address | `0x1234...abcd` |
+
+---
+
+## 11. 
Performance Expectations + +| Metric | Estimate (RTX 5090 + qwen2.5-vl:32b) | +| ------------------------------------------------- | ------------------------------------ | +| Time per Skyvern step (screenshot → LLM → action) | 2–4 seconds | +| Average test case (10–15 steps) | 30–60 seconds | +| Setup task per test (login, ~5 steps) | 10–20 seconds | +| Single test with 3x majority vote | 90–180 seconds | +| Full 43-test suite (with retries) | 30–60 minutes | +| Smoke suite (8 tests, single attempt) | 3–5 minutes | +| VRAM usage during inference | ~20 GB | +| Peak GPU utilisation during inference | 80–95% | +| Ollama idle VRAM (model loaded) | ~20 GB | + +For faster iteration during prompt tuning, use gemma3:27b (~16 GB VRAM, faster inference) or qwen2.5-vl:7b (~5 GB, much faster but less accurate on complex UIs). Switch models by changing `OLLAMA_MODEL` in `.env`. + +### Comparison with Manual Testing + +| Metric | Manual | Automated | +| --------------------- | ----------------- | ---------------------------------- | +| Full regression cycle | ~52 hours | ~1 hour | +| Smoke check | ~4 hours | ~5 minutes | +| Cost per run | Human tester time | Electricity (~$0.10) | +| Consistency | Varies by tester | 90–95% stable | +| Coverage | 85 tests | 43 automated + 36 manual companion | +| Time reduction | Baseline | ~75% reduction | + +--- + +## 12. Risks and Limitations + +### 12.1 Inherent Limitations + +| Limitation | Why | Workaround | +| ---------------------------------- | ----------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------- | +| No 100% deterministic results | LLMs are probabilistic. Even temperature=0 varies with subtle screenshot differences. | Majority vote targets 90–95% consistency. Accept this as the realistic ceiling. | +| No pixel-perfect visual validation | LLM describes what it sees in natural language, not coordinates. 
Cannot detect 2px misalignment. | Supplement with pixelmatch or BackstopJS for visual regression. | +| No complex gestures | Skyvern supports click, type, scroll. Pinch, long-press, drag are unreliable. | Test gesture-dependent features manually. | +| No timing assertions | LLM cannot measure load times or animation duration. | Use Playwright performance APIs in a separate non-LLM test suite. | +| No cross-test state | Each test runs in isolation. If Test A creates data for Test B, it requires a shared database or API. | Add teardown/setup hooks or use a test database reset endpoint. | +| 32% of tests remain manual | Hardware, OS, accessibility, and cross-platform tests are architecturally impossible in-browser. | Run manual_companion.yaml alongside automated suite. | + +### 12.2 Operational Risks + +| Risk | Impact | Mitigation | +| ---------------------------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ | +| Vision model misidentifies Flutter UI elements | Tests click wrong things, produce false passes | Use 32B model for best accuracy. Add explicit visual descriptions in prompts. Majority vote catches intermittent errors. | +| Ollama OOM on 32B model | All tests fail with ERROR | Pre-flight VRAM check aborts early. Drop to 7B for debugging. RTX 5090 32 GB handles Q4 32B comfortably. | +| Skyvern + Ollama integration bugs | Tasks hang or produce garbled output | Pin Skyvern version. Test with skyvern-1.0 engine first. Monitor Skyvern GitHub issues. | +| Flutter app load time causes timeouts | Tests fail before the app renders | Increase BROWSER_ACTION_TIMEOUT_MS. Flutter preamble adds explicit wait instructions. | +| Faucet rate-limiting blocks test funding | Send/DEX/Bridge tests cannot execute | Pre-fund test wallets. Add faucet cooldown handling. Run funding phase once, not per test. 
|
+| Prompt drift after app UI changes | Tests fail because prompts describe old UI | Maintain prompts alongside app releases. Use visual descriptions, not hard-coded labels. |
+
+---
+
+## Companion Files
+
+| File | Description |
+| ----------------------- | ----------------------------------------------------------------------------------- |
+| `test_matrix.yaml` | 43 automated test cases with Skyvern prompts, extraction schemas, and tag filtering |
+| `manual_companion.yaml` | 36 manual-only checklist items for Grade-C and Grade-B verification steps |
+
+---
+
+_End of document._ 
diff --git a/automated_testing/gleec-qa-evaluation.md b/automated_testing/gleec-qa-evaluation.md
new file mode 100644
index 0000000000..8bdfb1a0ed
--- /dev/null
+++ b/automated_testing/gleec-qa-evaluation.md
@@ -0,0 +1,196 @@
+# Gleec Wallet Test Cases: Automation Suitability Evaluation & Overhauled Test Matrix
+
+> Evaluates `GLEEC_WALLET_MANUAL_TEST_CASES.md` against the Skyvern + Ollama vision-based automation architecture.
+
+---
+
+## 1. Executive Assessment
+
+The test cases document is excellent manual QA documentation. It is not suitable for vision-based automation in its current form, and roughly a third of it (the Grade C cases) can never be automated with this stack at all. The document needs to be split into two separate artifacts: one that feeds the Skyvern runner, and one that remains a manual checklist. 
+ +**What the document does well:** + +- Comprehensive coverage across 26+ modules with 80+ test cases +- Proper risk-based prioritisation (P0–P3, S1–S4) +- Strong test data strategy with wallet profiles, coin sets, address sets, amount sets +- Good traceability matrix linking features to test IDs +- Realistic time estimates and parallel tester allocation +- Regression pack definitions (smoke, critical, full) + +**What makes it unsuitable for Skyvern automation as-is:** + +- Written for human judgment, not machine-executable prompts +- Steps are abstract and assume contextual understanding ("Attempt to proceed without confirming backup" — what does that look like visually?) +- Many test cases are compound: a single case tests 4–6 different things requiring different verdicts +- No visual descriptions of UI elements (Skyvern needs "click the blue button labeled Send", not "open send screen") +- ~35% of cases require capabilities outside the browser (network toggling, hardware wallets, screen readers, clipboard inspection, app-switcher behaviour, device rotation) +- Expected results are qualitative ("clear error messaging") rather than extractable assertions + +--- + +## 2. Test Case Classification + +Every test case classified by automation suitability with Skyvern + Ollama. + +### Classification Key + +| Grade | Meaning | Action | +|-------|---------|--------| +| **A — Fully automatable** | Pure UI interaction within a web browser. All steps and verification are visual. | Convert to Skyvern prompt. | +| **B — Partially automatable** | Some steps are automatable, but verification or setup requires human/external action. | Split: automate the UI steps, flag verification as manual. | +| **C — Manual only** | Requires hardware, OS-level actions, network manipulation, screen reader, or cross-platform device. | Keep in manual checklist. Remove from automation matrix. 
| + +### Full Classification + +| Test ID | Module | Title | Grade | Reason | +|---------|--------|-------|-------|--------| +| AUTH-001 | Auth | Create wallet with seed backup | **A** | UI-only flow: tap, enter password, navigate seed screens | +| AUTH-002 | Auth | Login/logout with remember-session | **B** | Login/logout automatable; "close and relaunch app" requires session restart outside Skyvern | +| AUTH-003 | Auth | Import wallet from seed | **A** | UI-only: enter seed, set password, verify balances | +| AUTH-004 | Auth | Invalid password attempts + lockout | **A** | UI-only: enter wrong passwords, observe lockout messages | +| AUTH-005 | Auth | Trezor connect/disconnect | **C** | Requires physical hardware wallet + USB | +| WAL-001 | Wallet Manager | Create, rename, select wallets | **A** | UI-only multi-step flow | +| WAL-002 | Wallet Manager | Delete wallet with confirmation | **A** | UI-only: cancel/confirm delete dialogs | +| WAL-003 | Wallet Manager | Selection persistence across restart | **C** | Requires app restart + re-login outside browser | +| COIN-001 | Coin Manager | Test coin visibility gate | **A** | Toggle setting, search, verify visibility | +| COIN-002 | Coin Manager | Activate/deactivate with search/filter | **A** | UI-only search and toggle | +| COIN-003 | Coin Manager | Deactivate coin with balance + restore | **A** | UI-only with warning dialog | +| DASH-001 | Dashboard | Hide balances / hide zero toggles | **A** | Toggle and verify masking | +| DASH-002 | Dashboard | Balance refresh + offline indicator | **C** | Requires network toggle (OS-level) | +| DASH-003 | Dashboard | Dashboard persistence across restart | **C** | Requires app restart | +| CDET-001 | Coin Details | Address display, copy, QR, explorer | **B** | View/QR automatable; clipboard + external explorer are manual | +| CDET-002 | Coin Details | Transaction list + status progression | **B** | List view automatable; pending→confirmed requires real-time chain state | +| 
CDET-003 | Coin Details | Price chart + no-data/network fallback | **B** | Chart view automatable; offline fallback requires network toggle | +| SEND-001 | Send | Faucet funding success | **A** | Click faucet button, verify incoming tx in UI | +| SEND-002 | Send | Faucet cooldown/denied + network error | **B** | Cooldown automatable; network error requires network toggle | +| SEND-003 | Send | DOC send happy path | **A** | Enter recipient, amount, confirm, track in history | +| SEND-004 | Send | Address validation + memo/tag | **A** | Enter invalid addresses, verify error messages | +| SEND-005 | Send | Amount boundary + insufficient funds | **A** | Enter boundary amounts, verify validation messages | +| SEND-006 | Send | Interrupted send + duplicate-submit | **C** | Requires network kill mid-transaction, app backgrounding | +| DEX-001 | DEX | Maker limit order creation | **A** | Select pair, enter price/amount, submit, verify in open orders | +| DEX-002 | DEX | Taker order execution | **B** | Depends on orderbook liquidity in test environment | +| DEX-003 | DEX | DEX validation (invalid inputs) | **A** | Enter invalid values, verify error messages | +| DEX-004 | DEX | Order lifecycle: partial fill, cancel | **B** | Cancel automatable; partial fill depends on external market activity | +| DEX-005 | DEX | Swap history filtering + export | **B** | Filtering automatable; export file verification needs filesystem access | +| DEX-006 | DEX | DEX recovery after restart/network drop | **C** | Requires app closure and network toggling | +| BRDG-001 | Bridge | Bridge transfer happy path | **A** | Select pair, enter amount, confirm, track status | +| BRDG-002 | Bridge | Unsupported pair validation | **A** | Select unsupported pair, verify blocking message | +| BRDG-003 | Bridge | Amount boundaries + insufficient funds | **A** | Enter boundary amounts, verify error messages | +| BRDG-004 | Bridge | Bridge failure/timeout + recovery | **C** | Requires network interruption and 
app restart | +| NFT-001 | NFT | NFT list/details/history filtering | **A** | Browse, filter, view details | +| NFT-002 | NFT | NFT send happy path | **A** | Enter recipient, confirm, monitor history | +| NFT-003 | NFT | NFT send failure + recovery | **A** | Enter invalid recipient, verify error, retry | +| SET-001 | Settings | Theme + language + format persistence | **B** | Change settings automatable; restart persistence check is manual | +| SET-002 | Settings | Analytics/privacy toggles | **A** | Toggle on/off, verify state | +| SET-003 | Settings | Test coin toggle impact | **A** | Toggle, verify DOC/MARTY visibility | +| SET-004 | Settings | Settings persistence across restart | **C** | Requires logout/restart | +| BOT-001 | Bot | Create + start market maker bot | **A** | Fill form, save, start, verify running status | +| BOT-002 | Bot | Bot validation (invalid config) | **A** | Enter invalid values, verify error blocking | +| BOT-003 | Bot | Edit, stop, restart bot | **B** | Edit/stop/restart automatable; persistence across relaunch is manual | +| NAV-001 | Navigation | Route integrity + back navigation | **A** | Click through all menu items, use back button | +| NAV-002 | Navigation | Deep link + auth gating | **C** | Requires direct URL entry while logged out + auth redirect chain | +| NAV-003 | Navigation | Unsaved changes prompt | **A** | Enter data in form, navigate away, interact with dialog | +| RESP-001 | Responsive | Breakpoint behaviour | **C** | Requires browser window resize (not reliably controllable via Skyvern) | +| RESP-002 | Responsive | Orientation/resize state retention | **C** | Requires device rotation or window resize mid-flow | +| XPLAT-001 | Cross-Platform | Core flow parity | **C** | By definition requires running on Android, iOS, macOS, Linux, Windows | +| XPLAT-002 | Cross-Platform | Platform permissions + input | **C** | Requires OS permission dialogs, hardware back, etc. 
| +| A11Y-001 | Accessibility | Keyboard-only navigation | **C** | Requires keyboard Tab/Shift+Tab, focus ring inspection — vision model cannot reliably judge focus state | +| A11Y-002 | Accessibility | Screen reader labels/roles | **C** | Requires VoiceOver/TalkBack | +| A11Y-003 | Accessibility | Color contrast + touch targets + text scaling | **C** | Requires pixel-level contrast analysis and OS text scaling | +| SEC-001 | Security | Seed phrase handling/reveal | **B** | Reveal flow automatable; screenshot masking policy and background behaviour are manual | +| SEC-002 | Security | Session auto-lock + app-switcher privacy | **C** | Requires idle timeout, app-switcher snapshot | +| SEC-003 | Security | Clipboard exposure risk | **C** | Requires clipboard access/monitoring outside browser | +| ERR-001 | Error Handling | Global network outage | **C** | Requires network toggle | +| ERR-002 | Error Handling | Partial backend failure isolation | **C** | Requires endpoint-specific failure simulation | +| ERR-003 | Error Handling | Stale-state reconciliation | **C** | Requires app closure during in-flight transaction | +| L10N-001 | Localization | Translation completeness | **A** | Switch locale, review UI text | +| L10N-002 | Localization | Long-string overflow/clipping | **B** | Can screenshot narrow width, but visual clipping judgment is low-confidence for vision model | +| L10N-003 | Localization | Locale-specific format consistency | **A** | Switch locale, compare date/number formatting | +| FIAT-001 | Fiat | Menu access + connect-wallet gating | **A** | Open fiat menu, verify gating, connect wallet | +| FIAT-002 | Fiat | Form validation | **A** | Enter invalid amounts, switch payment methods | +| FIAT-003 | Fiat | Checkout success via provider webview | **B** | Provider webview/dialog may be a separate domain the vision model can't follow | +| FIAT-004 | Fiat | Checkout closed/failed handling | **B** | Closing provider window mid-flow is manual | +| FIAT-005 | 
Fiat | Form behaviour across logout/login | **C** | Requires logout/re-login | +| SUP-001 | Support | Support page + links + missing coins dialog | **A** | Open page, verify content, open dialog | +| FEED-001 | Feedback | Feedback entry points | **A** | Open feedback from settings/bug button, submit/cancel | +| SECX-001 | Security Settings | Private key export flow | **B** | Auth + toggle automatable; download/share actions may cross browser boundary | +| SECX-002 | Security Settings | Seed backup show/confirm/success | **A** | Auth, reveal, confirm challenge — all visual | +| SECX-003 | Security Settings | Unban pubkeys | **A** | Trigger action, observe results | +| SECX-004 | Security Settings | Change password flow | **A** | Enter old/new passwords, verify rejection/acceptance | +| SETX-001 | Settings Advanced | Weak-password toggle | **A** | Toggle setting, attempt wallet create with weak password | +| SETX-002 | Settings Advanced | Trading bot master toggles | **B** | Toggle automatable; stop-on-disable verification depends on running bot state | +| SETX-003 | Settings Advanced | Export/import maker orders JSON | **C** | File system import/export outside browser | +| SETX-004 | Settings Advanced | Show/export swap data | **B** | View/copy automatable; export is filesystem | +| SETX-005 | Settings Advanced | Import swaps from JSON | **C** | Requires pasting external JSON payload | +| SETX-006 | Settings Advanced | Download logs + flood logs | **C** | File download + debug build action | +| SETX-007 | Settings Advanced | Reset activated coins | **A** | Select wallet, confirm reset, verify | +| WALX-001 | Wallet Advanced | Overview cards + privacy toggle | **A** | View cards, toggle privacy, verify masking | +| WALX-002 | Wallet Advanced | Assets/Growth/PnL tabs | **B** | Tab switching automatable; logged-out fallback requires logout | +| WADDR-001 | Coin Addresses | Multi-address display + controls | **A** | Toggle hide-zero, expand/collapse, copy, QR, faucet | 
+| WADDR-002 | Coin Addresses | Create new address flow | **A** | Click create, confirm, verify new address appears | +| CTOK-001 | Custom Token | Import happy path | **A** | Select network, enter contract, fetch, confirm import | +| CTOK-002 | Custom Token | Fetch failure + not-found | **A** | Enter invalid contract, verify error | +| CTOK-003 | Custom Token | Back/cancel + state reset | **A** | Navigate back, close dialog, verify clean state | +| RWD-001 | Rewards | KMD rewards refresh + claim | **B** | View automatable; claim depends on reward availability | +| GATE-001 | Feature Gating | Trading-disabled mode | **A** | Verify disabled menu items, tooltips | +| GATE-002 | Feature Gating | Hardware-wallet restrictions | **C** | Requires Trezor login | +| GATE-003 | Feature Gating | NFT menu disabled + direct route | **A** | Verify disabled state, attempt direct navigation | +| QLOG-001 | Quick Login | Remember-me persistence | **C** | Requires app relaunch | +| BREF-001 | Bitrefill | Integration visibility + lifecycle | **B** | Button visibility automatable; widget interaction crosses domains | +| ZHTL-001 | ZHTLC | Configuration dialog + activation | **B** | Dialog automatable; logout-during-activation is manual | +| WARN-001 | System Health | Clock warning banner | **C** | Requires system clock manipulation | + +### Summary Count + +| Grade | Count | Percentage | +|-------|-------|------------| +| **A — Fully automatable** | 40 | 47% | +| **B — Partially automatable** | 18 | 21% | +| **C — Manual only** | 27 | 32% | +| **Total** | 85 | 100% | + +--- + +## 3. Structural Problems for Automation + +Beyond per-case suitability, these structural issues in the original document prevent direct conversion: + +**Problem 1: Compound test cases.** +AUTH-001 tests five things in one case: tap create wallet, enter password, attempt to skip seed backup, complete seed confirmation, finish onboarding. 
For a vision-based agent, this needs to be 2–3 separate tasks to avoid state corruption at step 3 causing steps 4–5 to run against the wrong screen. + +**Problem 2: No visual element descriptions.** +Every case says "Open DEX" or "Enter amount" without describing what the DEX screen looks like, what the amount field looks like, or what distinguishes it from adjacent inputs. Skyvern needs: "Look for the input field labeled 'Amount' below the recipient address field, with a coin ticker symbol next to it." + +**Problem 3: Abstract expected results.** +"Validation blocks invalid orders with specific guidance" is not machine-evaluable. The automation needs: "A red error message or banner appears on screen containing the word 'invalid', 'insufficient', or 'minimum'." + +**Problem 4: No test data in-line.** +The cases reference AS-01, AM-03, WP-02 — but the automation prompt must contain the actual address string, the actual amount value, and the actual seed phrase. The runner cannot look up a test data matrix. + +**Problem 5: Missing setup/teardown coupling.** +Many Grade-A tests assume DOC/MARTY are already funded (from SEND-001). The automation needs explicit dependency ordering or the setup block must handle funding. + +--- + +## 4. 
Recommended Architecture: Two Documents
+
+```
+GLEEC_WALLET_MANUAL_TEST_CASES.md (original — keep as-is)
+    │
+    ├── Remains the canonical QA reference for manual testers
+    ├── All 85 test cases, all platforms, all edge cases
+    └── Used by human QA team for full regression
+
+automated_testing/test_matrix.yaml (NEW — Skyvern automation)
+    │
+    ├── Grade-A tests converted to vision-compatible prompts
+    ├── Grade-B tests with automatable portions only
+    ├── Hardened with checkpoints, explicit data, visual descriptions
+    └── Used by the Skyvern runner for automated regression
+
+automated_testing/manual_companion.yaml (NEW — manual-only checklist)
+    │
+    ├── Grade-C tests formatted as pass/fail checklist
+    ├── Grade-B manual verification steps
+    └── Run alongside automation for full coverage
+```
diff --git a/automated_testing/manual_companion.yaml b/automated_testing/manual_companion.yaml
new file mode 100644
index 0000000000..593d35a159
--- /dev/null
+++ b/automated_testing/manual_companion.yaml
@@ -0,0 +1,287 @@
+# Gleec Wallet — Manual-Only Test Companion Checklist
+#
+# These 14 tests CANNOT be automated even with OS calls and Playwright.
+# They require physical hardware, screen reader software, native platform
+# builds, or real-time market conditions.
+#
+# 22 formerly-manual tests have been moved to test_matrix.yaml as composite
+# tests (tagged "composite") using OS network toggle, Playwright browser
+# lifecycle, viewport resizing, clock mocking, and accessibility auditing.
+# +# Usage: +# Automated runner integration: python -m runner.runner --include-manual +# Manual-only mode: python -m runner.runner --manual-only + +manual_tests: + + # ========================================================================= + # HARDWARE WALLET (Trezor) — requires physical device + # ========================================================================= + + - id: MAN-AUTH-005 + source: GW-MAN-AUTH-005 + title: "Trezor connect/disconnect and signing" + reason: "Requires physical Trezor device + USB" + platforms: [web, macOS, Linux, Windows] + tags: [hardware, security, critical] + checklist: + - "[ ] Connect Trezor and import hardware account" + - "[ ] Start sign-required action (send preview) — confirm on device" + - "[ ] Disconnect device and retry action — app prompts reconnection" + - "[ ] No crash on disconnect" + interactive_steps: + - prompt: "Connect your Trezor device via USB to the computer." + wait_for: keypress + - prompt: "Open the Gleec Wallet web app and look for a hardware wallet import option. Click it to begin pairing the Trezor. Follow any prompts on the Trezor screen to authorize the connection. Is the Trezor wallet successfully imported? (y/n)" + wait_for: confirmation + - prompt: "Navigate to a coin with balance and open the Send form. Fill in a valid address and small amount, then click Preview/Confirm. The Trezor should display a signing request on its screen. Confirm on the Trezor device. Did the transaction submit successfully? (y/n)" + wait_for: confirmation + - prompt: "Disconnect the Trezor USB cable physically. Now try to initiate another send action in the app. Does the app show a reconnection prompt or clear error message (no crash)? 
(y/n)" + wait_for: confirmation + + - id: MAN-GATE-002 + source: GW-MAN-GATE-002 + title: "Hardware-wallet restrictions for fiat/trading" + reason: "Requires Trezor login" + platforms: [web, macOS, Linux, Windows] + tags: [hardware, gating] + checklist: + - "[ ] Log in with Trezor wallet" + - "[ ] Fiat/DEX/Bridge/Bot menu items show restricted/disabled state" + - "[ ] Clear wallet-only messaging shown" + interactive_steps: + - prompt: "Ensure you are logged in with a Trezor hardware wallet. Press Enter when ready." + wait_for: keypress + - prompt: "Check the main navigation menu. Are the Fiat, DEX, Bridge, and Bot menu items showing a disabled/restricted state (grayed out, lock icon, or tooltip)? (y/n)" + wait_for: confirmation + - prompt: "Try clicking one of the disabled items. Does a clear message explain the restriction for hardware wallets? (y/n)" + wait_for: confirmation + + # ========================================================================= + # PARTIAL BACKEND FAILURE — requires custom proxy/endpoint blocking + # ========================================================================= + + - id: MAN-ERR-002 + source: GW-MAN-ERR-002 + title: "Partial backend failure isolation" + reason: "Requires endpoint-specific failure simulation (proxy/firewall)" + platforms: [all] + tags: [network, error_handling] + checklist: + - "[ ] Trigger failure in one module endpoint" + - "[ ] Unaffected modules still functional" + - "[ ] Failed module recovers after service restore" + interactive_steps: + - prompt: "Set up a proxy or firewall rule to block a specific API endpoint (e.g., DEX orderbook). Press Enter when the failure condition is active." + wait_for: keypress + - prompt: "With one module's endpoint failing, do other modules (wallet, settings, etc.) still function normally? (y/n)" + wait_for: confirmation + - prompt: "Restore the blocked endpoint. Does the failed module recover and function correctly? 
(y/n)" + wait_for: confirmation + + # ========================================================================= + # CROSS-PLATFORM — requires native app builds + # ========================================================================= + + - id: MAN-XPLAT-001 + source: GW-MAN-XPLAT-001 + title: "Core-flow parity across platforms" + reason: "Requires running on Android, iOS, macOS, Linux, Windows" + platforms: [web, Android, iOS, macOS, Linux, Windows] + tags: [cross_platform] + checklist: + - "[ ] Create wallet on each platform" + - "[ ] Fund DOC via faucet on each" + - "[ ] Send DOC on each" + - "[ ] Verify history on each" + - "[ ] Document any platform-specific blockers" + interactive_steps: + - prompt: "Which platform are you testing now? (type the name and press Enter)" + wait_for: keypress + - prompt: "Create a new wallet on this platform. Does wallet creation complete? (y/n)" + wait_for: confirmation + - prompt: "Enable DOC and use the faucet. Does the faucet work? (y/n)" + wait_for: confirmation + - prompt: "Send 0.001 DOC to a test address. Does the transaction complete? (y/n)" + wait_for: confirmation + - prompt: "Check transaction history. Does it show correctly? (y/n)" + wait_for: confirmation + + - id: MAN-XPLAT-002 + source: GW-MAN-XPLAT-002 + title: "Platform permissions and input behavior" + reason: "Requires OS permission dialogs" + platforms: [all] + tags: [cross_platform] + checklist: + - "[ ] Trigger QR scan / export / notification permissions" + - "[ ] Deny permission — graceful handling" + - "[ ] Grant permission — action works" + - "[ ] Hardware back (Android) / keyboard shortcuts (desktop) work" + interactive_steps: + - prompt: "Trigger a permission request (QR scan, file export, etc.). DENY it. Does the app handle denial gracefully? (y/n)" + wait_for: confirmation + - prompt: "Trigger the same action and GRANT the permission. Does it work? 
(y/n)" + wait_for: confirmation + - prompt: "Test hardware back (Android) or keyboard shortcuts (desktop). Do they work? (y/n)" + wait_for: confirmation + + # ========================================================================= + # SCREEN READER — requires VoiceOver/TalkBack + # ========================================================================= + + - id: MAN-A11Y-002 + source: GW-MAN-A11Y-002 + title: "Screen reader labels and announcements" + reason: "Requires VoiceOver/TalkBack — cannot be automated" + platforms: [iOS, Android, desktop, web] + tags: [accessibility] + checklist: + - "[ ] Navigate key screens with screen reader" + - "[ ] Form fields have meaningful labels" + - "[ ] Validation errors announced" + - "[ ] Status changes (pending/confirmed) announced" + - "[ ] No unlabeled actionable controls" + interactive_steps: + - prompt: "Enable a screen reader (VoiceOver, TalkBack, NVDA). Navigate the dashboard. Press Enter when ready." + wait_for: keypress + - prompt: "Do form fields on the send screen have meaningful labels read by the screen reader? (y/n)" + wait_for: confirmation + - prompt: "Trigger a validation error. Is it announced by the screen reader? (y/n)" + wait_for: confirmation + - prompt: "Are there any unlabeled buttons or controls? 
(y = no unlabeled controls found; n = some controls lack labels)"
+        wait_for: confirmation
+
+  # =========================================================================
+  # NATIVE SECURITY — requires OS screenshot/app-switcher
+  # =========================================================================
+
+  - id: MAN-SEC-001b
+    source: GW-MAN-SEC-001
+    title: "Seed phrase — screenshot masking and background behavior"
+    reason: "Requires OS screenshot and app backgrounding (native behavior)"
+    platforms: [all]
+    tags: [security]
+    checklist:
+      - "[ ] Take screenshot while seed visible — masking policy applied"
+      - "[ ] Background app while seed visible — seed not exposed"
+      - "[ ] Return to app — seed screen state handled correctly"
+    interactive_steps:
+      - prompt: "Reveal the seed phrase (Settings > Security > View Seed, enter password)."
+        wait_for: keypress
+      - prompt: "Take an OS screenshot. Is the seed phrase masked/protected in the screenshot? (y/n)"
+        wait_for: confirmation
+      - prompt: "Background/minimize the app. Check the app-switcher preview. Is the seed hidden? (y/n)"
+        wait_for: confirmation
+      - prompt: "Return to the app. Is the seed screen handled correctly? (y/n)"
+        wait_for: confirmation
+
+  - id: MAN-SEC-002
+    source: GW-MAN-SEC-002
+    title: "Session auto-lock and app-switcher privacy"
+    reason: "Requires idle timeout and native app-switcher"
+    platforms: [all]
+    tags: [security]
+    checklist:
+      - "[ ] Set short inactivity timeout"
+      - "[ ] Leave idle until timeout — re-auth required"
+      - "[ ] App-switcher snapshot — no sensitive data visible"
+    interactive_steps:
+      - prompt: "Set the auto-lock timeout to the shortest option in Settings."
+        wait_for: keypress
+      - prompt: "Leave the app idle until the timeout triggers. Does it require re-authentication? (y/n)"
+        wait_for: confirmation
+      - prompt: "Check the app-switcher preview while sensitive data is on screen. Is it hidden? 
(y/n)" + wait_for: confirmation + + # ========================================================================= + # GRADE-B: Manual verification requiring real chain/market conditions + # ========================================================================= + + - id: MAN-VERIFY-CDET-001 + source: GW-AUTO-CDET-001a + title: "Explorer link verification after address copy" + reason: "Explorer link correctness requires opening external browser" + tags: [wallet, verification] + checklist: + - "[ ] Explorer link opens correct network/address URL" + interactive_steps: + - prompt: "Click the explorer link on a coin detail page. Does it open the correct block explorer URL? (y/n)" + wait_for: confirmation + + - id: MAN-VERIFY-CDET-002 + source: GW-MAN-CDET-002 + title: "Transaction pending to confirmed progression" + reason: "Requires waiting for real chain confirmations (1-5 min)" + tags: [wallet, verification] + checklist: + - "[ ] After sending DOC, watch for pending→confirmed in history" + - "[ ] Confirmed tx shows correct hash, amount, fee, timestamp" + - "[ ] Explorer link shows matching transaction" + interactive_steps: + - prompt: "After sending DOC, watch the history. Wait for pending→confirmed (1-5 min). Did status progress? (y/n)" + wait_for: confirmation + - prompt: "Does the confirmed tx show correct hash, amount, fee, timestamp? (y/n)" + wait_for: confirmation + + - id: MAN-VERIFY-DEX-004 + source: GW-MAN-DEX-004 + title: "DEX partial fill observation" + reason: "Depends on external market activity and counterparty" + tags: [dex, verification] + checklist: + - "[ ] Place maker order with moderate size" + - "[ ] Wait for partial fill (requires counterparty)" + - "[ ] Cancel remaining — balances/locked funds reconcile" + interactive_steps: + - prompt: "Place a maker order (1 DOC). Wait for partial fill from a counterparty. Did it occur? (y/n/s to skip)" + wait_for: confirmation + - prompt: "If partially filled, cancel the remaining. Do balances reconcile? 
(y/n)" + wait_for: confirmation + + - id: MAN-VERIFY-DEX-005 + source: GW-MAN-DEX-005 + title: "History export file verification" + reason: "Requires opening and inspecting exported file contents" + tags: [dex, verification] + checklist: + - "[ ] Export CSV/file from DEX history" + - "[ ] Open file — records match visible history" + - "[ ] Timestamp/decimal formatting correct" + interactive_steps: + - prompt: "Export a file from DEX history. Open it. Do records match the visible history? (y/n)" + wait_for: confirmation + - prompt: "Are timestamps and decimal values formatted correctly? (y/n)" + wait_for: confirmation + + - id: MAN-VERIFY-FIAT-003 + source: GW-MAN-FIAT-003 + title: "Fiat checkout via provider webview" + reason: "Provider webview crosses domain boundary" + tags: [fiat, verification] + checklist: + - "[ ] Submit Buy Now — provider checkout launches" + - "[ ] Complete provider flow" + - "[ ] Return to app — success status shown" + interactive_steps: + - prompt: "Click Buy Now in the Fiat section. Does the provider checkout launch? (y/n)" + wait_for: confirmation + - prompt: "Complete the provider flow and return to the app. Is a success status shown? (y/n)" + wait_for: confirmation + + - id: MAN-VERIFY-FIAT-004 + source: GW-MAN-FIAT-004 + title: "Fiat checkout closed/failed handling" + reason: "Closing provider window mid-flow is manual" + tags: [fiat, verification] + checklist: + - "[ ] Close provider dialog before completion" + - "[ ] Failure messaging shown, no stale state" + - "[ ] Retry works cleanly" + interactive_steps: + - prompt: "Start fiat checkout. Close the provider dialog BEFORE completing payment." + wait_for: keypress + - prompt: "Does the app show appropriate messaging (no stale state)? (y/n)" + wait_for: confirmation + - prompt: "Try starting checkout again. Does it work cleanly? 
(y/n)" + wait_for: confirmation diff --git a/automated_testing/requirements.txt b/automated_testing/requirements.txt new file mode 100644 index 0000000000..c0d43fd352 --- /dev/null +++ b/automated_testing/requirements.txt @@ -0,0 +1,8 @@ +skyvern>=1.0.0 +pyyaml>=6.0 +pydantic>=2.0 +httpx>=0.27.0 +jinja2>=3.1 +rich>=13.0 +playwright>=1.40.0 +axe-playwright-python>=0.1.0 diff --git a/automated_testing/runner/__init__.py b/automated_testing/runner/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/automated_testing/runner/__init__.py @@ -0,0 +1 @@ + diff --git a/automated_testing/runner/__main__.py b/automated_testing/runner/__main__.py new file mode 100644 index 0000000000..67e0888252 --- /dev/null +++ b/automated_testing/runner/__main__.py @@ -0,0 +1,4 @@ +"""Allow running the runner as a module: python -m runner""" +from .runner import cli + +cli() diff --git a/automated_testing/runner/guards.py b/automated_testing/runner/guards.py new file mode 100644 index 0000000000..cc1ccac76d --- /dev/null +++ b/automated_testing/runner/guards.py @@ -0,0 +1,25 @@ +"""Timeout and deadlock guards for Skyvern task execution.""" + +from __future__ import annotations + +import asyncio + + +class TestTimeoutError(Exception): + """Raised when a test exceeds its allowed execution time.""" + + +async def run_with_timeout(coro, seconds: int, test_id: str): + """Run a coroutine with a hard timeout. + + Raises TestTimeoutError with a diagnostic message including the test ID + and timeout value so the runner can log the issue and continue. + """ + try: + return await asyncio.wait_for(coro, timeout=seconds) + except asyncio.TimeoutError: + raise TestTimeoutError( + f"Test {test_id} timed out after {seconds}s. " + f"This may indicate a hung browser session, Ollama stall, " + f"or the app failing to reach the expected state." 
+ ) diff --git a/automated_testing/runner/interactive.py b/automated_testing/runner/interactive.py new file mode 100644 index 0000000000..8742b436cd --- /dev/null +++ b/automated_testing/runner/interactive.py @@ -0,0 +1,174 @@ +"""Human-in-the-loop interactive prompting for Grade-C and hardware tests. + +When the runner encounters tests tagged 'manual_step' or loaded from +manual_companion.yaml with interactive_steps, this module pauses execution, +presents clear instructions to the human tester, and awaits confirmation. +""" + +from __future__ import annotations + +import asyncio +import sys +from typing import Optional + +from .models import ManualResult + +try: + from rich.console import Console + from rich.panel import Panel + from rich.prompt import Prompt + + _console = Console() + _HAS_RICH = True +except ImportError: + _HAS_RICH = False + + +def _print_header(text: str) -> None: + if _HAS_RICH: + _console.print(Panel(text, title="Manual Test", border_style="yellow")) + else: + print(f"\n{'=' * 60}") + print(f" MANUAL TEST: {text}") + print(f"{'=' * 60}\n") + + +def _print_instruction(text: str) -> None: + if _HAS_RICH: + _console.print(f" [bold cyan]>>>[/bold cyan] {text}") + else: + print(f" >>> {text}") + + +def _get_response(prompt_text: str = "Result (y=pass / n=fail / s=skip)") -> str: + if _HAS_RICH: + return Prompt.ask(f" {prompt_text}", choices=["y", "n", "s"], default="s") + while True: + resp = input(f" {prompt_text} [y/n/s]: ").strip().lower() + if resp in ("y", "n", "s"): + return resp + print(" Please enter y, n, or s.") + + +def _get_keypress(prompt_text: str = "Press Enter when ready...") -> None: + if _HAS_RICH: + _console.input(f" [dim]{prompt_text}[/dim] ") + else: + input(f" {prompt_text} ") + + +async def run_interactive_test(test_def: dict) -> ManualResult: + """Execute a manual/interactive test with human-in-the-loop prompting. 
+ + Args: + test_def: A dict from manual_companion.yaml containing at minimum: + id, title, and either 'interactive_steps' or 'checklist'. + + Returns: + ManualResult with human-provided pass/fail/skip status. + """ + test_id = test_def["id"] + title = test_def.get("title", test_id) + reason = test_def.get("reason", "") + checklist = test_def.get("checklist", []) + interactive_steps = test_def.get("interactive_steps", []) + + _print_header(f"{test_id}: {title}") + if reason: + _print_instruction(f"Reason for manual execution: {reason}") + print() + + checklist_results: list[dict] = [] + + if interactive_steps: + for i, step in enumerate(interactive_steps, 1): + prompt_text = step.get("prompt", "") + wait_for = step.get("wait_for", "confirmation") + + print(f"\n Step {i}/{len(interactive_steps)}:") + _print_instruction(prompt_text) + + if wait_for == "keypress": + _get_keypress() + checklist_results.append({"step": i, "prompt": prompt_text, "result": "acknowledged"}) + elif wait_for == "confirmation": + resp = _get_response() + status = {"y": "pass", "n": "fail", "s": "skip"}[resp] + checklist_results.append({"step": i, "prompt": prompt_text, "result": status}) + if status == "fail": + notes = input(" Failure notes (optional): ").strip() + checklist_results[-1]["notes"] = notes + else: + _get_keypress(f"{wait_for} — Press Enter when done...") + checklist_results.append({"step": i, "prompt": prompt_text, "result": "acknowledged"}) + + elif checklist: + for i, item in enumerate(checklist, 1): + item_text = item.replace("[ ] ", "").replace("[x] ", "").strip() + print(f"\n Checklist item {i}/{len(checklist)}:") + _print_instruction(item_text) + resp = _get_response() + status = {"y": "pass", "n": "fail", "s": "skip"}[resp] + checklist_results.append({"item": i, "text": item_text, "result": status}) + + print() + overall = _get_response("Overall test result (y=pass / n=fail / s=skip)") + overall_status = {"y": "PASS", "n": "FAIL", "s": "SKIP"}[overall] + notes = "" + if 
overall_status == "FAIL": + notes = input(" Failure notes: ").strip() + + return ManualResult( + test_id=test_id, + title=title, + status=overall_status, + checklist_results=checklist_results, + notes=notes, + ) + + +async def run_interactive_batch( + manual_tests: list[dict], + tag_filter: Optional[str] = None, +) -> list[ManualResult]: + """Run a batch of interactive/manual tests sequentially. + + Args: + manual_tests: List of test defs from manual_companion.yaml + tag_filter: If set, only run tests whose tags include this value + + Returns: + List of ManualResult objects. + """ + results: list[ManualResult] = [] + filtered = manual_tests + if tag_filter: + filtered = [ + t for t in manual_tests + if tag_filter in t.get("tags", []) + ] + + total = len(filtered) + print(f"\n{'=' * 60}") + print(f" INTERACTIVE TEST SESSION — {total} manual tests") + print(f"{'=' * 60}") + + for i, test_def in enumerate(filtered, 1): + print(f"\n [{i}/{total}]") + result = await run_interactive_test(test_def) + results.append(result) + + if i < total: + resp = input("\n Continue to next test? 
"""Pydantic data models for the Gleec QA test runner."""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Optional

from pydantic import BaseModel, Field


class TestStep(BaseModel):
    """One action in a stepped test, with an optional verification checkpoint."""

    action: str
    checkpoint: Optional[str] = None


class CompositePhase(BaseModel):
    """A single phase in a composite test that mixes Skyvern + OS/Playwright."""

    type: str  # skyvern | os_call | playwright | assert
    action: str = ""
    prompt: str = ""
    args: dict = Field(default_factory=dict)
    expected: Optional[str] = None
    checkpoint: Optional[str] = None
    extraction_schema: Optional[dict] = None
    max_steps: Optional[int] = None


class TestCase(BaseModel):
    """Declarative definition of a single automated test.

    A test is either prompt/step driven (Skyvern) or composite
    (a list of phases); `is_composite` distinguishes the two.
    """

    id: str
    name: str
    tags: list[str] = Field(default_factory=list)
    prompt: str = ""
    steps: Optional[list[TestStep]] = None
    phases: Optional[list[CompositePhase]] = None
    expected_result: str = ""
    extraction_schema: Optional[dict] = None
    max_steps: Optional[int] = None
    timeout: Optional[int] = None
    source_manual_id: Optional[str] = None
    manual_verification_note: Optional[str] = None

    @property
    def is_composite(self) -> bool:
        # Equivalent to "phases is not None and non-empty".
        return bool(self.phases)


class AttemptResult(BaseModel):
    """Outcome of one attempt of one test (one of several voting runs)."""

    attempt: int
    status: str  # PASS | FAIL | ERROR | TIMEOUT
    skyvern_status: str = ""
    extracted_data: Optional[dict | str] = None
    duration_seconds: float = 0.0
    run_id: Optional[str] = None
    error: Optional[str] = None
    screenshot_path: Optional[str] = None
class VotedResult(BaseModel):
    """Aggregated verdict for one test after majority voting over attempts."""

    test_id: str
    test_name: str
    tags: list[str] = Field(default_factory=list)
    final_status: str  # PASS | FAIL | FLAKY | ERROR | SKIP
    vote_counts: dict[str, int] = Field(default_factory=dict)
    confidence: float = 0.0
    expected: str = ""
    manual_verification_note: Optional[str] = None
    attempts: list[AttemptResult] = Field(default_factory=list)
    duration_seconds: float = 0.0


class ManualResult(BaseModel):
    """Human-recorded outcome of an interactive/manual test."""

    test_id: str
    title: str
    status: str  # PASS | FAIL | SKIP
    checklist_results: list[dict] = Field(default_factory=list)
    notes: str = ""


class TestRun(BaseModel):
    """Top-level report for one full run: config echo, counters, all results."""

    timestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    base_url: str = ""
    engine: str = "skyvern-2.0"
    model: str = "qwen2.5-vl:32b"
    total: int = 0
    passed: int = 0
    failed: int = 0
    errors: int = 0
    skipped: int = 0
    flaky: int = 0
    pass_rate: float = 0.0
    duration_seconds: float = 0.0
    voted_results: list[VotedResult] = Field(default_factory=list)
    manual_results: list[ManualResult] = Field(default_factory=list)

    def compute_summary(self) -> None:
        """Recompute counters, pass_rate and duration from voted_results.

        pass_rate is PASS / (PASS + FAIL + FLAKY) * 100; ERROR and SKIP
        are excluded from the denominator since they never truly executed.
        """
        tally = {"PASS": 0, "FAIL": 0, "ERROR": 0, "SKIP": 0, "FLAKY": 0}
        for r in self.voted_results:
            if r.final_status in tally:
                tally[r.final_status] += 1

        self.total = len(self.voted_results)
        self.passed = tally["PASS"]
        self.failed = tally["FAIL"]
        self.errors = tally["ERROR"]
        self.skipped = tally["SKIP"]
        self.flaky = tally["FLAKY"]

        executed = self.passed + self.failed + self.flaky
        self.pass_rate = round(self.passed / executed * 100, 1) if executed else 0.0
        self.duration_seconds = sum(r.duration_seconds for r in self.voted_results)
0000000000..360d0a7bdd --- /dev/null +++ b/automated_testing/runner/ollama_monitor.py @@ -0,0 +1,111 @@ +"""Background Ollama and GPU health monitor running during test execution.""" + +from __future__ import annotations + +import asyncio +import logging +import platform +import shutil +import subprocess + +import httpx + +logger = logging.getLogger(__name__) + + +class OllamaMonitor: + """Polls Ollama health and GPU metrics every interval_seconds. + + The runner checks ``monitor.healthy`` before each test attempt. + If unhealthy, subsequent tests are immediately marked ERROR with + the specific failure reason from ``last_error``. + """ + + def __init__( + self, + ollama_url: str = "http://localhost:11434", + interval_seconds: int = 10, + vram_critical_mb: int = 500, + temp_critical_c: int = 90, + ): + self.url = ollama_url + self.interval = interval_seconds + self.vram_critical_mb = vram_critical_mb + self.temp_critical_c = temp_critical_c + self._running = False + self._task: asyncio.Task | None = None + self.last_error: str | None = None + + @property + def healthy(self) -> bool: + return self.last_error is None + + async def start(self) -> None: + self._running = True + self._task = asyncio.create_task(self._monitor_loop()) + logger.info("Ollama monitor started (interval=%ds)", self.interval) + + async def stop(self) -> None: + self._running = False + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + logger.info("Ollama monitor stopped") + + async def _monitor_loop(self) -> None: + while self._running: + try: + await self._check_ollama_http() + await self._check_gpu() + except asyncio.CancelledError: + break + except Exception as exc: + self.last_error = f"Monitor error: {exc}" + logger.warning("Monitor exception: %s", exc) + + await asyncio.sleep(self.interval) + + async def _check_ollama_http(self) -> None: + try: + async with httpx.AsyncClient(timeout=5) as client: + resp = await 
client.get(f"{self.url}/api/tags") + if resp.status_code != 200: + self.last_error = f"Ollama returned HTTP {resp.status_code}" + else: + self.last_error = None + except Exception as exc: + self.last_error = f"Ollama unreachable: {exc}" + + async def _check_gpu(self) -> None: + nvidia_smi = shutil.which("nvidia-smi") + if nvidia_smi is None and platform.system() == "Linux": + nvidia_smi = "/usr/lib/wsl/lib/nvidia-smi" + if nvidia_smi is None: + return + + try: + result = subprocess.run( + [ + nvidia_smi, + "--query-gpu=memory.free,memory.used,temperature.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=5, + ) + parts = result.stdout.strip().split(", ") + if len(parts) >= 3: + free_mb = int(parts[0]) + temp_c = int(parts[2]) + if free_mb < self.vram_critical_mb: + self.last_error = ( + f"VRAM critically low: {free_mb}MB free" + ) + elif temp_c > self.temp_critical_c: + self.last_error = f"GPU temperature critical: {temp_c}°C" + except Exception: + pass diff --git a/automated_testing/runner/os_automation.py b/automated_testing/runner/os_automation.py new file mode 100644 index 0000000000..be4923a3e3 --- /dev/null +++ b/automated_testing/runner/os_automation.py @@ -0,0 +1,266 @@ +"""Cross-platform OS-level automation utilities. + +Provides clipboard access and (opt-in) system-wide network toggling. + +IMPORTANT: Network toggling here is SYSTEM-WIDE — it kills connectivity +for the entire host, including the test runner, Skyvern, and Ollama. +For normal test runs, use PlaywrightSession.set_offline() instead, which +simulates network loss at the browser context level without affecting +the runner's infrastructure. + +The OS-level network functions are retained for CI environments where +the runner and Skyvern run on a separate machine from the app under test. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import platform +import shutil +import subprocess + +logger = logging.getLogger(__name__) + + +def _detect_platform() -> str: + """Detect the runtime platform category.""" + system = platform.system() + if system == "Darwin": + return "macos" + if system == "Linux": + with open("/proc/version", "r") as f: + if "microsoft" in f.read().lower(): + return "wsl2" + return "linux" + if system == "Windows": + return "windows" + return "unknown" + + +PLATFORM = _detect_platform() + + +# --------------------------------------------------------------------------- +# Network toggling +# --------------------------------------------------------------------------- + +async def set_network_enabled(enabled: bool) -> tuple[bool, str]: + """Toggle the network connection at the OS level. + + Returns (success, message). + """ + action = "enable" if enabled else "disable" + logger.info("Network %s on platform=%s", action, PLATFORM) + + try: + if PLATFORM == "macos": + return await _network_macos(enabled) + elif PLATFORM == "linux": + return await _network_linux(enabled) + elif PLATFORM == "wsl2": + return await _network_wsl2(enabled) + else: + return False, f"Unsupported platform: {PLATFORM}" + except Exception as exc: + return False, f"Network toggle failed: {exc}" + + +async def _network_macos(enabled: bool) -> tuple[bool, str]: + """Toggle Wi-Fi on macOS via networksetup.""" + iface = await _get_macos_wifi_interface() + if not iface: + return False, "No Wi-Fi interface found on macOS" + + state = "on" if enabled else "off" + proc = await asyncio.create_subprocess_exec( + "networksetup", "-setairportpower", iface, state, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + if proc.returncode == 0: + return True, f"macOS Wi-Fi ({iface}) set to {state}" + return False, f"networksetup failed: {stderr.decode()}" + + +async def 
_get_macos_wifi_interface() -> str | None: + """Find the Wi-Fi network interface name on macOS.""" + proc = await asyncio.create_subprocess_exec( + "networksetup", "-listallhardwareports", + stdout=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + lines = stdout.decode().splitlines() + for i, line in enumerate(lines): + if "Wi-Fi" in line or "AirPort" in line: + for j in range(i + 1, min(i + 3, len(lines))): + if lines[j].strip().startswith("Device:"): + return lines[j].split(":", 1)[1].strip() + return None + + +async def _network_linux(enabled: bool) -> tuple[bool, str]: + """Toggle network on Linux via nmcli or ip.""" + if shutil.which("nmcli"): + state = "on" if enabled else "off" + proc = await asyncio.create_subprocess_exec( + "nmcli", "networking", state, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + if proc.returncode == 0: + return True, f"nmcli networking {state}" + return False, f"nmcli failed: {stderr.decode()}" + + iface = await _get_linux_default_interface() + if not iface: + return False, "No default network interface found" + + action = "up" if enabled else "down" + proc = await asyncio.create_subprocess_exec( + "sudo", "ip", "link", "set", iface, action, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + if proc.returncode == 0: + return True, f"ip link set {iface} {action}" + return False, f"ip link failed: {stderr.decode()}" + + +async def _get_linux_default_interface() -> str | None: + """Find the default network interface on Linux.""" + proc = await asyncio.create_subprocess_exec( + "ip", "route", "show", "default", + stdout=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + parts = stdout.decode().split() + if "dev" in parts: + idx = parts.index("dev") + if idx + 1 < len(parts): + return parts[idx + 1] + return None + + +async def _network_wsl2(enabled: bool) -> tuple[bool, 
str]: + """Toggle network from WSL2 by calling PowerShell on the Windows host. + + Uses iptables to block/unblock outbound traffic from WSL2 since + directly toggling the Windows adapter from WSL2 requires elevated + privileges on the host. + """ + if shutil.which("iptables"): + if enabled: + proc = await asyncio.create_subprocess_exec( + "sudo", "iptables", "-D", "OUTPUT", "-j", "DROP", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + else: + proc = await asyncio.create_subprocess_exec( + "sudo", "iptables", "-A", "OUTPUT", "-j", "DROP", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + if proc.returncode == 0: + state = "restored" if enabled else "blocked" + return True, f"WSL2 outbound traffic {state} via iptables" + return False, f"iptables failed: {stderr.decode()}" + + return False, "iptables not available in WSL2" + + +# --------------------------------------------------------------------------- +# Clipboard +# --------------------------------------------------------------------------- + +async def read_clipboard() -> tuple[bool, str]: + """Read the system clipboard contents. + + Returns (success, clipboard_text_or_error). 
+ """ + try: + if PLATFORM == "macos": + proc = await asyncio.create_subprocess_exec( + "pbpaste", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode == 0: + return True, stdout.decode() + return False, f"pbpaste failed (rc={proc.returncode}): {stderr.decode()}" + + elif PLATFORM in ("linux", "wsl2"): + for cmd in (["xclip", "-selection", "clipboard", "-o"], + ["xsel", "--clipboard", "--output"]): + if shutil.which(cmd[0]): + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + if proc.returncode == 0: + return True, stdout.decode() + + if PLATFORM == "wsl2" and shutil.which("powershell.exe"): + proc = await asyncio.create_subprocess_exec( + "powershell.exe", "-Command", "Get-Clipboard", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + if proc.returncode == 0: + return True, stdout.decode().strip() + + return False, "No clipboard tool found (install xclip or xsel)" + + return False, f"Unsupported platform: {PLATFORM}" + except Exception as exc: + return False, f"Clipboard read failed: {exc}" + + +async def write_clipboard(text: str) -> tuple[bool, str]: + """Write text to the system clipboard.""" + try: + if PLATFORM == "macos": + proc = await asyncio.create_subprocess_exec( + "pbcopy", + stdin=asyncio.subprocess.PIPE, + ) + await proc.communicate(input=text.encode()) + return True, "Clipboard written (macOS)" + + elif PLATFORM in ("linux", "wsl2"): + for cmd in (["xclip", "-selection", "clipboard"], + ["xsel", "--clipboard", "--input"]): + if shutil.which(cmd[0]): + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + ) + await proc.communicate(input=text.encode()) + if proc.returncode == 0: + return True, f"Clipboard written ({cmd[0]})" + + if PLATFORM == "wsl2" and 
shutil.which("powershell.exe"): + proc = await asyncio.create_subprocess_exec( + "powershell.exe", "-Command", + "$input | Set-Clipboard", + stdin=asyncio.subprocess.PIPE, + ) + await proc.communicate(input=text.encode()) + if proc.returncode == 0: + return True, "Clipboard written (WSL2/PowerShell)" + return False, f"PowerShell Set-Clipboard failed (rc={proc.returncode})" + + return False, "No clipboard tool found" + + return False, f"Unsupported platform: {PLATFORM}" + except Exception as exc: + return False, f"Clipboard write failed: {exc}" diff --git a/automated_testing/runner/playwright_helpers.py b/automated_testing/runner/playwright_helpers.py new file mode 100644 index 0000000000..2bea16a113 --- /dev/null +++ b/automated_testing/runner/playwright_helpers.py @@ -0,0 +1,287 @@ +"""Direct Playwright automation for tasks that Skyvern cannot handle. + +Provides browser lifecycle management, viewport resizing, file downloads, +keyboard navigation auditing, accessibility scanning, and clock mocking. +These run in a separate Playwright instance from Skyvern's browser. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import tempfile +from datetime import datetime +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + + +class PlaywrightSession: + """Manages a direct Playwright browser session for composite tests. + + This is separate from Skyvern's internal Playwright instance. + Used for OS-level browser manipulation that Skyvern's API doesn't expose. 
+ """ + + def __init__(self, headless: bool = False): + self.headless = headless + self._pw = None + self._browser = None + self._context = None + self._page = None + + async def start(self, viewport: dict | None = None) -> None: + from playwright.async_api import async_playwright + self._pw = await async_playwright().start() + self._browser = await self._pw.chromium.launch(headless=self.headless) + ctx_opts = {"accept_downloads": True} + if viewport: + ctx_opts["viewport"] = viewport + self._context = await self._browser.new_context(**ctx_opts) + await self._context.grant_permissions( + ["clipboard-read", "clipboard-write"] + ) + self._page = await self._context.new_page() + + async def stop(self) -> None: + if self._browser: + await self._browser.close() + if self._pw: + await self._pw.stop() + + @property + def page(self): + return self._page + + @property + def context(self): + return self._context + + @property + def browser(self): + return self._browser + + # ------------------------------------------------------------------- + # Browser lifecycle (simulate app restart) + # ------------------------------------------------------------------- + + async def restart_session( + self, url: str, viewport: dict | None = None + ) -> None: + """Close the current context and open a fresh one (simulates app restart). + + Cookies, localStorage, and sessionStorage are wiped. 
+ """ + if self._context: + await self._context.close() + + ctx_opts = {"accept_downloads": True} + if viewport: + ctx_opts["viewport"] = viewport + self._context = await self._browser.new_context(**ctx_opts) + await self._context.grant_permissions( + ["clipboard-read", "clipboard-write"] + ) + self._page = await self._context.new_page() + await self._page.goto(url, wait_until="networkidle") + + # ------------------------------------------------------------------- + # Navigation + # ------------------------------------------------------------------- + + async def navigate(self, url: str, wait: str = "networkidle") -> None: + await self._page.goto(url, wait_until=wait) + + async def wait_for_flutter(self, seconds: float = 3.0) -> None: + """Wait for Flutter canvas to finish rendering.""" + await asyncio.sleep(seconds) + + # ------------------------------------------------------------------- + # Network simulation (per-browser-context, not system-wide) + # ------------------------------------------------------------------- + + async def set_offline(self, offline: bool = True) -> None: + """Simulate network loss/restore at the browser context level. + + This only affects the browser — the test runner, Skyvern, and Ollama + remain fully connected. 
+ """ + await self._context.set_offline(offline) + + # ------------------------------------------------------------------- + # Viewport / responsive + # ------------------------------------------------------------------- + + async def set_viewport(self, width: int, height: int) -> None: + await self._page.set_viewport_size({"width": width, "height": height}) + + async def take_screenshot(self, path: str | Path) -> str: + await self._page.screenshot(path=str(path), full_page=True) + return str(path) + + # ------------------------------------------------------------------- + # Clipboard + # ------------------------------------------------------------------- + + async def read_clipboard(self) -> str: + return await self._page.evaluate( + "async () => await navigator.clipboard.readText()" + ) + + async def write_clipboard(self, text: str) -> None: + await self._page.evaluate( + "async (t) => await navigator.clipboard.writeText(t)", text + ) + + # ------------------------------------------------------------------- + # Clock mocking + # ------------------------------------------------------------------- + + async def mock_clock(self, fake_time: datetime) -> None: + """Set a fixed fake time for Date.now() and new Date().""" + await self._page.clock.set_fixed_time(fake_time) + + async def reset_clock(self) -> None: + """Remove clock mock by reloading without the override.""" + url = self._page.url + await self._page.reload() + await self.wait_for_flutter() + + # ------------------------------------------------------------------- + # File downloads + # ------------------------------------------------------------------- + + async def trigger_download_and_capture( + self, click_selector: str | None = None, click_text: str | None = None + ) -> dict: + """Click an element that triggers a download and capture the file. + + Returns dict with: path, filename, size, content_preview. 
+ """ + async with self._page.expect_download() as dl_info: + if click_selector: + await self._page.click(click_selector) + elif click_text: + await self._page.get_by_text(click_text).click() + else: + raise ValueError("Provide click_selector or click_text") + + download = await dl_info.value + tmp = tempfile.mktemp(suffix=f"_{download.suggested_filename}") + await download.save_as(tmp) + + content = Path(tmp).read_text(encoding="utf-8", errors="replace") + return { + "path": tmp, + "filename": download.suggested_filename, + "size": Path(tmp).stat().st_size, + "content_preview": content[:500], + "is_valid_json": _is_valid_json(content), + } + + # ------------------------------------------------------------------- + # Keyboard navigation audit + # ------------------------------------------------------------------- + + async def keyboard_navigation_audit( + self, max_tabs: int = 100 + ) -> dict: + """Tab through the page and record focus order. + + Returns dict with: focused_elements (list), traps_detected (bool), + total_tabbable (int). 
+ """ + focused_elements = [] + seen_tags = set() + trap_count = 0 + prev_element = None + + for i in range(max_tabs): + await self._page.keyboard.press("Tab") + await asyncio.sleep(0.15) + + info = await self._page.evaluate("""() => { + const el = document.activeElement; + if (!el || el === document.body) return null; + return { + tag: el.tagName, + role: el.getAttribute('role'), + label: el.getAttribute('aria-label') || el.textContent?.slice(0, 50), + id: el.id, + tabIndex: el.tabIndex, + }; + }""") + + if info is None: + continue + + element_key = f"{info['tag']}:{info.get('id', '')}:{info.get('label', '')}" + + if element_key == prev_element: + trap_count += 1 + if trap_count > 3: + break + else: + trap_count = 0 + + if element_key not in seen_tags: + focused_elements.append(info) + seen_tags.add(element_key) + + prev_element = element_key + + return { + "focused_elements": focused_elements, + "total_tabbable": len(focused_elements), + "traps_detected": trap_count > 3, + } + + # ------------------------------------------------------------------- + # Accessibility audit (axe-core) + # ------------------------------------------------------------------- + + async def accessibility_audit(self) -> dict: + """Run axe-core accessibility scan on the current page. + + Returns dict with: violations_count, violations (list), + passes_count. 
+ """ + try: + from axe_playwright_python.async_playwright import Axe + axe = Axe() + results = await axe.run(self._page) + + response = getattr(results, "response", {}) or {} + raw_violations = response.get("violations", []) + raw_passes = response.get("passes", []) + + violations = [ + { + "id": v.get("id"), + "impact": v.get("impact"), + "description": v.get("description"), + "nodes_count": len(v.get("nodes", [])), + } + for v in raw_violations + ] + + return { + "violations_count": len(raw_violations), + "violations": violations, + "passes_count": len(raw_passes), + } + except ImportError: + logger.warning("axe-playwright-python not installed — skipping a11y audit") + return {"violations_count": -1, "error": "axe-playwright-python not installed"} + except Exception as exc: + return {"violations_count": -1, "error": str(exc)} + + +def _is_valid_json(content: str) -> bool: + try: + json.loads(content) + return True + except (json.JSONDecodeError, ValueError): + return False diff --git a/automated_testing/runner/preflight.py b/automated_testing/runner/preflight.py new file mode 100644 index 0000000000..e8a9de4f73 --- /dev/null +++ b/automated_testing/runner/preflight.py @@ -0,0 +1,131 @@ +"""Pre-flight health checks — validates infrastructure before any tests run.""" + +from __future__ import annotations + +import asyncio +import platform +import shutil +import subprocess + +import httpx + + +async def check_ollama(url: str = "http://localhost:11434") -> tuple[bool, str]: + """Verify Ollama is running and has at least one model loaded.""" + try: + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{url}/api/tags") + models = resp.json().get("models", []) + if len(models) > 0: + names = [m.get("name", "?") for m in models] + return True, f"Ollama OK — models: {', '.join(names)}" + return False, "Ollama running but no models loaded" + except Exception as exc: + return False, f"Ollama unreachable: {exc}" + + +async def 
check_ollama_inference( + url: str = "http://localhost:11434", + model: str = "qwen2.5-vl:32b", +) -> tuple[bool, str]: + """Run a trivial inference to confirm the GPU pipeline works.""" + try: + async with httpx.AsyncClient(timeout=120) as client: + resp = await client.post( + f"{url}/api/generate", + json={ + "model": model, + "prompt": "Reply with only the word OK.", + "stream": False, + }, + ) + output = resp.json().get("response", "").strip() + if "ok" in output.lower(): + return True, "Ollama inference OK" + return False, f"Ollama inference unexpected output: {output!r}" + except Exception as exc: + return False, f"Ollama inference failed: {exc}" + + +async def check_vram(min_gb: float = 15.0) -> tuple[bool, str]: + """Verify sufficient VRAM is free. + + Handles both Linux native nvidia-smi and Windows (via WSL2 or native). + """ + nvidia_smi = shutil.which("nvidia-smi") + if nvidia_smi is None and platform.system() == "Linux": + nvidia_smi = "/usr/lib/wsl/lib/nvidia-smi" + + if nvidia_smi is None: + return True, "nvidia-smi not found — skipping VRAM check" + + try: + result = subprocess.run( + [nvidia_smi, "--query-gpu=memory.free", "--format=csv,noheader,nounits"], + capture_output=True, + text=True, + timeout=5, + ) + free_mb = int(result.stdout.strip().split("\n")[0]) + free_gb = free_mb / 1024 + if free_gb >= min_gb: + return True, f"VRAM OK — {free_gb:.1f} GB free" + return False, f"VRAM low — {free_gb:.1f} GB free (need {min_gb}+ GB)" + except Exception as exc: + return True, f"VRAM check skipped: {exc}" + + +async def check_skyvern(url: str = "http://localhost:8000") -> tuple[bool, str]: + """Verify Skyvern server responds to heartbeat.""" + try: + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{url}/api/v1/heartbeat") + if resp.status_code == 200: + return True, "Skyvern OK" + return False, f"Skyvern returned HTTP {resp.status_code}" + except Exception as exc: + return False, f"Skyvern unreachable: {exc}" + + +async 
def check_app(url: str) -> tuple[bool, str]: + """Verify the Flutter app is reachable.""" + try: + async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client: + resp = await client.get(url) + if 200 <= resp.status_code < 300: + return True, f"App OK — HTTP {resp.status_code}" + return False, f"App returned HTTP {resp.status_code}" + except Exception as exc: + return False, f"App unreachable: {exc}" + + +async def run_preflight( + config: dict, + *, + ollama_url: str = "http://localhost:11434", + skyvern_url: str = "http://localhost:8000", + vram_min_gb: float = 15.0, +) -> tuple[bool, list[tuple[str, bool, str]]]: + """Run all pre-flight checks. Returns (all_ok, list of (name, ok, message)).""" + base_url = config.get("base_url") + if not base_url: + return False, [("Config", False, "config.base_url is missing from test matrix")] + + checks = await asyncio.gather( + check_ollama(ollama_url), + check_vram(vram_min_gb), + check_skyvern(skyvern_url), + check_app(base_url), + ) + + names = ["Ollama", "VRAM", "Skyvern", "App"] + results = [(names[i], checks[i][0], checks[i][1]) for i in range(len(checks))] + all_ok = all(ok for _, ok, _ in results) + + if all_ok: + model = config.get("ollama_model", "qwen2.5-vl:32b") + inf_ok, inf_msg = await check_ollama_inference(ollama_url, model) + results.append(("Inference", inf_ok, inf_msg)) + all_ok = inf_ok + + return all_ok, results diff --git a/automated_testing/runner/prompt_builder.py b/automated_testing/runner/prompt_builder.py new file mode 100644 index 0000000000..10ca2fbc14 --- /dev/null +++ b/automated_testing/runner/prompt_builder.py @@ -0,0 +1,78 @@ +"""Prompt construction for Skyvern tasks targeting Flutter web apps.""" + +from __future__ import annotations + +from .models import TestCase + +FLUTTER_PREAMBLE = """\ +IMPORTANT CONTEXT: +This is a Flutter web application rendered entirely on an HTML canvas element. +You cannot use DOM selectors — you must identify all elements visually. 
+ +Before taking any action on each new screen: +1. Wait 2 seconds for the page to fully render (Flutter animations to complete). +2. If you see a loading spinner, circular progress indicator, or skeleton + placeholders, wait until they disappear before proceeding. +3. If the screen appears blank or only shows a solid color, wait 3 more + seconds — Flutter may still be initialising. + +If you are unsure whether an element is a button or just text, look for +visual cues: rounded corners, drop shadows, background color contrast, +or iconography that suggests interactivity. +""" + +COMPLETION_SUFFIX = """ +After completing the task, clearly state whether you succeeded or encountered +an error. If you see an error message, snackbar, or alert dialog on screen, +report its exact text in your response.""" + + +def build_stepped_prompt(steps: list[dict]) -> str: + """Convert checkpoint-based steps into a single sequential prompt.""" + lines = [] + for i, step in enumerate(steps, 1): + action = step.get("action", "") if isinstance(step, dict) else step.action + checkpoint = step.get("checkpoint") if isinstance(step, dict) else step.checkpoint + + lines.append(f"STEP {i}: {action.strip()}") + if checkpoint: + lines.append( + f" → BEFORE proceeding to step {i + 1}, verify: {checkpoint}" + ) + lines.append( + f" → If this verification FAILS, STOP and report which step " + f"failed and why." + ) + lines.append("") + return "\n".join(lines) + + +def build_prompt( + test: TestCase, + setup_prompt: str | None = None, +) -> str: + """Assemble the full prompt for a Skyvern task. + + Combines the Flutter preamble, optional setup phase, the test body + (either stepped or freeform), and the completion suffix. 
+ """ + parts: list[str] = [FLUTTER_PREAMBLE] + + if setup_prompt: + parts.append(f"PHASE 1 — SETUP:\n{setup_prompt.strip()}\n") + parts.append("After setup is complete, proceed immediately to Phase 2.\n") + parts.append("PHASE 2 — TEST:") + + if test.steps: + step_dicts = [ + {"action": s.action, "checkpoint": s.checkpoint} + for s in test.steps + ] + parts.append(build_stepped_prompt(step_dicts)) + elif test.prompt: + parts.append(test.prompt.strip()) + else: + parts.append(f"Complete the following: {test.name}") + + parts.append(COMPLETION_SUFFIX) + return "\n".join(parts) diff --git a/automated_testing/runner/reporter.py b/automated_testing/runner/reporter.py new file mode 100644 index 0000000000..56d80caa55 --- /dev/null +++ b/automated_testing/runner/reporter.py @@ -0,0 +1,168 @@ +"""HTML report generator for test run results.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from jinja2 import Template + +from .models import TestRun + +REPORT_TEMPLATE = Template("""\ + + + + + +Gleec QA Report — {{ run.timestamp }} + + + +

Gleec Wallet QA Report

+
+ {{ run.timestamp }} · {{ run.base_url }} · + Engine: {{ run.engine }} · Model: {{ run.model }} · + Duration: {{ "%.0f"|format(run.duration_seconds) }}s +
+ +
+
{{ run.total }}
Total
+
{{ run.passed }}
Passed
+
{{ run.failed }}
Failed
+
{{ run.flaky }}
Flaky
+
{{ run.errors }}
Errors
+ +
{{ run.pass_rate }}%
Pass Rate
+
+ +

Automated Tests

+ + + + + + + + +{% for r in run.voted_results %} + + + + + + + + + + +{% endfor %} + +
IDNameTagsStatusConfidenceVotesDurationDetails
{{ r.test_id }}{{ r.test_name }}{{ r.tags | join(', ') }}{{ r.final_status }}{{ "%.0f"|format(r.confidence * 100) }}%{{ r.vote_counts }}{{ "%.1f"|format(r.duration_seconds) }}s + {% if r.attempts %} +
+ {{ r.attempts | length }} attempt(s) +
{{ r.attempts | tojson }}
+
+ {% endif %} + {% if r.manual_verification_note %} +
Manual check: {{ r.manual_verification_note }}
+ {% endif %} +
+ +{% if run.manual_results %} +

Manual / Interactive Tests

+ + + + + +{% for m in run.manual_results %} + + + + + + +{% endfor %} + +
IDTitleStatusNotes
{{ m.test_id }}{{ m.title }}{{ m.status }}{{ m.notes }}
+{% endif %} + + + +""") + + +def generate_html_report(run: TestRun, output_path: Path) -> None: + """Render the test run as a styled HTML report.""" + html = REPORT_TEMPLATE.render(run=run.model_dump()) + output_path.write_text(html, encoding="utf-8") + + +def write_json_results(run: TestRun, output_path: Path) -> None: + """Write the test run as a structured JSON file.""" + output_path.write_text( + json.dumps(run.model_dump(), indent=2, default=str), + encoding="utf-8", + ) diff --git a/automated_testing/runner/retry.py b/automated_testing/runner/retry.py new file mode 100644 index 0000000000..38c8cea293 --- /dev/null +++ b/automated_testing/runner/retry.py @@ -0,0 +1,83 @@ +"""Majority vote and retry logic for non-deterministic vision-based tests.""" + +from __future__ import annotations + +from collections import Counter + +from .models import AttemptResult, VotedResult + + +def majority_vote(attempts: list[AttemptResult], test_id: str, test_name: str, + tags: list[str], expected: str) -> VotedResult: + """Determine final verdict from multiple attempts using majority vote. 
+ + Rules: + - If ALL attempts agree → that status, confidence 1.0 + - If majority agrees → that status, confidence = majority/total + - If no majority → FLAKY, confidence = max_count/total + - If all ERROR → ERROR + - If "winner" is ERROR but PASS/FAIL exist → FLAKY + """ + statuses = [a.status for a in attempts] + counts = Counter(statuses) + total = len(attempts) + + if total == 0: + return VotedResult( + test_id=test_id, + test_name=test_name, + tags=tags, + final_status="SKIP", + vote_counts={}, + confidence=0.0, + expected=expected, + attempts=[], + ) + + most_common_status, most_common_count = counts.most_common(1)[0] + + if most_common_count > total / 2: + final_status = most_common_status + confidence = most_common_count / total + else: + final_status = "FLAKY" + confidence = most_common_count / total + + if final_status == "ERROR": + non_errors = [s for s in statuses if s != "ERROR"] + if non_errors: + final_status = "FLAKY" + + total_duration = sum(a.duration_seconds for a in attempts) + + return VotedResult( + test_id=test_id, + test_name=test_name, + tags=tags, + final_status=final_status, + vote_counts=dict(counts), + confidence=round(confidence, 2), + expected=expected, + attempts=attempts, + duration_seconds=round(total_duration, 2), + ) + + +def should_stop_early(attempts: list[AttemptResult], max_attempts: int) -> bool: + """Determine if remaining retries can be skipped. 
+
+    Early exit conditions:
+    - At least two attempts have PASSed (in any order) → skip remaining
+    - All attempts so far are ERROR (infra issue) and at least 2 done → stop
+    """
+    if len(attempts) < 2:
+        return False
+
+    pass_count = sum(1 for a in attempts if a.status == "PASS")
+    if pass_count >= 2:
+        return True
+
+    if all(a.status == "ERROR" for a in attempts):
+        return True
+
+    return False
diff --git a/automated_testing/runner/runner.py b/automated_testing/runner/runner.py
new file mode 100644
index 0000000000..2c53eee58f
--- /dev/null
+++ b/automated_testing/runner/runner.py
@@ -0,0 +1,606 @@
+"""Hardened test runner — main orchestration for Gleec QA automation.
+
+Usage:
+    python -m runner.runner [--matrix PATH] [--tag TAG] [--single]
+                            [--include-manual] [--manual-only]
+                            [--ollama-url URL] [--skyvern-url URL]
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+from .guards import TestTimeoutError, run_with_timeout
+from .interactive import run_interactive_batch
+from .models import (
+    AttemptResult, CompositePhase, ManualResult, TestCase, TestRun, VotedResult,
+)
+from .ollama_monitor import OllamaMonitor
+from .preflight import run_preflight
+from .prompt_builder import build_prompt
+from .reporter import generate_html_report, write_json_results
+from .retry import majority_vote, should_stop_early
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_RETRIES = 3
+CRITICAL_RETRIES = 5
+
+
+def load_matrix(path: str) -> dict:
+    with open(path, "r") as f:
+        return yaml.safe_load(f)
+
+
+def load_manual_tests(path: str) -> list[dict]:
+    with open(path, "r") as f:
+        data = yaml.safe_load(f)
+    return data.get("manual_tests", [])
+
+
+def parse_tests(matrix: dict) -> list[TestCase]:
+    tests_raw = matrix.get("tests", [])
+    tests = []
+    for raw in tests_raw:
+        if "steps" in raw and raw["steps"]:
+ steps_parsed = [] + for s in raw["steps"]: + if isinstance(s, dict): + steps_parsed.append(s) + else: + steps_parsed.append({"action": str(s), "checkpoint": None}) + raw["steps"] = steps_parsed + if "phases" in raw and raw["phases"]: + for phase in raw["phases"]: + phase.setdefault("args", {}) + tests.append(TestCase(**raw)) + return tests + + +async def execute_single_attempt( + test: TestCase, + config: dict, + setup_prompt: Optional[str], + skyvern_url: str, +) -> AttemptResult: + """Execute a single test attempt via the Skyvern API.""" + import httpx + + full_prompt = build_prompt(test, setup_prompt) + + start = time.monotonic() + try: + base_url = config.get("base_url") + if not base_url: + return AttemptResult( + attempt=0, + status="ERROR", + error="config.base_url is missing from test matrix", + duration_seconds=0.0, + ) + + payload = { + "url": base_url, + "navigation_goal": full_prompt, + "proxy_location": "NONE", + "navigation_payload": None, + "extracted_information_schema": test.extraction_schema, + } + + engine = config.get("default_engine", "skyvern-2.0") + max_steps = test.max_steps or config.get("max_steps_per_test", 30) + + async with httpx.AsyncClient( + base_url=skyvern_url, timeout=None + ) as client: + resp = await client.post( + "/api/v1/tasks", + json=payload, + headers={"x-api-key": "local"}, + ) + if resp.status_code not in (200, 201): + return AttemptResult( + attempt=0, + status="ERROR", + skyvern_status=f"HTTP {resp.status_code}", + error=resp.text[:500], + duration_seconds=round(time.monotonic() - start, 2), + ) + + task_data = resp.json() + task_id = task_data.get("task_id", "") + + for _ in range(max_steps * 3): + await asyncio.sleep(5) + status_resp = await client.get( + f"/api/v1/tasks/{task_id}", + headers={"x-api-key": "local"}, + ) + if status_resp.status_code != 200: + continue + + task_status = status_resp.json() + skyvern_status = task_status.get("status", "") + + if skyvern_status in ("completed", "failed", "terminated"): 
+ extracted = task_status.get("extracted_information") + is_pass = skyvern_status == "completed" + return AttemptResult( + attempt=0, + status="PASS" if is_pass else "FAIL", + skyvern_status=skyvern_status, + extracted_data=extracted, + duration_seconds=round(time.monotonic() - start, 2), + run_id=task_id, + ) + + return AttemptResult( + attempt=0, + status="ERROR", + skyvern_status="polling_timeout", + error=f"Task {task_id} did not complete within polling limit", + duration_seconds=round(time.monotonic() - start, 2), + run_id=task_id, + ) + + except TestTimeoutError as exc: + return AttemptResult( + attempt=0, + status="TIMEOUT", + error=str(exc), + duration_seconds=round(time.monotonic() - start, 2), + ) + except Exception as exc: + return AttemptResult( + attempt=0, + status="ERROR", + error=f"{type(exc).__name__}: {exc}", + duration_seconds=round(time.monotonic() - start, 2), + ) + + +async def execute_composite_attempt( + test: TestCase, + config: dict, + setup_prompt: Optional[str], + skyvern_url: str, +) -> AttemptResult: + """Execute a composite test with mixed Skyvern + OS/Playwright phases.""" + from .os_automation import read_clipboard + from .playwright_helpers import PlaywrightSession + + start = time.monotonic() + pw_session = None + phase_results = [] + + try: + for i, phase in enumerate(test.phases): + phase_type = phase.type + logger.info(" Phase %d/%d: %s — %s", + i + 1, len(test.phases), phase_type, phase.action[:60]) + + if phase_type == "skyvern": + sub_test = TestCase( + id=f"{test.id}_phase{i}", + name=phase.action[:80], + prompt=phase.prompt or phase.action, + steps=None, + expected_result=phase.expected or "", + extraction_schema=phase.extraction_schema, + max_steps=phase.max_steps or test.max_steps, + timeout=test.timeout, + ) + result = await execute_single_attempt( + sub_test, config, setup_prompt if i == 0 else None, skyvern_url + ) + phase_results.append({ + "phase": i + 1, "type": phase_type, + "status": result.status, + "extracted": 
result.extracted_data, + }) + if result.status != "PASS": + label = phase.checkpoint or phase.action[:80] + return AttemptResult( + attempt=0, status=result.status, + error=f"Phase {i+1} ({phase_type}) failed: {label}", + extracted_data={"phases": phase_results}, + duration_seconds=round(time.monotonic() - start, 2), + ) + + elif phase_type == "os_call": + action = phase.action + if action == "read_clipboard": + ok, text = await read_clipboard() + phase_results.append({"phase": i+1, "type": "os_call", "action": action, "ok": ok, "text": text}) + elif action == "wait": + seconds = phase.args.get("seconds", 5) + await asyncio.sleep(seconds) + phase_results.append({"phase": i+1, "type": "os_call", "action": "wait", "seconds": seconds}) + else: + phase_results.append({"phase": i+1, "type": "os_call", "action": action, "error": "unknown action"}) + + elif phase_type == "playwright": + if pw_session is None: + pw_session = PlaywrightSession(headless=False) + await pw_session.start() + await pw_session.navigate(config["base_url"]) + await pw_session.wait_for_flutter() + + action = phase.action + result_data = {} + + if action == "set_offline": + await pw_session.set_offline(True) + result_data = {"offline": True} + + elif action == "set_online": + await pw_session.set_offline(False) + result_data = {"offline": False} + + elif action == "restart_session": + await pw_session.restart_session(config["base_url"]) + await pw_session.wait_for_flutter(5.0) + result_data = {"restarted": True} + + elif action == "set_viewport": + w = phase.args.get("width", 1280) + h = phase.args.get("height", 800) + await pw_session.set_viewport(w, h) + await pw_session.wait_for_flutter(2.0) + result_data = {"viewport": f"{w}x{h}"} + + elif action == "screenshot": + path = phase.args.get("path", f"results/screenshots/{test.id}_phase{i}.png") + await pw_session.take_screenshot(path) + result_data = {"screenshot": path} + + elif action == "navigate": + base = config.get("base_url", "") + suffix = 
phase.args.get("url_suffix", "") + url = phase.args.get("url", base + suffix) + await pw_session.navigate(url) + await pw_session.wait_for_flutter() + result_data = {"navigated_to": url} + + elif action == "mock_clock": + from datetime import datetime as dt, timedelta + offset_hours = phase.args.get("offset_hours", 8760) + fake = dt.now() + timedelta(hours=offset_hours) + await pw_session.mock_clock(fake) + result_data = {"mocked_time": str(fake)} + + elif action == "reset_clock": + await pw_session.reset_clock() + result_data = {"clock_reset": True} + + elif action == "keyboard_audit": + audit = await pw_session.keyboard_navigation_audit( + max_tabs=phase.args.get("max_tabs", 100) + ) + result_data = audit + + elif action == "accessibility_audit": + audit = await pw_session.accessibility_audit() + result_data = audit + + elif action == "capture_download": + dl = await pw_session.trigger_download_and_capture( + click_text=phase.args.get("click_text"), + click_selector=phase.args.get("click_selector"), + ) + result_data = dl + + elif action == "read_clipboard": + text = await pw_session.read_clipboard() + result_data = {"clipboard": text} + + else: + result_data = {"error": f"Unknown playwright action: {action}"} + + phase_results.append({"phase": i+1, "type": "playwright", "action": action, **result_data}) + + elif phase_type == "assert": + prev = phase_results[-1] if phase_results else {} + check_key = phase.args.get("key", "") + check_value = phase.args.get("value") + check_contains = phase.args.get("contains") + actual = prev.get(check_key) + + passed = False + if check_value is not None: + passed = actual == check_value + elif check_contains is not None and isinstance(actual, str): + passed = check_contains in actual + elif check_key == "ok": + passed = prev.get("ok", False) is True + else: + passed = actual is not None + + phase_results.append({ + "phase": i+1, "type": "assert", + "key": check_key, "expected": check_value or check_contains, + "actual": actual, 
"passed": passed, + }) + if not passed: + return AttemptResult( + attempt=0, status="FAIL", + error=f"Assertion failed at phase {i+1}: {check_key}={actual!r}", + extracted_data={"phases": phase_results}, + duration_seconds=round(time.monotonic() - start, 2), + ) + + return AttemptResult( + attempt=0, status="PASS", + extracted_data={"phases": phase_results}, + duration_seconds=round(time.monotonic() - start, 2), + ) + + except Exception as exc: + return AttemptResult( + attempt=0, status="ERROR", + error=f"{type(exc).__name__}: {exc}", + extracted_data={"phases": phase_results}, + duration_seconds=round(time.monotonic() - start, 2), + ) + finally: + if pw_session: + try: + await pw_session.set_offline(False) + except Exception: + pass + try: + await pw_session.stop() + except Exception: + pass + + +async def run_test_with_retries( + test: TestCase, + config: dict, + setup_prompt: Optional[str], + monitor: OllamaMonitor, + skyvern_url: str, + single: bool = False, +) -> VotedResult: + """Run a test with majority vote across multiple attempts.""" + is_critical = "critical" in test.tags + num_attempts = 1 if single else (CRITICAL_RETRIES if is_critical else DEFAULT_RETRIES) + timeout = test.timeout or config.get("timeout_per_test", 180) + + attempts: list[AttemptResult] = [] + + for i in range(num_attempts): + if not monitor.healthy: + attempts.append(AttemptResult( + attempt=i + 1, + status="ERROR", + error=f"Ollama unhealthy: {monitor.last_error}", + duration_seconds=0.0, + )) + break + + logger.info( + " Attempt %d/%d for %s", i + 1, num_attempts, test.id + ) + + if test.is_composite: + coro = execute_composite_attempt(test, config, setup_prompt, skyvern_url) + else: + coro = execute_single_attempt(test, config, setup_prompt, skyvern_url) + try: + result = await run_with_timeout(coro, timeout, test.id) + except TestTimeoutError as exc: + result = AttemptResult( + attempt=i + 1, + status="TIMEOUT", + error=str(exc), + duration_seconds=float(timeout), + ) + + 
result.attempt = i + 1 + attempts.append(result) + + if should_stop_early(attempts, num_attempts): + logger.info(" Early exit for %s after %d attempts", test.id, len(attempts)) + break + + voted = majority_vote( + attempts, + test_id=test.id, + test_name=test.name, + tags=test.tags, + expected=test.expected_result, + ) + voted.manual_verification_note = test.manual_verification_note + return voted + + +async def main( + matrix_path: str, + tag_filter: Optional[str] = None, + single: bool = False, + include_manual: bool = False, + manual_only: bool = False, + ollama_url: str = "http://localhost:11434", + skyvern_url: str = "http://localhost:8000", +) -> int: + matrix = load_matrix(matrix_path) + config = matrix.get("config", {}) + setup_prompt = matrix.get("setup", {}).get("prompt") + + run = TestRun( + base_url=config.get("base_url", ""), + engine=config.get("default_engine", "skyvern-2.0"), + ) + + # ----------------------------------------------------------------------- + # Pre-flight + # ----------------------------------------------------------------------- + if not manual_only: + logger.info("Running pre-flight checks...") + all_ok, checks = await run_preflight( + config, ollama_url=ollama_url, skyvern_url=skyvern_url + ) + for name, ok, msg in checks: + status = "OK" if ok else "FAIL" + logger.info(" [%s] %s: %s", status, name, msg) + + if not all_ok: + logger.error("Pre-flight checks failed — aborting.") + return 2 + + # ----------------------------------------------------------------------- + # Automated tests + # ----------------------------------------------------------------------- + if not manual_only: + tests = parse_tests(matrix) + if tag_filter: + tests = [t for t in tests if tag_filter in t.tags] + + logger.info("Running %d automated test(s)...", len(tests)) + + monitor = OllamaMonitor(ollama_url=ollama_url) + await monitor.start() + + for i, test in enumerate(tests, 1): + logger.info("[%d/%d] %s — %s", i, len(tests), test.id, test.name) + voted = 
await run_test_with_retries( + test, config, setup_prompt, monitor, skyvern_url, single + ) + run.voted_results.append(voted) + logger.info( + " Result: %s (confidence=%.0f%%)", + voted.final_status, + voted.confidence * 100, + ) + + await monitor.stop() + + # ----------------------------------------------------------------------- + # Interactive/manual tests + # ----------------------------------------------------------------------- + if include_manual or manual_only: + manual_path = Path(matrix_path).parent / "manual_companion.yaml" + if manual_path.exists(): + manual_tests = load_manual_tests(str(manual_path)) + logger.info("Running %d interactive/manual test(s)...", len(manual_tests)) + manual_results = await run_interactive_batch(manual_tests, tag_filter) + run.manual_results = manual_results + else: + logger.warning("manual_companion.yaml not found at %s", manual_path) + + # ----------------------------------------------------------------------- + # Results + # ----------------------------------------------------------------------- + run.compute_summary() + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + run_dir = Path("results") / f"run_{timestamp}" + run_dir.mkdir(parents=True, exist_ok=True) + + write_json_results(run, run_dir / "results.json") + generate_html_report(run, run_dir / "report.html") + + logger.info("") + logger.info("=" * 60) + logger.info(" RESULTS SUMMARY") + logger.info("=" * 60) + logger.info(" Total: %d | Passed: %d | Failed: %d | Flaky: %d | Errors: %d | Skipped: %d", + run.total, run.passed, run.failed, run.flaky, run.errors, run.skipped) + logger.info(" Pass rate: %.1f%%", run.pass_rate) + logger.info(" Duration: %.0fs", run.duration_seconds) + logger.info(" Report: %s", run_dir / "report.html") + logger.info(" JSON: %s", run_dir / "results.json") + logger.info("=" * 60) + + if run.manual_results: + manual_passed = sum(1 for m in run.manual_results if m.status == "PASS") + manual_failed = sum(1 for m in 
run.manual_results if m.status == "FAIL") + logger.info(" Manual: %d passed, %d failed, %d skipped", + manual_passed, manual_failed, + len(run.manual_results) - manual_passed - manual_failed) + + # Exit codes + if run.failed > 0 or run.errors > 0: + return 1 + elif run.flaky > 0: + return 3 + return 0 + + +def cli() -> None: + parser = argparse.ArgumentParser( + description="Gleec Wallet QA Automation Runner" + ) + parser.add_argument( + "--matrix", + default="test_matrix.yaml", + help="Path to test_matrix.yaml (default: test_matrix.yaml)", + ) + parser.add_argument( + "--tag", + default=None, + help="Filter tests by tag (e.g., smoke, critical, p0)", + ) + parser.add_argument( + "--single", + action="store_true", + help="Single attempt per test (no majority vote)", + ) + parser.add_argument( + "--include-manual", + action="store_true", + help="Include interactive/manual tests after automated suite", + ) + parser.add_argument( + "--manual-only", + action="store_true", + help="Run only manual/interactive tests (skip automated)", + ) + parser.add_argument( + "--ollama-url", + default="http://localhost:11434", + help="Ollama server URL", + ) + parser.add_argument( + "--skyvern-url", + default="http://localhost:8000", + help="Skyvern server URL", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable debug logging", + ) + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", + ) + + exit_code = asyncio.run(main( + matrix_path=args.matrix, + tag_filter=args.tag, + single=args.single, + include_manual=args.include_manual, + manual_only=args.manual_only, + ollama_url=args.ollama_url, + skyvern_url=args.skyvern_url, + )) + sys.exit(exit_code) + + +if __name__ == "__main__": + cli() diff --git a/automated_testing/setup.sh b/automated_testing/setup.sh new file mode 100755 index 0000000000..6aa26951d9 --- /dev/null +++ 
b/automated_testing/setup.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "=== Gleec QA Automation Setup ===" + +# --------------------------------------------------------------------------- +# Platform detection +# --------------------------------------------------------------------------- +IS_WSL=false +IS_WINDOWS_HOST=false + +if grep -qi microsoft /proc/version 2>/dev/null; then + IS_WSL=true + IS_WINDOWS_HOST=true + echo "[platform] Running inside WSL2 (Windows host detected)" +elif [[ "$(uname -s)" == *MINGW* ]] || [[ "$(uname -s)" == *MSYS* ]]; then + IS_WINDOWS_HOST=true + echo "[platform] Running on native Windows — please use WSL2 for the runner" + exit 1 +else + echo "[platform] Running on Linux/macOS" +fi + +# --------------------------------------------------------------------------- +# 1. Ollama +# --------------------------------------------------------------------------- +if $IS_WSL; then + echo "[ollama] On WSL2: Ollama should run natively on Windows for best GPU performance." + echo "[ollama] Install from https://ollama.com/download/windows if not already installed." + echo "[ollama] Checking if Ollama is reachable on localhost:11434..." + if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then + echo "[ollama] Ollama is reachable from WSL2." + else + echo "[ollama] WARNING: Ollama not reachable on localhost:11434." + echo "[ollama] Start Ollama on Windows and ensure it listens on all interfaces." + echo "[ollama] Set OLLAMA_HOST=0.0.0.0 in Windows environment variables if needed." + fi +else + if ! command -v ollama &> /dev/null; then + echo "[ollama] Installing Ollama..." + curl -fsSL https://ollama.com/install.sh | sh + else + echo "[ollama] Ollama already installed." + fi + + if ! curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then + echo "[ollama] Starting Ollama server..." + ollama serve & + sleep 3 + fi +fi + +# --------------------------------------------------------------------------- +# 2. 
Pull vision model +# --------------------------------------------------------------------------- +echo "[model] Pulling qwen2.5-vl:32b (this may take a while on first run)..." +if $IS_WSL; then + echo "[model] Run 'ollama pull qwen2.5-vl:32b' on your Windows host if not already pulled." +else + ollama pull qwen2.5-vl:32b +fi + +# --------------------------------------------------------------------------- +# 3. Create directory structure +# --------------------------------------------------------------------------- +echo "[dirs] Creating project directories..." +mkdir -p results/screenshots results/videos + +# --------------------------------------------------------------------------- +# 4. Environment file +# --------------------------------------------------------------------------- +if [ ! -f .env ]; then + echo "[env] Creating .env from .env.example..." + cp .env.example .env + echo "[env] Edit .env to set your APP_BASE_URL and other overrides." +else + echo "[env] .env already exists, skipping." +fi + +# --------------------------------------------------------------------------- +# 5. Python dependencies +# --------------------------------------------------------------------------- +echo "[python] Installing Python dependencies..." +pip install -r requirements.txt + +# --------------------------------------------------------------------------- +# 6. Docker stack +# --------------------------------------------------------------------------- +echo "[docker] Starting Skyvern + PostgreSQL..." +docker compose up -d + +echo "" +echo "=== Setup complete ===" +echo " Ollama: http://localhost:11434" +echo " Skyvern: http://localhost:8000" +echo "" +echo "Next steps:" +echo " 1. Edit .env if needed (APP_BASE_URL, model choice, etc.)" +echo " 2. Run smoke test: python -m runner.runner --tag smoke" +echo " 3. 
Run full suite: python -m runner.runner" +echo "" diff --git a/automated_testing/test_matrix.yaml b/automated_testing/test_matrix.yaml new file mode 100644 index 0000000000..948791ed5b --- /dev/null +++ b/automated_testing/test_matrix.yaml @@ -0,0 +1,2807 @@ +# ============================================================================= +# Gleec Wallet — Skyvern Automated Test Matrix (Web Only) +# ============================================================================= +# +# Converted from GLEEC_WALLET_MANUAL_TEST_CASES.md for the Skyvern + Ollama +# vision-based QA runner. Only Grade-A (fully automatable) and Grade-B +# (partially automatable UI steps) tests are included. +# +# Platform scope: Web (Chrome) — Flutter canvas-rendered application. +# Test assets: DOC/MARTY testnet coins via in-app faucet. +# +# IMPORTANT: Before running, fill in the values under `test_data` with +# your actual environment-specific addresses, seeds, and credentials. +# +# ============================================================================= + +config: + base_url: "https://app.gleecwallet.com" # Or your QA staging URL + default_engine: "skyvern-2.0" + max_steps_per_test: 30 + timeout_per_test: 180 + screenshot_on_complete: true + video_recording: true + +# --------------------------------------------------------------------------- +# TEST DATA — Fill these in with your QA environment values +# --------------------------------------------------------------------------- +test_data: + wallet_password: "QaTestPass!2026" + wallet_password_weak: "abc" + import_seed_12: "abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon about" + doc_recipient_address: "RPDPE1XqGuHXSJn9q6VAaGDoRVMEwAYjT3" + marty_recipient_address: "RQCyFA4cAyjpfzcCGnxNxC4YTQsRDVPzec" + invalid_address: "RThisIsNotAValidAddress12345" + wrong_network_address: "0x742d35Cc6634C0532925a3b844Bc9e7595f2bD18" + evm_token_contract: "0xReplaceMeWithTestTokenContract" # Replace with a 
real testnet ERC-20 contract for your environment + self_address: "WILL_BE_CAPTURED_DURING_TEST" + alt_doc_address: "RKXzCCaT5ukqnyJBKTr9KyEpCBHR8itEFd" + alt_marty_address: "RD8WeYCaBQSx9e6mH5hX51uZ5FxNyirawj" + +# --------------------------------------------------------------------------- +# SETUP — Runs before each test to ensure logged-in state +# --------------------------------------------------------------------------- +setup: + prompt: | + This is a Flutter web app rendered on an HTML canvas. Identify all elements visually. + Wait 3 seconds for the app to fully load after the page appears. + + If you see a login screen with a password field: + Enter the password 'QaTestPass!2026' in the password field. + Click the login or unlock button. + Wait until a dashboard or home screen with coin balances or a wallet overview appears. + + If you see a welcome screen with 'Create Wallet' and 'Import Wallet' buttons: + COMPLETE — the app is at the initial state and no login is needed for this test. + + If you already see a dashboard with coin balances: + COMPLETE — already logged in. + + COMPLETE when the dashboard or wallet overview is visible. + success_criteria: "App is either on the dashboard (logged in) or the welcome screen" + +# ============================================================================= +# TESTS — Ordered by execution dependency and priority +# ============================================================================= +tests: + + # =========================================================================== + # PHASE 1: AUTH + WALLET LIFECYCLE (P0) + # =========================================================================== + + - id: GW-AUTO-AUTH-001a + name: "Create wallet — password and seed display" + source_manual_id: GW-MAN-AUTH-001 + tags: [auth, critical, smoke, p0] + max_steps: 25 + timeout: 240 + steps: + - action: | + Look for a button labeled 'Create Wallet' or 'Create New Wallet' on the welcome screen. + Click it. 
+ checkpoint: "A password entry form appears with at least one password input field" + - action: | + Enter 'QaTestPass!2026' into the password field. + If there is a 'Confirm Password' or second password field, enter the same password there. + Click the continue, next, or create button. + checkpoint: "A seed phrase screen appears showing 12 or 24 words arranged in a grid or numbered list" + - action: | + Read all the seed words displayed on screen. Do NOT click continue or next yet. + Look for any warning text about backing up the seed phrase. + checkpoint: "Seed words are visible on screen with a warning about backup" + - action: | + Look for a 'Continue', 'Next', 'I have backed up', or similar button. + If there is a checkbox saying 'I have backed up my seed' or similar, check it first. + Then click the continue/next button. + checkpoint: "Either a seed confirmation challenge appears (asking to select words in order) or the dashboard loads" + expected_result: "Wallet creation reaches seed display and progresses to confirmation or dashboard" + extraction_schema: + type: object + properties: + seed_words_visible: + type: boolean + description: "Whether seed words were displayed on screen" + seed_word_count: + type: integer + description: "Number of seed words shown (12 or 24)" + reached_confirmation_or_dashboard: + type: boolean + description: "Whether the flow progressed past seed display" + + - id: GW-AUTO-AUTH-001b + name: "Create wallet — complete seed confirmation" + source_manual_id: GW-MAN-AUTH-001 + tags: [auth, critical, smoke, p0] + max_steps: 30 + timeout: 300 + prompt: | + You are completing the wallet creation flow for a Gleec Wallet. + If you see a seed confirmation challenge (asking you to select or enter specific seed words + in a certain order), complete it by clicking the correct words in the correct order. + The words should be visible on screen as clickable chips or buttons. 
+ + If you see the dashboard already, COMPLETE — no confirmation was needed. + + After completing the confirmation, look for a 'Done', 'Finish', or similar button and click it. + + COMPLETE when the main dashboard or home screen appears showing coin balances, + a wallet overview, or a coin list. + expected_result: "Dashboard is visible after completing seed confirmation" + extraction_schema: + type: object + properties: + dashboard_visible: + type: boolean + description: "Whether the dashboard/home screen loaded" + confirmation_challenge_present: + type: boolean + description: "Whether a seed confirmation challenge was shown" + + - id: GW-AUTO-AUTH-003 + name: "Import wallet from valid 12-word seed" + source_manual_id: GW-MAN-AUTH-003 + tags: [auth, critical, p0] + max_steps: 30 + timeout: 300 + steps: + - action: | + If on the welcome screen, click 'Import Wallet' or 'Restore Wallet'. + If on the dashboard already, open the wallet manager (usually in a sidebar, + top-left menu, or account icon), and look for 'Import' or 'Add Wallet' > 'Import'. + checkpoint: "A seed phrase input screen appears with text fields or a text area for entering words" + - action: | + Enter the following 12-word seed phrase into the input fields: + abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon about + + If there are 12 separate input fields, enter one word per field in order. + If there is a single text area, paste the entire phrase with spaces between words. + checkpoint: "All 12 seed words are entered in the input fields" + - action: | + If a password field is shown, enter 'QaTestPass!2026' as the password. + If a confirm password field is shown, enter the same password again. + Click the 'Import', 'Restore', 'Continue', or 'Submit' button. + checkpoint: "Either a loading/syncing indicator appears or the dashboard loads" + - action: | + Wait for the import to complete. Look for the dashboard or home screen. 
+ If you see a coin list or wallet overview, the import succeeded. + checkpoint: "The dashboard is visible with a coin list or wallet overview" + expected_result: "Wallet imported successfully and dashboard shows with coin list" + extraction_schema: + type: object + properties: + import_succeeded: + type: boolean + description: "Whether the wallet import completed" + dashboard_visible: + type: boolean + description: "Whether the dashboard loaded after import" + + - id: GW-AUTO-AUTH-004 + name: "Invalid password attempts and lockout feedback" + source_manual_id: GW-MAN-AUTH-004 + tags: [auth, critical, security, p0] + max_steps: 20 + prompt: | + You should be on the login/unlock screen with a password field. + If not, look for a logout option in settings and log out first. + + Enter the wrong password 'WrongPassword1' and click the login/unlock button. + Observe any error message that appears. + + Enter another wrong password 'WrongPassword2' and click login again. + Observe any updated error message or lockout/cooldown indicator. + + Enter a third wrong password 'WrongPassword3' and click login. + Look for any lockout message, cooldown timer, or additional security warning. + + COMPLETE after the third failed attempt. Report all error messages you see. 
+ expected_result: "Error messages shown for wrong passwords; possible lockout after multiple attempts" + extraction_schema: + type: object + properties: + error_messages: + type: array + items: + type: string + description: "All error messages shown during failed login attempts" + lockout_triggered: + type: boolean + description: "Whether a lockout or cooldown was triggered" + lockout_message: + type: string + description: "Text of any lockout/cooldown message shown" + + # =========================================================================== + # PHASE 2: WALLET MANAGER (P0/P1) + # =========================================================================== + + - id: GW-AUTO-WAL-001 + name: "Create second wallet + rename + switch" + source_manual_id: GW-MAN-WAL-001 + tags: [wallet, critical, p0] + max_steps: 30 + steps: + - action: | + From the dashboard, open the wallet manager. Look for a wallet icon, + account selector, sidebar menu, or dropdown in the top area of the screen. + Click it to open the wallet list or wallet management area. + checkpoint: "A wallet list or wallet manager panel is visible" + - action: | + Look for a 'Create Wallet', 'Add Wallet', or '+' button and click it. + If prompted, choose 'Create New' (not import). + Enter a password if required ('QaTestPass!2026'). + Complete any seed backup flow shown (read words, confirm if needed). + checkpoint: "A second wallet has been created and appears in the wallet list" + - action: | + Look for a rename, edit, or pencil icon next to one of the wallet names. + If found, click it and rename the wallet to 'QA-Wallet-Primary'. + If no inline rename exists, look in a wallet settings/detail screen. + checkpoint: "One wallet is now named 'QA-Wallet-Primary' or a rename action was attempted" + - action: | + Switch between the wallets by clicking on the other wallet in the list. + Observe that the dashboard content or coin balances change to reflect the selected wallet. 
+ checkpoint: "The dashboard or balance display changed after switching wallets" + expected_result: "Two wallets exist, rename works, switching updates dashboard context" + extraction_schema: + type: object + properties: + second_wallet_created: + type: boolean + rename_successful: + type: boolean + switch_changed_content: + type: boolean + wallet_count: + type: integer + + - id: GW-AUTO-WAL-002 + name: "Delete wallet with cancel and confirm" + source_manual_id: GW-MAN-WAL-002 + tags: [wallet, critical, security, p0] + max_steps: 25 + steps: + - action: | + Open the wallet manager. You should see at least two wallets. + Look for a delete, trash, or remove icon/option on the non-active wallet. + Click it. + checkpoint: "A confirmation dialog appears asking to confirm wallet deletion" + - action: | + Click 'Cancel', 'No', or the X button to dismiss the confirmation dialog. + Verify the wallet is still listed in the wallet manager. + checkpoint: "The wallet is still present in the wallet list after canceling" + - action: | + Click the delete/remove option again for the same wallet. + This time click 'Confirm', 'Delete', or 'Yes' to proceed. + checkpoint: "The wallet has been removed from the list" + expected_result: "Cancel preserves wallet; confirm removes it" + extraction_schema: + type: object + properties: + cancel_preserved_wallet: + type: boolean + confirm_deleted_wallet: + type: boolean + remaining_wallet_count: + type: integer + + # =========================================================================== + # PHASE 3: COIN MANAGEMENT + DASHBOARD (P1) + # =========================================================================== + + - id: GW-AUTO-COIN-001 + name: "Enable test coins and activate DOC/MARTY" + source_manual_id: GW-MAN-COIN-001 + tags: [coin, smoke, p1, prerequisite] + max_steps: 25 + steps: + - action: | + Navigate to Settings. Look for a gear icon, 'Settings' in the sidebar menu, + or an account/settings option in the navigation. 
+ checkpoint: "The settings page or settings menu is visible" + - action: | + Look for a toggle or switch labeled 'Test Coins', 'Show Test Coins', + 'Enable Test Coins', or 'Testnet'. Enable/turn on this toggle. + If it is already enabled, proceed. + checkpoint: "Test coins toggle is in the ON/enabled state" + - action: | + Navigate to the Coin Manager, Coins list, or 'Add Coins' section. + Search for 'DOC' using any search or filter bar. + If DOC appears, activate/enable it by clicking its toggle or add button. + checkpoint: "DOC coin is now activated and appears in the active coins list" + - action: | + Search for 'MARTY' in the same coin manager. + Activate/enable MARTY. + checkpoint: "MARTY coin is now activated and appears in the active coins list" + expected_result: "Test coins toggle is on; DOC and MARTY are both activated" + extraction_schema: + type: object + properties: + test_coins_enabled: + type: boolean + doc_activated: + type: boolean + marty_activated: + type: boolean + + - id: GW-AUTO-COIN-002 + name: "Search, activate, deactivate coin with filter" + source_manual_id: GW-MAN-COIN-002 + tags: [coin, p1] + max_steps: 20 + prompt: | + Navigate to the Coin Manager or coin list from the dashboard. + + If there is a search or filter bar, type 'MARTY' in it. + Verify that MARTY appears in the results. + + If MARTY is currently deactivated/disabled, click its toggle or add button to activate it. + If there is an 'active only' or 'enabled only' filter option, select it. + Verify MARTY appears in the filtered list. + + Now deactivate MARTY by clicking its toggle or remove button. + Verify MARTY is no longer in the 'active only' filtered list. + Clear any filters. + + COMPLETE when you have confirmed MARTY can be activated and deactivated and + the filter reflects the current state. 
+ expected_result: "MARTY activates/deactivates correctly and filters reflect state" + extraction_schema: + type: object + properties: + search_found_marty: + type: boolean + marty_toggleable: + type: boolean + filter_reflected_state: + type: boolean + + - id: GW-AUTO-DASH-001 + name: "Hide balances and hide zero balances toggles" + source_manual_id: GW-MAN-DASH-001 + tags: [dashboard, p1, smoke] + max_steps: 20 + prompt: | + Navigate to the main dashboard or wallet overview. + You should see a list of coins with balance amounts. + + Look for a 'Hide Balances' toggle, eye icon, or privacy button on the dashboard. + Click it to hide/mask balances. + Verify that balance amounts are replaced with asterisks, dots, dashes, or a hidden indicator. + + Now look for a 'Hide Zero Balances' toggle or similar option. + If found, enable it. + Verify that coins with 0 balance are no longer shown in the list. + + Restore both toggles to their original state (show balances, show all coins). + + COMPLETE when both toggles have been tested and restored. + expected_result: "Balances can be hidden/shown; zero-balance coins can be filtered" + extraction_schema: + type: object + properties: + hide_balances_works: + type: boolean + balances_masked_indicator: + type: string + description: "What replaced the balance values (e.g., '***', '---', etc.)" + hide_zero_balances_works: + type: boolean + + # =========================================================================== + # PHASE 4: FAUCET FUNDING (P0 — prerequisite for money movement tests) + # =========================================================================== + + - id: GW-AUTO-SEND-001 + name: "Faucet funding for DOC and MARTY" + source_manual_id: GW-MAN-SEND-001 + tags: [send, critical, smoke, p0, prerequisite] + max_steps: 25 + timeout: 240 + steps: + - action: | + Navigate to the DOC coin page by clicking on 'DOC' in the dashboard coin list + or navigating to the DOC details/overview screen. 
+ checkpoint: "The DOC coin detail page is visible showing the DOC balance and/or address" + - action: | + Look for a 'Faucet', 'Get Test Coins', 'Request Funds', or tap icon that + triggers the in-app faucet for DOC. Click it. + Wait 5 seconds for the response. + checkpoint: "A success message, toast, or snackbar appears confirming the faucet request" + - action: | + Navigate back to the dashboard, then open the MARTY coin page by clicking + on 'MARTY' in the coin list. + checkpoint: "The MARTY coin detail page is visible" + - action: | + Look for the same faucet/request button on the MARTY page. Click it. + Wait 5 seconds for the response. + checkpoint: "A success message appears for the MARTY faucet request" + expected_result: "Both DOC and MARTY faucet requests succeeded" + extraction_schema: + type: object + properties: + doc_faucet_success: + type: boolean + doc_faucet_message: + type: string + marty_faucet_success: + type: boolean + marty_faucet_message: + type: string + + - id: GW-AUTO-SEND-002a + name: "Faucet cooldown/denied handling" + source_manual_id: GW-MAN-SEND-002 + tags: [send, critical, p0] + max_steps: 15 + prompt: | + Navigate to the DOC coin page. + Look for the faucet/request button and click it again immediately. + You should see a cooldown, denied, rate-limited, or 'already requested' message. + + Report the exact text of any message shown. + + COMPLETE when you have observed the cooldown/denied response. 
+ expected_result: "Cooldown or denied message is shown for rapid repeat faucet request" + extraction_schema: + type: object + properties: + cooldown_message_shown: + type: boolean + message_text: + type: string + + # =========================================================================== + # PHASE 5: SEND / WITHDRAW (P0) + # =========================================================================== + + - id: GW-AUTO-SEND-003 + name: "DOC send happy path" + source_manual_id: GW-MAN-SEND-003 + tags: [send, critical, smoke, p0] + max_steps: 25 + timeout: 240 + steps: + - action: | + Navigate to the DOC coin page and find the 'Send', 'Withdraw', or arrow-up button. + Click it to open the send form. + checkpoint: "A send form is visible with fields for recipient address and amount" + - action: | + Enter the recipient address 'RPDPE1XqGuHXSJn9q6VAaGDoRVMEwAYjT3' into the address/recipient field. + Enter the amount '0.01' into the amount field. + checkpoint: "Both fields are filled with the entered values" + - action: | + Click 'Next', 'Preview', 'Review', or 'Continue' to see the transaction summary. + Look for a confirmation screen showing the amount, recipient, and fee. + checkpoint: "A transaction summary or confirmation screen is visible showing amount, address, and fee" + - action: | + Click 'Confirm', 'Send', or 'Submit' to broadcast the transaction. + Wait 5 seconds for confirmation feedback. 
+ checkpoint: "A success message, pending transaction indicator, or transaction hash is shown" + expected_result: "Transaction submitted with success confirmation and visible in pending/history" + extraction_schema: + type: object + properties: + transaction_submitted: + type: boolean + success_or_pending_message: + type: string + fee_displayed: + type: string + transaction_hash: + type: string + description: "Transaction hash if visible" + + - id: GW-AUTO-SEND-004 + name: "Address validation — invalid and wrong-network" + source_manual_id: GW-MAN-SEND-004 + tags: [send, critical, p0] + max_steps: 20 + steps: + - action: | + Navigate to DOC send screen. Enter the invalid address 'RThisIsNotAValidAddress12345' + in the recipient field. Enter amount '0.01'. Click send/next/continue. + checkpoint: "A red error message appears mentioning invalid address, wrong format, or similar" + - action: | + Clear the address field. Enter the Ethereum address '0x742d35Cc6634C0532925a3b844Bc9e7595f2bD18' + which is a wrong-network address for DOC. Click send/next/continue. + checkpoint: "An error message appears about unsupported address format or wrong network" + expected_result: "Both invalid and wrong-network addresses are blocked with error messages" + extraction_schema: + type: object + properties: + invalid_address_blocked: + type: boolean + invalid_address_error: + type: string + wrong_network_blocked: + type: boolean + wrong_network_error: + type: string + + - id: GW-AUTO-SEND-005 + name: "Amount validation — zero, too small, exceeds balance" + source_manual_id: GW-MAN-SEND-005 + tags: [send, critical, boundary, p0] + max_steps: 25 + steps: + - action: | + Navigate to DOC send screen. Enter a valid DOC address in the recipient field. + Enter '0' as the amount. Click send/next/continue. + checkpoint: "An error message appears about amount being zero or must be greater than zero" + - action: | + Clear the amount field. Enter '0.000000001' as the amount (extremely small). 
+ Click send/next/continue. + checkpoint: "An error message about minimum amount or insufficient value appears" + - action: | + Clear the amount field. Enter '999999999' as the amount (far exceeds balance). + Click send/next/continue. + checkpoint: "An error about insufficient funds or balance exceeded appears" + expected_result: "Zero, sub-minimum, and over-balance amounts all blocked with errors" + extraction_schema: + type: object + properties: + zero_amount_error: + type: string + tiny_amount_error: + type: string + exceeds_balance_error: + type: string + + # =========================================================================== + # PHASE 6: DEX (P0) + # =========================================================================== + + - id: GW-AUTO-DEX-001 + name: "Maker limit order creation" + source_manual_id: GW-MAN-DEX-001 + tags: [dex, critical, smoke, p0] + max_steps: 25 + steps: + - action: | + Navigate to the DEX section from the main menu or sidebar. + Look for 'DEX', 'Exchange', 'Trade', or 'Swap' in the navigation. + checkpoint: "The DEX trading interface is visible with pair selectors and order forms" + - action: | + Select the trading pair DOC/MARTY. Look for a pair selector or dropdown + and choose DOC as the base and MARTY as the quote (or vice versa if that is how they are listed). + checkpoint: "DOC/MARTY pair is selected and the orderbook or trading form reflects this pair" + - action: | + Look for a 'Maker', 'Limit', or 'Place Order' option. + Enter a price of '1' (1 DOC = 1 MARTY or similar). + Enter an amount of '0.1' in the amount field. + checkpoint: "Price and amount fields are filled" + - action: | + Click 'Place Order', 'Submit', 'Create Order', or similar button. + Wait 5 seconds for confirmation. 
+ checkpoint: "Order creation is confirmed with a success message or the order appears in an 'Open Orders' section" + expected_result: "Maker order created and visible in open orders" + extraction_schema: + type: object + properties: + order_created: + type: boolean + order_visible_in_list: + type: boolean + order_details: + type: object + properties: + pair: + type: string + price: + type: string + amount: + type: string + + - id: GW-AUTO-DEX-003 + name: "DEX validation — invalid inputs" + source_manual_id: GW-MAN-DEX-003 + tags: [dex, critical, boundary, p0] + max_steps: 20 + steps: + - action: | + On the DEX trading screen with DOC/MARTY pair selected, enter '0' as the amount. + Click the submit/place order button. + checkpoint: "An error message about zero or invalid amount appears" + - action: | + Clear the amount field. Enter '999999999' as the amount (exceeds available balance). + Click submit/place order. + checkpoint: "An error about insufficient funds or balance appears" + expected_result: "Zero and over-balance DEX orders are blocked with validation messages" + extraction_schema: + type: object + properties: + zero_amount_blocked: + type: boolean + zero_amount_error: + type: string + insufficient_funds_blocked: + type: boolean + insufficient_funds_error: + type: string + + # =========================================================================== + # PHASE 7: BRIDGE (P0) + # =========================================================================== + + - id: GW-AUTO-BRDG-001 + name: "Bridge transfer happy path" + source_manual_id: GW-MAN-BRDG-001 + tags: [bridge, critical, smoke, p0] + max_steps: 25 + timeout: 240 + steps: + - action: | + Navigate to the Bridge section from the main menu or sidebar. + Look for 'Bridge', 'Cross-chain', or a bridge icon in the navigation. + checkpoint: "The bridge interface is visible with source/destination selectors" + - action: | + Select a supported source coin and destination. 
If DOC has a bridge route, + select DOC as source. Look for available destination chains/coins and select one. + Enter a valid amount like '0.1'. + checkpoint: "Source, destination, and amount are all filled in" + - action: | + If a destination address field is shown, enter a valid recipient address. + Click 'Preview', 'Next', or 'Review' to see the bridge transfer summary. + checkpoint: "A summary showing amount, fees, estimated time, and route is displayed" + - action: | + Click 'Confirm', 'Bridge', or 'Submit' to initiate the bridge transfer. + Wait 5 seconds for confirmation. + checkpoint: "A success confirmation, pending status, or bridge tracking screen appears" + expected_result: "Bridge transfer initiated with confirmation" + extraction_schema: + type: object + properties: + bridge_initiated: + type: boolean + status_message: + type: string + fee_displayed: + type: string + + - id: GW-AUTO-BRDG-002 + name: "Unsupported bridge pair validation" + source_manual_id: GW-MAN-BRDG-002 + tags: [bridge, critical, p0] + max_steps: 15 + prompt: | + Navigate to the Bridge section. + Try to select a combination of source and destination that is not supported. + Look for grayed-out options, 'not available', 'unsupported', or similar indicators. + + If all combinations appear available, try entering an amount and see if any + pair-specific validation message appears. + + COMPLETE when you have identified how the bridge handles unsupported pairs. + expected_result: "Unsupported pairs are blocked or marked unavailable" + extraction_schema: + type: object + properties: + unsupported_handling_exists: + type: boolean + blocking_message: + type: string + + - id: GW-AUTO-BRDG-003 + name: "Bridge amount boundaries" + source_manual_id: GW-MAN-BRDG-003 + tags: [bridge, critical, boundary, p0] + max_steps: 20 + prompt: | + Navigate to the Bridge section and select a supported pair. + + Enter an extremely small amount like '0.0000001' and observe if a minimum + amount error appears. 
+ + Clear and enter '999999999' (exceeds balance) and observe the error. + + Look for any displayed minimum or maximum amount on the bridge form + and report those values. + + COMPLETE after testing both boundaries. + expected_result: "Below-min and above-balance bridge amounts are blocked" + extraction_schema: + type: object + properties: + min_amount_error: + type: string + exceeds_balance_error: + type: string + displayed_min_amount: + type: string + displayed_max_amount: + type: string + + # =========================================================================== + # PHASE 8: NFT (P1) + # =========================================================================== + + - id: GW-AUTO-NFT-001 + name: "NFT list, details, and history" + source_manual_id: GW-MAN-NFT-001 + tags: [nft, p1] + max_steps: 20 + prompt: | + Navigate to the NFT section from the main menu or sidebar. + + If NFT is disabled or hidden, COMPLETE and report that NFT is not available. + + If the NFT section opens, observe the NFT list. Report how many NFTs are shown. + If any NFTs exist, click on the first one to open its detail page. + Look for name, image, collection, and any history or transaction entries. + Navigate back to the NFT list. + + If filters are available (collection, status, date), try applying one. + + COMPLETE after viewing the NFT list and at least one detail page (if items exist). 
+ expected_result: "NFT section accessible with list and detail views" + extraction_schema: + type: object + properties: + nft_section_available: + type: boolean + nft_count: + type: integer + detail_view_loaded: + type: boolean + filters_available: + type: boolean + + # =========================================================================== + # PHASE 9: SETTINGS (P1) + # =========================================================================== + + - id: GW-AUTO-SET-002 + name: "Analytics/privacy toggles" + source_manual_id: GW-MAN-SET-002 + tags: [settings, security, p1] + max_steps: 15 + prompt: | + Navigate to Settings. + Look for any toggles related to 'Analytics', 'Diagnostics', 'Usage Data', + 'Privacy', or 'Telemetry'. + + Toggle each one off then on (or on then off) and verify the toggle state updates + visually when clicked. + + Report the names and current states of all privacy-related toggles found. + + COMPLETE after toggling available privacy settings. + expected_result: "Privacy/analytics toggles are interactive and state changes visually" + extraction_schema: + type: object + properties: + toggles_found: + type: array + items: + type: object + properties: + toggle_name: + type: string + responds_to_click: + type: boolean + + - id: GW-AUTO-SET-003 + name: "Test coin toggle impact" + source_manual_id: GW-MAN-SET-003 + tags: [settings, smoke, p1] + max_steps: 20 + steps: + - action: | + Navigate to Settings and find the 'Test Coins' toggle. + Turn it OFF. + checkpoint: "Test coins toggle is in the OFF/disabled state" + - action: | + Navigate to the Coin Manager or coins list. + Search for 'DOC'. Observe whether DOC appears in the list. + checkpoint: "DOC is NOT visible in the coin list when test coins are disabled" + - action: | + Go back to Settings and turn the Test Coins toggle ON. + checkpoint: "Test coins toggle is back in the ON/enabled state" + - action: | + Return to the Coin Manager and search for 'DOC' again. 
+ checkpoint: "DOC is now visible in the coin list" + expected_result: "DOC visibility toggles with the test coins setting" + extraction_schema: + type: object + properties: + doc_hidden_when_off: + type: boolean + doc_visible_when_on: + type: boolean + + # =========================================================================== + # PHASE 10: NAVIGATION (P1) + # =========================================================================== + + - id: GW-AUTO-NAV-001 + name: "Main menu route integrity" + source_manual_id: GW-MAN-NAV-001 + tags: [navigation, smoke, p1] + max_steps: 30 + prompt: | + Starting from the dashboard/home screen, visit every main navigation item + one by one. Look for menu items or sidebar links such as: + Dashboard, Wallet, DEX/Exchange, Bridge, NFT, Settings, Market Maker/Bot, Fiat. + + For each item: + 1. Click it + 2. Wait 3 seconds for the page to load + 3. Note whether content loaded or an error/blank screen appeared + 4. Use the browser back button or app back arrow to return + + COMPLETE after visiting all available navigation items. + expected_result: "All main navigation items load content without errors" + extraction_schema: + type: object + properties: + tabs_visited: + type: array + items: + type: object + properties: + tab_name: + type: string + loaded_successfully: + type: boolean + error_visible: + type: boolean + + - id: GW-AUTO-NAV-003 + name: "Unsaved changes prompt on form exit" + source_manual_id: GW-MAN-NAV-003 + tags: [navigation, p2] + max_steps: 20 + prompt: | + Navigate to the DOC send screen. + Enter some text in the recipient address field (like 'test') and an amount ('0.5'). + Do NOT click send/confirm. + + Now click the browser back button or a navigation menu item to leave the page. + + If a dialog appears asking 'Discard changes?', 'Leave this page?', or similar: + Click 'Stay', 'Cancel', or 'No' to remain on the page. + Verify your entered data is still present. 
+ Then try to navigate away again and this time click 'Discard', 'Leave', or 'Yes'. + + If no dialog appears, note that as well. + + COMPLETE after testing the unsaved changes behavior. + expected_result: "Unsaved changes dialog appears and respects Stay/Discard choices" + extraction_schema: + type: object + properties: + dialog_appeared: + type: boolean + stay_preserved_data: + type: boolean + discard_exited_page: + type: boolean + + # =========================================================================== + # PHASE 11: MARKET MAKER BOT (P1) + # =========================================================================== + + - id: GW-AUTO-BOT-001 + name: "Create and start market maker bot" + source_manual_id: GW-MAN-BOT-001 + tags: [bot, p1] + max_steps: 25 + steps: + - action: | + Navigate to the Market Maker Bot section from the menu. + If the bot feature is disabled or hidden, COMPLETE and report it as unavailable. + checkpoint: "Bot management interface is visible with a create/add option" + - action: | + Click 'Create Bot', 'Add Bot', 'New', or similar button. + Select the DOC/MARTY pair. + Enter a spread value like '5' or '5%'. + Enter a volume or amount like '0.1'. + checkpoint: "Bot configuration form is filled with pair, spread, and volume" + - action: | + Click 'Save', 'Create', or 'Start' to create the bot. + If separate save and start actions exist, save first then start. + checkpoint: "The bot appears in the bot list with a 'Running', 'Active', or similar status" + expected_result: "Bot created and started with active status" + extraction_schema: + type: object + properties: + bot_feature_available: + type: boolean + bot_created: + type: boolean + bot_status: + type: string + + - id: GW-AUTO-BOT-002 + name: "Bot validation — invalid config" + source_manual_id: GW-MAN-BOT-002 + tags: [bot, boundary, p1] + max_steps: 15 + prompt: | + Navigate to the Bot creation form. + + Try entering a spread of '0' or '-1' and observe any validation error. 
+ Try entering a volume/amount of '0' and observe any validation error. + Try submitting without selecting a pair (if possible). + + Report all validation messages seen. + + COMPLETE after testing invalid bot configuration inputs. + expected_result: "Invalid bot config values are blocked with validation messages" + extraction_schema: + type: object + properties: + invalid_spread_error: + type: string + invalid_volume_error: + type: string + missing_pair_error: + type: string + + # =========================================================================== + # PHASE 12: FIAT ON-RAMP (P0) + # =========================================================================== + + - id: GW-AUTO-FIAT-001 + name: "Fiat menu access and connect-wallet gating" + source_manual_id: GW-MAN-FIAT-001 + tags: [fiat, smoke, p0] + max_steps: 15 + prompt: | + Navigate to the Fiat section from the main menu. + Look for 'Fiat', 'Buy Crypto', 'Buy', or similar navigation item. + + If the Fiat section loads, observe whether form fields are enabled. + Look for a currency selector, crypto asset selector, and amount field. + + If a 'Connect Wallet' gate is shown, report it. + + COMPLETE when you have observed the Fiat section state. + expected_result: "Fiat section is accessible and form state reflects auth state" + extraction_schema: + type: object + properties: + fiat_section_available: + type: boolean + form_fields_enabled: + type: boolean + connect_wallet_gate_shown: + type: boolean + + - id: GW-AUTO-FIAT-002 + name: "Fiat form validation — boundary amounts" + source_manual_id: GW-MAN-FIAT-002 + tags: [fiat, critical, boundary, p0] + max_steps: 20 + prompt: | + Navigate to the Fiat section. If fields are available: + + Select a fiat currency (e.g., USD or EUR) and a crypto asset. + Enter '0.01' (likely below minimum) in the amount field and observe validation. + Enter '999999999' (above maximum) and observe validation. + Enter a valid amount like '50' and verify the form allows progression. 
+ + If payment method options exist, switch between them. + + COMPLETE after testing boundary amounts. + expected_result: "Below-min and above-max fiat amounts are rejected with messages" + extraction_schema: + type: object + properties: + below_min_error: + type: string + above_max_error: + type: string + valid_amount_accepted: + type: boolean + + # =========================================================================== + # PHASE 13: SECURITY SETTINGS (P0) + # =========================================================================== + + - id: GW-AUTO-SECX-002 + name: "Seed backup show and confirm lifecycle" + source_manual_id: GW-MAN-SECX-002 + tags: [security, critical, p0] + max_steps: 25 + steps: + - action: | + Navigate to Settings > Security. Look for 'Seed Phrase', 'Backup', + 'View Seed', or 'Recovery Phrase' option. + checkpoint: "A seed/backup option is visible in security settings" + - action: | + Click on the seed/backup option. If a password prompt appears, + enter 'QaTestPass!2026' and submit. + checkpoint: "Seed words are displayed on screen (12 or 24 words)" + - action: | + If a 'Confirm Backup' or seed verification challenge is available, attempt it. + If the flow allows navigating away, go back to settings. + checkpoint: "Seed backup flow completed or navigation returned to settings" + expected_result: "Seed is protected behind password and displays correctly" + extraction_schema: + type: object + properties: + password_required: + type: boolean + seed_displayed: + type: boolean + seed_word_count: + type: integer + + - id: GW-AUTO-SECX-004 + name: "Change password flow" + source_manual_id: GW-MAN-SECX-004 + tags: [security, critical, p0] + max_steps: 25 + steps: + - action: | + Navigate to Settings > Security > Change Password. + Enter the wrong current password 'WrongOldPass' and a new password 'NewPass2026!'. + Click submit/save. + checkpoint: "An error message appears about incorrect current password" + - action: | + Clear the fields. 
Enter the correct current password 'QaTestPass!2026'. + Enter new password 'NewPass2026!' and confirm it. + Click submit/save. + checkpoint: "A success message appears confirming the password has been changed" + - action: | + IMPORTANT: Change the password back. Go to Change Password again. + Enter current password 'NewPass2026!' and new password 'QaTestPass!2026'. + Confirm and submit. + checkpoint: "Password has been reverted to the original for subsequent tests" + expected_result: "Wrong current password rejected; valid change succeeds; password reverted" + extraction_schema: + type: object + properties: + wrong_current_rejected: + type: boolean + password_changed: + type: boolean + password_reverted: + type: boolean + + # =========================================================================== + # PHASE 14: CUSTOM TOKEN (P1) + # =========================================================================== + + - id: GW-AUTO-CTOK-001 + name: "Import custom token happy path" + source_manual_id: GW-MAN-CTOK-001 + tags: [custom_token, p1] + max_steps: 20 + # NOTE: the contract address used below is a placeholder ('0xReplaceMe...'); + # substitute a real test token contract for the selected network before + # running this test, or the metadata-fetch checkpoint cannot pass. + steps: + - action: | + Navigate to the Coin Manager. Look for an 'Import Token', 'Add Custom Token', + or similar option. Click it. + checkpoint: "A custom token import form appears with network selector and contract address field" + - action: | + Select an EVM network (e.g., Ethereum, BSC, or whatever is available). + Enter the contract address '0xReplaceMeWithTestTokenContract' in the contract field. + Click 'Fetch', 'Search', 'Load', or similar. + checkpoint: "Token metadata appears showing name, symbol, and/or decimals" + - action: | + Click 'Import', 'Add', 'Confirm', or similar to add the token.
+ checkpoint: "Token is added and appears in the coin list" + expected_result: "Custom token imported and visible in coin list" + extraction_schema: + type: object + properties: + token_fetched: + type: boolean + token_name: + type: string + token_symbol: + type: string + token_imported: + type: boolean + + - id: GW-AUTO-CTOK-002 + name: "Custom token — invalid contract handling" + source_manual_id: GW-MAN-CTOK-002 + tags: [custom_token, p1] + max_steps: 15 + prompt: | + Navigate to the custom token import form. + Select an EVM network. + Enter 'NotARealContract123' as the contract address. + Click fetch/search/load. + + Observe the error message or not-found state. + Report the message shown. + + COMPLETE after observing the error handling. + expected_result: "Invalid contract shows error/not-found message" + extraction_schema: + type: object + properties: + error_shown: + type: boolean + error_message: + type: string + + # =========================================================================== + # PHASE 15: LOCALIZATION (P2) + # =========================================================================== + + - id: GW-AUTO-L10N-001 + name: "Translation completeness check" + source_manual_id: GW-MAN-L10N-001 + tags: [l10n, p2] + max_steps: 25 + prompt: | + Navigate to Settings and change the app language/locale to a non-English option + (e.g., the second language in the list). + + After the language changes, navigate to the Dashboard, then to the Send screen, + then to Settings. + + Look for any text that appears to be untranslated (still in English when + everything else is in the new language) or any text that looks like a raw + localization key (e.g., 'send.button.label' or 'error_invalid_address'). + + Report any untranslated strings found. + + Change the language back to English before completing. + + COMPLETE after checking three screens and reverting to English. 
+ expected_result: "No raw localization keys visible; all text appears translated" + extraction_schema: + type: object + properties: + language_changed_to: + type: string + untranslated_strings_found: + type: array + items: + type: string + language_reverted: + type: boolean + + # =========================================================================== + # PHASE 16: FEATURE GATING (P0/P1) + # =========================================================================== + + - id: GW-AUTO-GATE-001 + name: "Trading-disabled mode tooltips" + source_manual_id: GW-MAN-GATE-001 + tags: [gating, p0] + max_steps: 15 + prompt: | + Look at the main navigation/sidebar menu items. + Identify any menu items that appear disabled, grayed out, or marked with + a lock icon or 'disabled' indicator. + + For each disabled item, hover over it (move cursor to it) and look for + a tooltip or popup message explaining why it is disabled. + + Report which items are disabled and their tooltip text. + + If no items appear disabled, report that all navigation items are active. + + COMPLETE after checking all main navigation items. + expected_result: "Disabled features show explanatory tooltips; active features are accessible" + extraction_schema: + type: object + properties: + disabled_items: + type: array + items: + type: object + properties: + item_name: + type: string + tooltip_text: + type: string + all_active: + type: boolean + + # =========================================================================== + # PHASE 17: SUPPORT, FEEDBACK, MISC (P2) + # =========================================================================== + + - id: GW-AUTO-SUP-001 + name: "Support page content and links" + source_manual_id: GW-MAN-SUP-001 + tags: [support, p2] + max_steps: 15 + prompt: | + Navigate to the Support, Help, or FAQ section from settings or the main menu. + + Verify that content loads (FAQ items, support text, or help articles). 
+ Look for any 'Contact', 'Email', or external support link and note the URL target. + Look for a 'My Coins Missing' or similar help dialog and open it if available. + + COMPLETE when you have reviewed the support content. + expected_result: "Support page loads with content and functioning links" + extraction_schema: + type: object + properties: + support_page_loaded: + type: boolean + faq_content_visible: + type: boolean + contact_link_present: + type: boolean + missing_coins_dialog: + type: boolean + + - id: GW-AUTO-FEED-001 + name: "Feedback entry points" + source_manual_id: GW-MAN-FEED-001 + tags: [feedback, p2] + max_steps: 15 + prompt: | + Look for feedback entry points: + 1. A 'Feedback' option in the Settings menu + 2. A floating bug/feedback button on the screen (usually bottom-right corner) + + If you find either, click it to open the feedback form. + If a feedback form opens, observe its fields (text input, screenshot option). + Close it without submitting. + + COMPLETE after identifying available feedback entry points. + expected_result: "Feedback form accessible from at least one entry point" + extraction_schema: + type: object + properties: + settings_feedback_found: + type: boolean + floating_button_found: + type: boolean + feedback_form_opened: + type: boolean + + # =========================================================================== + # PHASE 18: ADVANCED SETTINGS + WALLET (P1/P2) + # =========================================================================== + + - id: GW-AUTO-SETX-001 + name: "Weak-password toggle enforcement" + source_manual_id: GW-MAN-SETX-001 + tags: [settings, security, p1] + max_steps: 20 + prompt: | + Navigate to Settings > Advanced or Security settings. + Look for a toggle labeled 'Allow Weak Password', 'Weak Password', or similar. + + If found, turn it OFF (disallow weak passwords). + Then attempt to create or import a wallet using the weak password 'abc'. 
+ Observe if the app blocks you with a password strength error. + + Then go back and turn the toggle ON (allow weak passwords). + Retry with the same weak password 'abc' and observe if it is now accepted. + + IMPORTANT: Cancel the wallet creation after testing — don't actually create + a wallet with a weak password. + + COMPLETE after testing both toggle states. + expected_result: "Weak password blocked when toggle is off, accepted when on" + extraction_schema: + type: object + properties: + weak_password_toggle_found: + type: boolean + blocked_when_off: + type: boolean + accepted_when_on: + type: boolean + + - id: GW-AUTO-SETX-007 + name: "Reset activated coins for wallet" + source_manual_id: GW-MAN-SETX-007 + tags: [settings, recovery, p2] + max_steps: 15 + prompt: | + Navigate to Settings > Advanced. + Look for 'Reset Activated Coins', 'Reset Coins', or similar option. + + If found, click it. A wallet selector or confirmation dialog should appear. + Click 'Cancel' or dismiss the dialog without confirming. + + Then open it again, select a wallet if prompted, and confirm the reset. + Observe the completion message. + + COMPLETE after testing cancel and confirm paths. + expected_result: "Reset operation has cancel/confirm safety and shows completion message" + extraction_schema: + type: object + properties: + reset_option_found: + type: boolean + cancel_preserved_state: + type: boolean + reset_completed: + type: boolean + completion_message: + type: string + + - id: GW-AUTO-WALX-001 + name: "Wallet overview cards and privacy toggle" + source_manual_id: GW-MAN-WALX-001 + tags: [wallet, p1] + max_steps: 15 + prompt: | + Navigate to the wallet overview or dashboard. + Look for overview cards showing: current balance, total investment, + profit/loss, or portfolio value. + + If a privacy/eye icon is visible near the cards, click it to toggle privacy mode. + Observe if the values are masked/hidden. + Click it again to reveal the values. 
+ + COMPLETE after testing the privacy toggle on overview cards. + expected_result: "Overview cards display and privacy toggle masks/reveals values" + extraction_schema: + type: object + properties: + overview_cards_visible: + type: boolean + card_types: + type: array + items: + type: string + privacy_toggle_works: + type: boolean + + - id: GW-AUTO-WADDR-001 + name: "Multi-address display and controls" + source_manual_id: GW-MAN-WADDR-001 + tags: [wallet, p1] + max_steps: 20 + prompt: | + Navigate to a coin detail page (e.g., DOC). + Look for an 'Addresses' section, tab, or expandable area. + + If multiple addresses are shown, try any available controls: + - 'Hide zero balance addresses' toggle + - 'Expand all' / 'Collapse all' button + - Copy button (click the copy icon next to an address) + - QR code button (click to view QR) + + Report which controls are available and whether they respond to clicks. + + COMPLETE after reviewing the address controls. + expected_result: "Address list displays with interactive controls" + extraction_schema: + type: object + properties: + multiple_addresses_shown: + type: boolean + address_count: + type: integer + controls_found: + type: array + items: + type: string + description: "Names of controls found (hide-zero, expand, copy, qr, faucet)" + + - id: GW-AUTO-WADDR-002 + name: "Create new address" + source_manual_id: GW-MAN-WADDR-002 + tags: [wallet, p1] + max_steps: 15 + steps: + - action: | + On the coin addresses section, look for a 'Create New Address', 'Generate Address', + or '+' button. + checkpoint: "A create/generate address button is visible" + - action: | + Click the create button. If a confirmation dialog appears, confirm it. + Wait 3 seconds for the new address to be generated. 
+ checkpoint: "A new address appears in the address list that was not there before" + expected_result: "New address generated and visible in the list" + extraction_schema: + type: object + properties: + new_address_created: + type: boolean + address_list_updated: + type: boolean + + - id: GW-AUTO-SECX-003 + name: "Unban pubkeys operation" + source_manual_id: GW-MAN-SECX-003 + tags: [security, p1] + max_steps: 15 + prompt: | + Navigate to Settings > Security. + Look for 'Unban Pubkeys', 'Unban', or 'Banned Keys' option. + + If found, click it. Observe any progress indicator, results dialog, + or snackbar message showing counts of unbanned keys or a no-op result. + + If not found, report that the option is not available. + + COMPLETE after executing or locating the unban option. + expected_result: "Unban operation runs and shows results or is not applicable" + extraction_schema: + type: object + properties: + unban_option_found: + type: boolean + operation_result: + type: string + description: "Success message, count of unbanned keys, or no-op message" + + - id: GW-AUTO-GATE-003 + name: "NFT menu disabled state and route safety" + source_manual_id: GW-MAN-GATE-003 + tags: [gating, p1] + max_steps: 15 + prompt: | + Look at the main navigation menu. Find the NFT menu item. + + If NFT appears disabled (grayed out, has a lock icon, or shows a disabled tooltip): + Hover over it and report the tooltip text. + Try clicking it anyway and report what happens. + + If NFT appears enabled, click it and verify the NFT section loads. + + COMPLETE after checking the NFT menu item state. 
+ expected_result: "NFT disabled state is clear with tooltip; or NFT section loads normally" + extraction_schema: + type: object + properties: + nft_menu_state: + type: string + description: "enabled, disabled, or hidden" + tooltip_text: + type: string + click_result: + type: string + + + # =========================================================================== + # PHASE 19: MISSING GRADE-A TESTS + # =========================================================================== + + - id: GW-AUTO-COIN-003 + name: "Deactivate coin with balance — warning and restore" + source_manual_id: GW-MAN-COIN-003 + tags: [coin, p1] + max_steps: 20 + steps: + - action: | + Navigate to the Coin Manager. Find DOC in the active coins list. + DOC should have a non-zero balance from earlier faucet funding. + checkpoint: "DOC is visible in the active coins list with a balance > 0" + - action: | + Click the toggle or remove button next to DOC to deactivate it. + A warning dialog should appear about deactivating a coin with a balance. + checkpoint: "A warning dialog is shown mentioning the coin has a balance" + - action: | + Click 'Cancel' or 'No' to dismiss the warning. DOC should remain active. + checkpoint: "DOC is still in the active coins list" + - action: | + Click deactivate again. This time confirm the deactivation. + Then re-activate DOC by finding it in the inactive list and enabling it. + checkpoint: "DOC is back in the active coins list" + expected_result: "Warning shown for balance coin; cancel preserves; reactivation works" + extraction_schema: + type: object + properties: + warning_shown: + type: boolean + cancel_preserved: + type: boolean + reactivation_successful: + type: boolean + + - id: GW-AUTO-NFT-002 + name: "NFT send happy path" + source_manual_id: GW-MAN-NFT-002 + tags: [nft, p1] + max_steps: 25 + timeout: 240 + steps: + - action: | + Navigate to the NFT section. If no NFTs are available, COMPLETE and + report that no NFTs exist for testing. 
+ If NFTs exist, click on the first NFT to open its detail page. + checkpoint: "NFT detail page is visible with name, image, and a send/transfer option" + - action: | + Click the 'Send', 'Transfer', or share icon on the NFT detail page. + A send form should appear with a recipient address field. + checkpoint: "NFT send form is visible with a recipient address field" + - action: | + Enter a valid recipient address 'RPDPE1XqGuHXSJn9q6VAaGDoRVMEwAYjT3' in the + recipient field. Click 'Send', 'Confirm', or 'Transfer'. + If a confirmation dialog appears, confirm it. + checkpoint: "A success message, pending indicator, or transaction hash is shown" + expected_result: "NFT transfer initiated with confirmation" + extraction_schema: + type: object + properties: + nfts_available: + type: boolean + transfer_initiated: + type: boolean + status_message: + type: string + + - id: GW-AUTO-NFT-003 + name: "NFT send failure — invalid recipient" + source_manual_id: GW-MAN-NFT-003 + tags: [nft, p1] + max_steps: 20 + steps: + - action: | + Navigate to an NFT detail page and open the send/transfer form. + If no NFTs exist, COMPLETE and report not applicable. + checkpoint: "NFT send form is visible" + - action: | + Enter an invalid address 'RThisIsNotAValidAddress12345' in the recipient field. + Click send/transfer. + checkpoint: "An error message appears about the invalid address" + - action: | + Clear the address field and enter an empty string or leave it blank. + Click send/transfer. 
+ checkpoint: "An error message appears about a required address field" + expected_result: "Invalid and empty addresses are blocked with error messages" + extraction_schema: + type: object + properties: + invalid_address_blocked: + type: boolean + invalid_address_error: + type: string + empty_address_blocked: + type: boolean + + - id: GW-AUTO-L10N-003 + name: "Locale-specific date and number formatting" + source_manual_id: GW-MAN-L10N-003 + tags: [l10n, p2] + max_steps: 25 + prompt: | + Navigate to Settings and change the language to a locale that uses + different number/date formatting (e.g., German or French if available; + German uses commas for decimals and dots for thousands). + + After the language changes: + 1. Navigate to the Dashboard and look at any displayed balance amounts. + Check if the decimal separator matches the locale convention. + 2. Navigate to a coin detail page with transaction history. + Look at any dates displayed and check if they use the locale format + (e.g., DD.MM.YYYY for German, DD/MM/YYYY for French). + 3. Report the formatting you observe. + + Change the language back to English before completing. + + COMPLETE after checking number and date formatting. + expected_result: "Numbers and dates reflect the selected locale formatting" + extraction_schema: + type: object + properties: + locale_selected: + type: string + decimal_separator_observed: + type: string + date_format_observed: + type: string + language_reverted: + type: boolean + + - id: GW-AUTO-CTOK-003 + name: "Cancel/back from custom token import — no side effects" + source_manual_id: GW-MAN-CTOK-003 + tags: [custom_token, p1] + max_steps: 15 + steps: + - action: | + Navigate to the custom token import form. + Select an EVM network and enter any contract address text. + checkpoint: "The import form has some entered data" + - action: | + Click 'Back', 'Cancel', or navigate away from the import form + using the back arrow or a navigation menu item.
+ checkpoint: "The import form is no longer visible; you are back on the coin list or previous page" + - action: | + Navigate to the coin list and verify that no new unknown token + was added during the cancelled import. + checkpoint: "The coin list is unchanged — no partially imported token is present" + expected_result: "Cancelling import leaves no side effects in the coin list" + extraction_schema: + type: object + properties: + cancel_successful: + type: boolean + no_side_effects: + type: boolean + + # =========================================================================== + # PHASE 20: GRADE-B AUTOMATED PORTIONS + # =========================================================================== + # These tests cover only the UI-automatable steps of Grade-B cases. + # The manual_verification_note field explains what must be checked by a human. + # =========================================================================== + + - id: GW-AUTO-AUTH-002a + name: "Login and logout cycle (within session)" + source_manual_id: GW-MAN-AUTH-002 + tags: [auth, p1] + max_steps: 20 + manual_verification_note: "Session persistence after app restart must be tested manually (MAN-AUTH-002b)" + steps: + - action: | + If on the dashboard, navigate to Settings and look for a 'Logout', + 'Sign Out', or 'Lock' option. Click it. + checkpoint: "The login/unlock screen or welcome screen is visible" + - action: | + Log back in by entering the password 'QaTestPass!2026' and clicking + the login/unlock button. 
+ checkpoint: "The dashboard is visible again after logging in" + expected_result: "Logout returns to login screen; re-login restores dashboard" + extraction_schema: + type: object + properties: + logout_successful: + type: boolean + relogin_successful: + type: boolean + + - id: GW-AUTO-DEX-002a + name: "Taker order form and submission attempt" + source_manual_id: GW-MAN-DEX-002 + tags: [dex, p1] + max_steps: 20 + manual_verification_note: "Actual taker execution depends on orderbook liquidity — verify manually if fill occurs" + prompt: | + Navigate to the DEX section and select the DOC/MARTY pair. + + Look for a 'Taker', 'Market', 'Swap', or 'Simple' tab/mode. + If available, switch to it. + + Enter an amount of '0.01' in the sell/amount field. + Observe if a matching order or price is shown from the orderbook. + + If a 'Swap', 'Trade', or 'Submit' button is available and enabled, click it. + Wait 5 seconds and report the result. + + If no orderbook liquidity exists, the form may show 'No orders available' + or the button may be disabled. Report this state. + + COMPLETE after observing the taker form behavior. + expected_result: "Taker form loads with pair; shows orderbook state or allows submission" + extraction_schema: + type: object + properties: + taker_form_available: + type: boolean + orderbook_has_liquidity: + type: boolean + submission_result: + type: string + + - id: GW-AUTO-DEX-004a + name: "Cancel open maker order" + source_manual_id: GW-MAN-DEX-004 + tags: [dex, p1] + max_steps: 20 + manual_verification_note: "Partial fill depends on external market activity — verify manually" + steps: + - action: | + Navigate to the DEX section. Look for an 'Open Orders', 'My Orders', + or 'Active Orders' tab or section. + checkpoint: "Open orders section is visible" + - action: | + If any open orders exist, click the cancel button (X, trash, or 'Cancel') + on one of them. If a confirmation dialog appears, confirm it. 
+ checkpoint: "The order is removed from the open orders list or a cancellation message appears" + expected_result: "Open order can be cancelled successfully" + extraction_schema: + type: object + properties: + orders_visible: + type: boolean + cancel_successful: + type: boolean + cancel_message: + type: string + + - id: GW-AUTO-DEX-005a + name: "Swap history filtering" + source_manual_id: GW-MAN-DEX-005 + tags: [dex, p1] + max_steps: 20 + manual_verification_note: "File export verification requires filesystem access — verify manually (MAN-VERIFY-DEX-005)" + prompt: | + Navigate to the DEX section and find the 'History', 'Swap History', + or 'Completed' tab/section. + + If history entries exist: + 1. Try any available filters (date range, pair, status). + 2. Verify that filtering changes the displayed entries. + 3. If a sort option exists (by date, amount), try it. + 4. Look for an 'Export' or download button. Report if it exists. + + If no history exists, report that. + + COMPLETE after testing available filters. + expected_result: "History view loads; filters change displayed results" + extraction_schema: + type: object + properties: + history_entries_exist: + type: boolean + filters_available: + type: array + items: + type: string + export_button_exists: + type: boolean + + - id: GW-AUTO-CDET-001a + name: "Coin detail — address display, copy, QR" + source_manual_id: GW-MAN-CDET-001 + tags: [wallet, p1] + max_steps: 20 + manual_verification_note: "Clipboard content and explorer link correctness must be verified manually (MAN-VERIFY-CDET-001)" + steps: + - action: | + Navigate to the DOC coin detail page. + Look for the receiving address displayed on the page. + checkpoint: "A wallet address starting with 'R' is visible on the coin detail page" + - action: | + Look for a copy button (clipboard icon) next to the address. + Click it. Observe any 'Copied' toast or feedback. 
+ checkpoint: "A 'Copied' or clipboard confirmation message appears" + - action: | + Look for a QR code button or icon. Click it. + A QR code image should appear in a dialog or overlay. + checkpoint: "A QR code is displayed" + - action: | + Close the QR dialog and look for an explorer link button. + If present, note its location but do not click (external navigation). + checkpoint: "Explorer link button identified or noted as absent" + expected_result: "Address visible with copy, QR, and explorer link controls" + extraction_schema: + type: object + properties: + address_displayed: + type: boolean + copy_feedback_shown: + type: boolean + qr_code_displayed: + type: boolean + explorer_link_exists: + type: boolean + + - id: GW-AUTO-CDET-002a + name: "Transaction list view and detail" + source_manual_id: GW-MAN-CDET-002 + tags: [wallet, p1] + max_steps: 20 + manual_verification_note: "Pending→confirmed progression and explorer link require real chain time (MAN-VERIFY-CDET-002)" + prompt: | + Navigate to the DOC coin detail page. + Look for a 'Transactions', 'History', or transaction list section. + + If transactions exist: + 1. Note the number of transactions visible. + 2. Click on the first transaction to see its details. + 3. Look for: amount, fee, date/time, transaction hash, status, addresses. + 4. Navigate back to the transaction list. + + Report the fields visible in the transaction detail. + + COMPLETE after reviewing the transaction list and one detail entry. 
+ expected_result: "Transaction list loads; detail view shows amount, hash, status" + extraction_schema: + type: object + properties: + transactions_exist: + type: boolean + transaction_count: + type: integer + detail_fields_visible: + type: array + items: + type: string + + - id: GW-AUTO-CDET-003a + name: "Price chart rendering" + source_manual_id: GW-MAN-CDET-003 + tags: [wallet, p1] + max_steps: 15 + manual_verification_note: "Offline fallback behavior requires network toggle — verify manually" + prompt: | + Navigate to a coin detail page that is likely to have price data + (e.g., KMD, BTC, ETH, or any major coin if active). + + Look for a price chart, graph, or chart area on the page. + + If a chart is visible: + 1. Report whether it shows price data with a line or candlestick graph. + 2. Look for time range selectors (1D, 1W, 1M, 1Y, All) and click one. + 3. Report if the chart updates when switching time ranges. + + If no chart is visible, report that. + + COMPLETE after checking chart functionality. + expected_result: "Price chart renders with data; time range selectors work" + extraction_schema: + type: object + properties: + chart_visible: + type: boolean + chart_has_data: + type: boolean + time_ranges_work: + type: boolean + + - id: GW-AUTO-SEC-001a + name: "Seed phrase reveal (password-gated)" + source_manual_id: GW-MAN-SEC-001 + tags: [security, p1] + max_steps: 20 + manual_verification_note: "Screenshot masking and app-backgrounding behavior are manual (MAN-SEC-001b)" + steps: + - action: | + Navigate to Settings > Security. Look for 'View Seed', 'Show Seed', + 'Recovery Phrase', or 'Backup Seed' option. Click it. + checkpoint: "A password prompt appears before showing the seed" + - action: | + Enter the password 'QaTestPass!2026' and submit. + checkpoint: "Seed words (12 or 24) are now displayed on screen" + - action: | + Navigate back or close the seed view. Verify the seed is no longer visible. 
+ checkpoint: "Seed words are no longer displayed after navigating away" + expected_result: "Seed is password-gated and hidden after navigation" + extraction_schema: + type: object + properties: + password_required: + type: boolean + seed_displayed_after_auth: + type: boolean + seed_hidden_on_exit: + type: boolean + + - id: GW-AUTO-SET-001a + name: "Theme and language switching" + source_manual_id: GW-MAN-SET-001 + tags: [settings, p1] + max_steps: 20 + manual_verification_note: "Persistence after app restart is manual (MAN-SET-004)" + prompt: | + Navigate to Settings. + + Look for a theme switcher (Light/Dark mode toggle). + If found, switch to the opposite theme. Observe if the background + and text colors change. Switch back. + + Look for a language/locale selector. + If found, switch to a different language briefly. + Verify the UI text changes. Switch back to English. + + Report which settings were available and functional. + + COMPLETE after testing available appearance settings. + expected_result: "Theme and/or language changes take effect immediately" + extraction_schema: + type: object + properties: + theme_toggle_found: + type: boolean + theme_change_visible: + type: boolean + language_selector_found: + type: boolean + language_change_visible: + type: boolean + + - id: GW-AUTO-BOT-003a + name: "Edit, stop, and restart market maker bot" + source_manual_id: GW-MAN-BOT-003 + tags: [bot, p1] + max_steps: 25 + manual_verification_note: "Persistence after app restart is manual" + steps: + - action: | + Navigate to the Market Maker Bot section. + If a running bot exists, look for an edit button or settings icon. + Click it to open the bot configuration. + checkpoint: "Bot configuration/edit form is visible" + - action: | + Change the spread value slightly (e.g., from 5 to 6) and save. + checkpoint: "Configuration saved with updated spread value" + - action: | + Look for a 'Stop' or 'Pause' button on the bot. Click it. 
+        checkpoint: "Bot status changes to 'Stopped', 'Paused', or similar"
+      - action: |
+          Click 'Start', 'Resume', or 'Restart' to reactivate the bot.
+        checkpoint: "Bot status returns to 'Running' or 'Active'"
+    expected_result: "Bot can be edited, stopped, and restarted"
+    extraction_schema:
+      type: object
+      properties:
+        edit_successful:
+          type: boolean
+        stop_successful:
+          type: boolean
+        restart_successful:
+          type: boolean
+
+  - id: GW-AUTO-SECX-001a
+    name: "Private key export UI"
+    source_manual_id: GW-MAN-SECX-001
+    tags: [security, p1]
+    max_steps: 20
+    manual_verification_note: "Actual download/share action may cross browser boundary — verify file manually"
+    steps:
+      - action: |
+          Navigate to a coin detail page (e.g., DOC).
+          Look for a 'Private Key', 'Export Key', or key icon option.
+          If not on the coin page, check Settings > Security.
+        checkpoint: "A private key export option is found"
+      - action: |
+          Click the export option. If a password prompt appears,
+          enter 'QaTestPass!2026' and submit.
+        checkpoint: "The private key is displayed on screen or a download/copy option appears"
+      - action: |
+          If a copy button is available, click it. Look for copy confirmation.
+          Navigate away to ensure the key is no longer visible.
+        checkpoint: "Key is hidden after navigating away"
+    expected_result: "Private key export is password-gated and displayed securely"
+    extraction_schema:
+      type: object
+      properties:
+        export_option_found:
+          type: boolean
+        password_required:
+          type: boolean
+        key_displayed:
+          type: boolean
+        key_hidden_on_exit:
+          type: boolean
+
+  - id: GW-AUTO-SETX-002a
+    name: "Trading bot master toggles"
+    source_manual_id: GW-MAN-SETX-002
+    tags: [settings, p1]
+    max_steps: 15
+    manual_verification_note: "Whether a running bot actually stops on disable requires active bot verification"
+    prompt: |
+      Navigate to Settings > Advanced or Trading settings.
+      Look for toggles related to the Market Maker Bot feature:
+      'Enable Trading Bot', 'Market Maker', or similar.
+ + If found, toggle it off then on. Observe any confirmation dialog + or immediate state change. + + Report the toggle names, their current states, and whether they + respond to clicks. + + COMPLETE after testing bot-related toggles. + expected_result: "Bot toggles are interactive and state changes visually" + extraction_schema: + type: object + properties: + bot_toggles_found: + type: boolean + toggle_names: + type: array + items: + type: string + toggles_respond: + type: boolean + + - id: GW-AUTO-SETX-004a + name: "View and copy swap data" + source_manual_id: GW-MAN-SETX-004 + tags: [settings, p2] + max_steps: 15 + manual_verification_note: "File export requires filesystem access — verify manually" + prompt: | + Navigate to Settings > Advanced. + Look for 'Show Swap Data', 'Swap Export', or similar option. + + If found, click it. Observe if swap data is displayed in a dialog, + text area, or expandable section. + + If a 'Copy' button is available, click it and observe copy feedback. + + Report what is displayed. + + COMPLETE after reviewing the swap data view. + expected_result: "Swap data is viewable in settings with copy option" + extraction_schema: + type: object + properties: + swap_data_option_found: + type: boolean + data_displayed: + type: boolean + copy_button_exists: + type: boolean + + - id: GW-AUTO-WALX-002a + name: "Wallet overview tabs — Assets, Growth, PnL" + source_manual_id: GW-MAN-WALX-002 + tags: [wallet, p1] + max_steps: 20 + manual_verification_note: "Logged-out fallback behavior requires logout — verify manually" + prompt: | + Navigate to the wallet overview or dashboard. + Look for tabs or toggle buttons labeled 'Assets', 'Growth', + 'Profit & Loss', 'PnL', 'Portfolio', or similar. + + If tabs exist: + 1. Click each tab and verify that content changes. + 2. Report the tab names and whether each loaded content. + + If a chart is shown on any tab, note its type (line, bar, etc.). + + COMPLETE after visiting all available overview tabs. 
+ expected_result: "Overview tabs switch content correctly" + extraction_schema: + type: object + properties: + tabs_found: + type: array + items: + type: string + all_tabs_load_content: + type: boolean + + - id: GW-AUTO-RWD-001a + name: "Rewards section view" + source_manual_id: GW-MAN-RWD-001 + tags: [wallet, p2] + max_steps: 15 + manual_verification_note: "Actual reward claim depends on KMD reward availability — verify manually" + prompt: | + Navigate to the KMD coin detail page (if KMD is active). + Look for a 'Rewards', 'Claim Rewards', or similar section. + + If found: + 1. Observe if a reward amount or status is displayed. + 2. If a 'Claim' or 'Collect' button exists, note whether it is + enabled or disabled. + 3. If a refresh button exists, click it. + + If KMD is not active or rewards are not visible, report that. + + COMPLETE after reviewing the rewards section. + expected_result: "Rewards section accessible with status display" + extraction_schema: + type: object + properties: + rewards_section_found: + type: boolean + reward_amount_displayed: + type: boolean + claim_button_state: + type: string + description: "enabled, disabled, or not_found" + + - id: GW-AUTO-BREF-001a + name: "Bitrefill widget visibility" + source_manual_id: GW-MAN-BREF-001 + tags: [fiat, p2] + max_steps: 15 + manual_verification_note: "Bitrefill widget interaction crosses domain boundaries — verify manually" + prompt: | + Navigate to the Fiat or Buy Crypto section. + Look for a 'Bitrefill', 'Gift Cards', or third-party widget/button. + + If found, report its location and label. + Click it and observe what happens (embedded widget, external redirect, etc.). + + If not found, check Settings for any Bitrefill toggle or option. + + COMPLETE after locating the Bitrefill entry point. 
+ expected_result: "Bitrefill option is visible and clickable" + extraction_schema: + type: object + properties: + bitrefill_found: + type: boolean + location: + type: string + click_result: + type: string + + - id: GW-AUTO-ZHTL-001a + name: "ZHTLC activation dialog" + source_manual_id: GW-MAN-ZHTL-001 + tags: [coin, p2] + max_steps: 20 + manual_verification_note: "Logout during activation must be tested manually" + prompt: | + Navigate to the Coin Manager and search for a ZHTLC coin + (e.g., ARRR, ZOMBIE, or any privacy coin marked as ZHTLC). + + If a ZHTLC coin is found: + 1. Click to activate it. + 2. Observe if a special activation dialog or progress indicator appears + (ZHTLC coins typically require extended activation time). + 3. Report the dialog content and any estimated time shown. + 4. If the activation takes too long, note the progress state. + + If no ZHTLC coin is found, report that. + + COMPLETE after observing the ZHTLC activation flow. + expected_result: "ZHTLC activation shows progress dialog with estimated time" + extraction_schema: + type: object + properties: + zhtlc_coin_found: + type: boolean + activation_dialog_shown: + type: boolean + estimated_time_displayed: + type: string + activation_started: + type: boolean + + # =========================================================================== + # PHASE 21: COMPOSITE TESTS — formerly manual, now automated via + # Playwright (browser-level offline, viewport, lifecycle) + Skyvern + # =========================================================================== + + # --- NETWORK MANIPULATION TESTS --- + + - id: GW-AUTO-DASH-002 + name: "Balance refresh + offline indicator" + source_manual_id: GW-MAN-DASH-002 + tags: [dashboard, network, composite] + timeout: 300 + expected_result: "Offline indicator shown when network disabled; recovery on re-enable" + phases: + - type: skyvern + action: "Verify dashboard loads with coin balances" + prompt: | + Navigate to the dashboard. 
Verify coin balances are visible and + the page is fully loaded. Report the page state. + expected: "Dashboard with balances visible" + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 5} + - type: skyvern + action: "Check for offline indicator" + prompt: | + Look at the current screen. Is there an offline indicator, error + banner, connection warning, or any visual cue that the app has lost + connectivity? Report what you see. + expected: "Offline indicator or error banner visible" + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 10} + - type: skyvern + action: "Verify recovery after network restore" + prompt: | + The network has been restored. Check if the dashboard has recovered: + are balances visible again? Is there still an offline indicator? + Report the current state. + expected: "Dashboard recovered with balances visible" + + - id: GW-AUTO-SEND-006 + name: "Interrupted send + duplicate prevention" + source_manual_id: GW-MAN-SEND-006 + tags: [send, network, critical, composite] + timeout: 360 + expected_result: "Send initiated; network kill shows pending; recovery reconciles; no duplicates" + phases: + - type: skyvern + action: "Navigate to DOC send and fill form" + prompt: | + Navigate to the DOC coin page and open the send form. + Enter recipient address 'RPDPE1XqGuHXSJn9q6VAaGDoRVMEwAYjT3' + and amount '0.001'. Click Send/Confirm to submit. + Wait 2 seconds. Report the current state (pending, submitted, etc.). + expected: "Transaction submitted or pending" + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 5} + - type: skyvern + action: "Observe pending state during outage" + prompt: | + The network is now disabled. Look at the screen. Is there a pending + transaction indicator, error message, or offline warning? + Report exactly what you see. 
+ expected: "Pending state or error visible" + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 15} + - type: skyvern + action: "Verify recovery and no duplicate" + prompt: | + Network is restored. Navigate to the DOC transaction history. + Check if the transaction status has reconciled (confirmed or failed). + Count how many instances of this transaction appear. + Report status and count. + expected: "Transaction reconciled, exactly 1 instance" + + - id: GW-AUTO-ERR-001 + name: "Global network outage messaging" + source_manual_id: GW-MAN-ERR-001 + tags: [error_handling, network, composite] + timeout: 300 + expected_result: "Offline indicators on all screens; recovery with no stale spinners" + phases: + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 5} + - type: skyvern + action: "Check offline state across screens" + prompt: | + The network is disabled. Navigate through these screens in order: + Dashboard, DEX, Bridge, Settings. + For each screen, report whether you see an offline indicator, error + message, or spinner. Note any crashes or blank screens. + expected: "Offline indicators on screens, no crashes" + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 15} + - type: skyvern + action: "Verify full recovery" + prompt: | + Network is restored. Check Dashboard, DEX, Bridge screens. + Are they showing live data again? Are there any stale spinners + or error messages still stuck on screen? + Report the state of each screen. + expected: "All screens recovered, no stale spinners" + + - id: GW-AUTO-ERR-003 + name: "Stale-state reconciliation after offline" + source_manual_id: GW-MAN-ERR-003 + tags: [error_handling, network, composite] + timeout: 360 + expected_result: "After network restore, local state matches authoritative state" + phases: + - type: skyvern + action: "Initiate a transaction" + prompt: | + Navigate to DOC send. 
Enter recipient 'RKXzCCaT5ukqnyJBKTr9KyEpCBHR8itEFd' + and amount '0.001'. Click send/confirm. + expected: "Transaction submitted" + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 10} + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 20} + - type: skyvern + action: "Check state reconciliation" + prompt: | + Navigate to DOC transaction history. Check the latest transaction. + Is its status resolved (confirmed or failed, not stuck on pending)? + Are there any duplicate or ghost entries? + Report transaction count and status. + expected: "Transaction reconciled, no duplicates" + + # --- APP LIFECYCLE TESTS (browser restart simulation) --- + + - id: GW-AUTO-AUTH-002b + name: "Session persistence across app restart" + source_manual_id: GW-MAN-AUTH-002 + tags: [auth, lifecycle, composite] + timeout: 300 + expected_result: "Remember-me persists session; logout clears it" + phases: + - type: skyvern + action: "Log in with remember-me enabled" + prompt: | + Log in to the wallet with password 'QaTestPass!2026'. + If there is a 'Remember Me', 'Remember Wallet', or 'Stay Logged In' + checkbox or toggle, enable it before logging in. + Wait until the dashboard loads. + expected: "Logged in with remember-me enabled" + - type: playwright + action: restart_session + - type: skyvern + action: "Check session after restart" + prompt: | + The browser was closed and reopened (simulating app restart). + What do you see? Is there a quick-login prompt, auto-login, + or are you on the full login screen? + Report the current state. + expected: "Session restored or quick-login available" + - type: skyvern + action: "Log out and verify session clear" + prompt: | + If logged in, go to Settings and click Logout. + Verify you are on the login/welcome screen. 
+ expected: "Logged out" + - type: playwright + action: restart_session + - type: skyvern + action: "Verify session cleared after logout+restart" + prompt: | + After logout and app restart, what do you see? + Is it the full login screen with no auto-login? + expected: "Full login screen, no auto-login" + + - id: GW-AUTO-WAL-003 + name: "Wallet selection persistence across restart" + source_manual_id: GW-MAN-WAL-003 + tags: [wallet, lifecycle, composite] + timeout: 300 + expected_result: "Selected wallet persists after restart" + phases: + - type: skyvern + action: "Switch to a non-default wallet" + prompt: | + Open the wallet manager. If multiple wallets exist, switch to + a non-default wallet. Note its name. If only one wallet exists, + note the wallet name. + Report the active wallet name. + expected: "Wallet selected and name noted" + extraction_schema: + type: object + properties: + wallet_name: {type: string} + - type: playwright + action: restart_session + - type: skyvern + action: "Verify wallet persisted" + prompt: | + After app restart, log in if needed with password 'QaTestPass!2026'. + Check which wallet is active. Report the active wallet name. + expected: "Same wallet is active after restart" + extraction_schema: + type: object + properties: + wallet_name: {type: string} + + - id: GW-AUTO-DASH-003 + name: "Dashboard preferences persist across restart" + source_manual_id: GW-MAN-DASH-003 + tags: [dashboard, lifecycle, composite] + timeout: 300 + expected_result: "Dashboard customizations persist after logout and restart" + phases: + - type: skyvern + action: "Customize dashboard" + prompt: | + On the dashboard, toggle 'Hide Balances' (or similar privacy toggle) ON. + If a 'Hide Zero Balances' toggle exists, enable it too. + Note which toggles you changed. + expected: "Dashboard customized" + - type: skyvern + action: "Logout and re-login" + prompt: | + Navigate to Settings and click Logout. Then log back in with + password 'QaTestPass!2026'. 
Check the dashboard. + Are the toggles still in the state you set them? + expected: "Dashboard preferences preserved after re-login" + - type: playwright + action: restart_session + - type: skyvern + action: "Verify after restart" + prompt: | + After app restart, log in with 'QaTestPass!2026'. + Check the dashboard toggles. Are they still in the customized state? + expected: "Dashboard preferences preserved after restart" + + - id: GW-AUTO-SET-004 + name: "Settings persistence across logout/restart" + source_manual_id: GW-MAN-SET-004 + tags: [settings, lifecycle, composite] + timeout: 300 + expected_result: "Settings persist after logout and app restart" + phases: + - type: skyvern + action: "Change multiple settings" + prompt: | + Navigate to Settings. Change at least 2 settings: + 1. Toggle test coins ON (if off) or OFF (if on). + 2. Toggle any privacy/analytics setting. + Note the settings you changed and their new states. + expected: "Settings changed" + - type: skyvern + action: "Logout and re-login, verify" + prompt: | + Logout via Settings > Logout. Log back in with 'QaTestPass!2026'. + Navigate to Settings. Are the settings you changed still in the + new state? Report each setting and its current value. + expected: "Settings preserved after re-login" + - type: playwright + action: restart_session + - type: skyvern + action: "Verify after restart" + prompt: | + After restart, log in with 'QaTestPass!2026'. Check Settings. + Are the changed settings still in the new state? + expected: "Settings preserved after restart" + + - id: GW-AUTO-FIAT-005 + name: "Fiat form reset across logout" + source_manual_id: GW-MAN-FIAT-005 + tags: [fiat, lifecycle, composite] + timeout: 240 + expected_result: "Fiat form resets cleanly after logout and re-login" + phases: + - type: skyvern + action: "Partially fill fiat form" + prompt: | + Navigate to the Fiat section. Select a currency and enter an amount + like '100'. Do NOT submit. Note what you entered. 
+ expected: "Fiat form partially filled" + - type: skyvern + action: "Logout and re-login, check fiat" + prompt: | + Logout via Settings. Log back in with 'QaTestPass!2026'. + Navigate to the Fiat section. Is the form clean/empty, + or does it still have the previous values? + expected: "Fiat form re-initialized cleanly" + + - id: GW-AUTO-QLOG-001 + name: "Quick-login persistence across restart" + source_manual_id: GW-MAN-QLOG-001 + tags: [auth, lifecycle, composite] + timeout: 300 + expected_result: "Quick-login prompt appears after restart when remember-me is on" + phases: + - type: skyvern + action: "Enable remember-me and log out" + prompt: | + Log in with 'QaTestPass!2026' with the remember-me option enabled. + Once on the dashboard, go to Settings and logout. + expected: "Logged in with remember-me, then logged out" + - type: playwright + action: restart_session + - type: skyvern + action: "Check for quick-login prompt" + prompt: | + After app restart, look at the screen. Is there a quick-login prompt + showing the remembered wallet? Or a password-only screen? + Report what you see. + expected: "Quick-login or remembered wallet prompt appears" + + # --- DEEP LINK TEST --- + + - id: GW-AUTO-NAV-002 + name: "Deep link handling with auth gating" + source_manual_id: GW-MAN-NAV-002 + tags: [navigation, security, composite] + timeout: 240 + expected_result: "Deep link while logged out enforces auth; redirect after login" + phases: + - type: skyvern + action: "Ensure logged out" + prompt: | + If you see a dashboard, navigate to Settings and click Logout. + If already on the login/welcome screen, COMPLETE. + expected: "On login screen" + - type: playwright + action: navigate + args: {url_suffix: "#/dex"} + - type: skyvern + action: "Check auth gating on deep link" + prompt: | + A direct URL to the DEX page was opened while logged out. + What do you see? Are you on the login screen (auth gating enforced)? + Or did the DEX page load without login? 
+ expected: "Auth gating enforced, login required" + - type: skyvern + action: "Login and check redirect" + prompt: | + Log in with 'QaTestPass!2026'. After login, which page loaded? + Are you on the DEX page (intended deep link destination) or + the default dashboard? + expected: "Redirected to DEX after login" + + # --- RESPONSIVE / BREAKPOINT TESTS --- + + - id: GW-AUTO-RESP-001 + name: "Responsive breakpoint behavior" + source_manual_id: GW-MAN-RESP-001 + tags: [responsive, composite] + timeout: 360 + expected_result: "Layout adapts correctly at mobile, tablet, and desktop breakpoints" + phases: + - type: playwright + action: set_viewport + args: {width: 375, height: 812} + - type: skyvern + action: "Check mobile layout" + prompt: | + The viewport is now mobile width (375px). Look at the dashboard. + Are navigation, coin cards, and action buttons all visible and + properly stacked? Is there a hamburger menu or bottom nav? + Report any overflow, cutoff, or broken layout. + expected: "Mobile layout renders correctly" + - type: playwright + action: set_viewport + args: {width: 768, height: 1024} + - type: skyvern + action: "Check tablet layout" + prompt: | + The viewport is now tablet width (768px). Does the layout adapt? + Is there more content visible than at mobile width? + Report the layout style (sidebar visible? cards in grid?). + expected: "Tablet layout adapts correctly" + - type: playwright + action: set_viewport + args: {width: 1440, height: 900} + - type: skyvern + action: "Check desktop layout" + prompt: | + The viewport is now desktop width (1440px). Is the full layout + displayed with sidebar, main content area, and proper spacing? + Report any issues. 
+        expected: "Desktop layout fully displayed"
+
+  - id: GW-AUTO-RESP-002
+    name: "Form state retention during viewport resize"
+    source_manual_id: GW-MAN-RESP-002
+    tags: [responsive, composite]
+    timeout: 240
+    expected_result: "Form data preserved after viewport resize"
+    phases:
+      - type: skyvern
+        action: "Fill form with test data"
+        prompt: |
+          Navigate to the DOC send screen. Enter recipient address
+          'RPDPE1XqGuHXSJn9q6VAaGDoRVMEwAYjT3' and amount '0.5'.
+          Do NOT submit. Report that the form is filled.
+        expected: "Form filled with test data"
+      - type: playwright
+        action: set_viewport
+        args: {width: 375, height: 812}
+      - type: os_call
+        action: wait
+        args: {seconds: 3}
+      - type: skyvern
+        action: "Check form after resize"
+        prompt: |
+          The viewport was just resized to mobile width. Check the send form.
+          Are the recipient address and amount still filled in?
+          Was there any accidental submission?
+          Report the form state.
+        expected: "Form data preserved after resize"
+      - type: playwright
+        action: set_viewport
+        args: {width: 1440, height: 900}
+
+  # --- CLOCK MANIPULATION ---
+
+  - id: GW-AUTO-WARN-001
+    name: "Clock warning banner under invalid time"
+    source_manual_id: GW-MAN-WARN-001
+    tags: [system_health, composite]
+    timeout: 240
+    expected_result: "No banner with valid clock; warning banner with invalid clock"
+    phases:
+      - type: skyvern
+        action: "Check with valid clock"
+        prompt: |
+          Navigate to the DEX or Bridge section. Is there any clock warning
+          banner or time-related warning message visible on screen?
+          Report what you see.
+        expected: "No clock warning banner"
+      - type: playwright
+        action: mock_clock
+        args: {offset_hours: 8760}
+      - type: skyvern
+        action: "Check with invalid clock"
+        prompt: |
+          The system clock has been mocked to a future date. Reload the page
+          and navigate to the DEX or Bridge. Is there now a clock warning
+          banner or time synchronization warning visible?
+          Report any warning text.
+ expected: "Clock warning banner appears" + - type: playwright + action: reset_clock + + # --- FILESYSTEM OPERATIONS --- + + - id: GW-AUTO-SETX-003 + name: "Export/import maker orders JSON" + source_manual_id: GW-MAN-SETX-003 + tags: [settings, filesystem, composite] + timeout: 300 + expected_result: "Export creates valid JSON; import succeeds" + phases: + - type: skyvern + action: "Navigate to export option" + prompt: | + Navigate to Settings > Advanced. Find the 'Export Maker Orders' + or similar export option. Report its exact label and location. + expected: "Export option found" + - type: playwright + action: capture_download + args: {click_text: "Export"} + - type: skyvern + action: "Verify export and try import" + prompt: | + If an export was triggered, navigate to the import option. + If an import/upload button exists, report its label. + If there is a text area for pasting JSON, report that. + expected: "Import option identified" + + - id: GW-AUTO-SETX-006 + name: "Download logs" + source_manual_id: GW-MAN-SETX-006 + tags: [settings, filesystem, composite] + timeout: 240 + expected_result: "Log file downloads successfully" + phases: + - type: skyvern + action: "Navigate to download logs" + prompt: | + Navigate to Settings > Advanced. Find the 'Download Logs' or + 'Export Logs' option. Report its exact label. + expected: "Download logs option found" + - type: playwright + action: capture_download + args: {click_text: "Download"} + + # --- CLIPBOARD VERIFICATION --- + + - id: GW-AUTO-SEC-003 + name: "Clipboard verification after address copy" + source_manual_id: GW-MAN-SEC-003 + tags: [security, clipboard, composite] + timeout: 240 + expected_result: "Copied address matches displayed address" + phases: + - type: skyvern + action: "Copy wallet address" + prompt: | + Navigate to the DOC coin detail page. Find the receiving address. + Click the copy button (clipboard icon) next to the address. + Report the address text you see on screen. 
+ expected: "Address copied" + extraction_schema: + type: object + properties: + displayed_address: {type: string} + - type: os_call + action: read_clipboard + - type: skyvern + action: "Verify clipboard matches" + prompt: | + Report the displayed DOC address on screen one more time. + expected: "Address confirmed" + + # --- KEYBOARD ACCESSIBILITY --- + + - id: GW-AUTO-A11Y-001 + name: "Keyboard-only navigation audit" + source_manual_id: GW-MAN-A11Y-001 + tags: [accessibility, composite] + timeout: 300 + expected_result: "All elements reachable via Tab; no keyboard traps; logical focus order" + phases: + - type: playwright + action: navigate + - type: os_call + action: wait + args: {seconds: 5} + - type: playwright + action: keyboard_audit + args: {max_tabs: 80} + + # --- CONTRAST / ACCESSIBILITY AUDIT --- + + - id: GW-AUTO-A11Y-003 + name: "Contrast and accessibility audit (axe-core)" + source_manual_id: GW-MAN-A11Y-003 + tags: [accessibility, composite] + timeout: 180 + expected_result: "No critical or serious accessibility violations" + phases: + - type: playwright + action: navigate + - type: os_call + action: wait + args: {seconds: 5} + - type: playwright + action: accessibility_audit + + # --- REMAINING NETWORK TESTS (DEX, Bridge) --- + + - id: GW-AUTO-DEX-006 + name: "DEX recovery after network drop" + source_manual_id: GW-MAN-DEX-006 + tags: [dex, network, composite] + timeout: 360 + expected_result: "DEX orders/history reconcile after network drop and restore" + phases: + - type: skyvern + action: "Place a maker order" + prompt: | + Navigate to the DEX. Select DOC/MARTY pair. Place a maker order + with price '1' and amount '0.01'. Wait for order confirmation. 
+ expected: "Order placed" + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 8} + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 15} + - type: skyvern + action: "Check DEX state after recovery" + prompt: | + Navigate to DEX > Open Orders and History. + Is the order still present? Are there duplicates or ghost orders? + Report order count and statuses. + expected: "No duplicate or ghost orders" + + - id: GW-AUTO-BRDG-004 + name: "Bridge failure/recovery after network drop" + source_manual_id: GW-MAN-BRDG-004 + tags: [bridge, network, composite] + timeout: 360 + expected_result: "Bridge history reflects correct status after network recovery" + phases: + - type: skyvern + action: "Initiate bridge transfer" + prompt: | + Navigate to Bridge. Select a supported pair and enter a small amount. + Click confirm to initiate the transfer. Report the initial status. + expected: "Bridge transfer initiated" + - type: playwright + action: set_offline + - type: os_call + action: wait + args: {seconds: 8} + - type: playwright + action: set_online + - type: os_call + action: wait + args: {seconds: 20} + - type: skyvern + action: "Check bridge history after recovery" + prompt: | + Navigate to bridge history. Check the latest transfer status. + Is it resolved (completed/failed) or stuck? Any duplicates? + expected: "Bridge status reconciled" + + - id: GW-AUTO-SETX-005 + name: "Import swaps from JSON" + source_manual_id: GW-MAN-SETX-005 + tags: [settings, filesystem, composite] + timeout: 240 + expected_result: "Valid JSON imports; malformed JSON shows error" + phases: + - type: skyvern + action: "Navigate to import swaps" + prompt: | + Navigate to Settings > Advanced. Find 'Import Swaps' or + 'Paste Swap Data' option. If a text area is shown, enter the + text 'not valid json' and click import/submit. + Report the error message shown. 
+ expected: "Error shown for invalid JSON" + - type: skyvern + action: "Try valid empty array" + prompt: | + Clear the text area and enter '[]' (empty JSON array). + Click import/submit. Report the result (error, success, or no-op). + expected: "Empty array handled gracefully" + +# ============================================================================= +# REGRESSION PACKS (tag-based filtering) +# ============================================================================= +# +# Smoke Pack: python -m runner.runner --tag smoke +# Critical Pack: python -m runner.runner --tag critical +# P0 Pack: python -m runner.runner --tag p0 +# Full Pack: python -m runner.runner (no filter) +# Composite: python -m runner.runner --tag composite +# With manual: python -m runner.runner --include-manual +# Manual only: python -m runner.runner --manual-only +# +# =============================================================================