diff --git a/.github/workflows/test-mcp-regression.yml b/.github/workflows/test-mcp-regression.yml
index 19cb25b0..9537a0fe 100644
--- a/.github/workflows/test-mcp-regression.yml
+++ b/.github/workflows/test-mcp-regression.yml
@@ -129,6 +129,25 @@ jobs:
           --skill-variant from-skill-md
           -o test-results/m1-adversarial.json
 
+      # ── M2 grounding-recall eval (warn-only, #280 PR-2) ────────────
+      # Drives the bicameral-bind skill against tests/fixtures/grounding_recall/
+      # — synthetic fixture with 23 decisions across same-name-different-module,
+      # similar-intent, and cross-language cases. Cache hits at
+      # tests/eval/fixtures/bind_judge/ keep CI cost ~$0 unless the dataset,
+      # fixture repo, or skill change. Warn-only initially per #280's gating-
+      # is-observability framing — we ship the measurement, observe the
+      # baseline, then ratchet to --gate-mode hard once the signal is stable.
+      - name: M2 grounding-recall eval (warn-only)
+        if: matrix.os == 'ubuntu-latest'
+        continue-on-error: true
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001
+        run: >
+          python tests/eval_grounding_recall.py
+          --gate-mode warn
+          -o test-results/m2-grounding-recall.json
+
       # ── Generate rich E2E report from artifacts ────────────────────
       # Ubuntu-only: the script consumes the medusa adversarial corpus
       # (cloned only on Ubuntu above) plus the Phase 3 E2E artifacts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index de577922..2b5bb8dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ All notable changes to bicameral-mcp are tracked here. Format loosely follows
 
 - **`skills/bicameral-bind/SKILL.md` (#280).** New skill that extracts the bind contract out of `skills/bicameral-ingest/SKILL.md` §2 and tightens it from advisory to mandatory: the agent must Read at least one candidate file end-to-end, confirm the symbol via `validate_symbols`, and abort on weak evidence. Documents the handler-side rejection contract added in this release. `bicameral-ingest` §2 now points at the new skill instead of duplicating the verification rules.
 
+- **`tests/eval_grounding_recall.py` — M2 grounding-recall eval harness (#280 PR-2).** Synthetic-fixture benchmark that drives the `bicameral-bind` skill end-to-end and measures three axes: precision (of bindings the agent committed, what fraction were correct), recall (of ground-truth bindings, how many the agent got right), and abort rate (first-class signal because the bind skill makes "abort on weak evidence" an explicit contract). Dataset at `tests/fixtures/grounding_recall/dataset.py` with 23 cases across same-name-different-module (5), similar-intent-different-symbol (10), and cross-language (8) — fixture repo at `tests/fixtures/grounding_recall/repo/`. Headless caller-LLM driver at `tests/eval/_bind_judge.py` (modeled on `_skill_judge.py`) drives a multi-turn `read_file` / `validate_symbols` / `submit_binding` tool-use loop with response caching at `tests/eval/fixtures/bind_judge/` keyed on SHA(model | skill | repo | decision). Default gates: recall ≥ 0.80, precision ≥ 0.85, abort_rate ≤ 0.30 per #280 acceptance. New CI step is **warn-only initially** (`continue-on-error: true`, mirrors the M1 step) — gather a baseline first, ratchet to `--gate-mode hard` once the signal is stable.
+
 ### Changed
 
 - **`code_locator/tools/validate_symbols.py`: dropped unused `self._db` field.** The retention comment ("Retained so `code_locator.adapter.ground_mappings()` can reach `db.lookup_by_file()`") referenced a path deleted in v0.6.0; the field had zero readers.
diff --git a/tests/eval/_bind_judge.py b/tests/eval/_bind_judge.py
new file mode 100644
index 00000000..fd0ca1e3
--- /dev/null
+++ b/tests/eval/_bind_judge.py
@@ -0,0 +1,491 @@
+"""Headless caller-LLM driver for the bicameral-bind skill (#280 PR-2).
+
+Drives `skills/bicameral-bind/SKILL.md` end-to-end against a synthetic
+fixture repo: the LLM gets `read_file`, `validate_symbols`, and
+`submit_binding` tools; we run a multi-turn tool-use loop until it
+either submits a binding or aborts on weak evidence.
+
+Modeled on tests/eval/_skill_judge.py — same x-api-key auth, same
+fixture-cache keyed on SHA(model | skill_sha | repo_sha | input_sha).
+Cache hits keep CI cost ~$0 unless the dataset, fixture repo, or skill
+change.
+
+Environment:
+    ANTHROPIC_API_KEY                       required for live calls
+    BICAMERAL_GROUNDING_EVAL_MODEL          default "claude-haiku-4-5-20251001"
+    BICAMERAL_GROUNDING_EVAL_RECORD=1       force-bypass cache, re-record
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SKILL_MD_PATH = REPO_ROOT / "skills" / "bicameral-bind" / "SKILL.md"
+CACHE_DIR = Path(__file__).resolve().parent / "fixtures" / "bind_judge"
+
+ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+ANTHROPIC_API_VERSION = "2023-06-01"
+DEFAULT_MODEL = "claude-haiku-4-5-20251001"
+MAX_OUTPUT_TOKENS = 2048
+MAX_TURNS = 8
+REQUEST_TIMEOUT_S = 90.0
+
+
+# ── Tool schemas exposed to the LLM ─────────────────────────────────────────
+
+
+READ_FILE_TOOL: dict[str, Any] = {
+    "name": "read_file",
+    "description": (
+        "Read the full contents of a file in the fixture repo. Use this to "
+        "confirm a candidate symbol's body actually implements the decision's "
+        "intent before submitting a binding. Path is repo-relative "
+        "(e.g. 'src/checkout/orders.py')."
+    ),
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "file_path": {
+                "type": "string",
+                "description": "Repo-relative path to the file.",
+            }
+        },
+        "required": ["file_path"],
+    },
+}
+
+
+VALIDATE_SYMBOLS_TOOL: dict[str, Any] = {
+    "name": "validate_symbols",
+    "description": (
+        "Confirm one or more candidate symbol names exist in the fixture "
+        "repo's symbol index. Returns the list of (file_path, symbol_name) "
+        "pairs that match. Use this before submitting a binding — the "
+        "handler verifies symbols against the same index."
+    ),
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "candidates": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": (
+                    "Symbol-name hypotheses (e.g. ['process_order', "
+                    "'CheckoutRetryGuard.check_cap'])."
+                ),
+            }
+        },
+        "required": ["candidates"],
+    },
+}
+
+
+SUBMIT_BINDING_TOOL: dict[str, Any] = {
+    "name": "submit_binding",
+    "description": (
+        "Submit your final binding decision. Call this exactly once. "
+        "Either provide (file_path, symbol_name) for the bind, or set "
+        "abort=true with abort_reason if the evidence is too weak to bind "
+        "(per skills/bicameral-bind/SKILL.md 'abort on weak evidence')."
+    ),
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "abort": {
+                "type": "boolean",
+                "description": "True if aborting on weak evidence; false if submitting a real binding.",
+            },
+            "file_path": {
+                "type": "string",
+                "description": "Repo-relative file the symbol lives in. Required when abort=false.",
+            },
+            "symbol_name": {
+                "type": "string",
+                "description": "The symbol to bind to. Required when abort=false.",
+            },
+            "abort_reason": {
+                "type": "string",
+                "description": "One sentence why the evidence was too weak. Required when abort=true.",
+            },
+            "reasoning": {
+                "type": "string",
+                "description": "One sentence explaining the choice (or the abort).",
+            },
+        },
+        "required": ["abort", "reasoning"],
+    },
+}
+
+
+TOOLS = [READ_FILE_TOOL, VALIDATE_SYMBOLS_TOOL, SUBMIT_BINDING_TOOL]
+
+
+# ── Public result shape ─────────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class BindJudgment:
+    case_id: str
+    aborted: bool
+    bound_file: str | None
+    bound_symbol: str | None
+    abort_reason: str | None
+    reasoning: str
+    turns: int
+    tokens_in: int
+    tokens_out: int
+
+
+# ── Helpers ─────────────────────────────────────────────────────────────────
+
+
+def _sha(text: str) -> str:
+    return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+def _load_skill_md() -> str:
+    if not SKILL_MD_PATH.exists():
+        raise FileNotFoundError(f"SKILL.md not found at {SKILL_MD_PATH}")
+    return SKILL_MD_PATH.read_text(encoding="utf-8")
+
+
+def _scan_repo(repo_root: Path) -> dict[str, str]:
+    """Walk the fixture repo and return {repo_relative_path: content}."""
+    out: dict[str, str] = {}
+    for path in sorted(repo_root.rglob("*")):
+        if not path.is_file():
+            continue
+        if path.suffix not in {".py", ".ts", ".js", ".go", ".rs"}:
+            continue
+        rel = path.relative_to(repo_root).as_posix()
+        out[rel] = path.read_text(encoding="utf-8")
+    return out
+
+
+def _index_symbols(files: dict[str, str]) -> list[tuple[str, str]]:
+    """Build a flat (file, symbol) index from the fixture repo.
+
+    Lightweight — extracts top-level def/class names and one level of
+    methods. Good enough for the synthetic fixture; the production
+    bicameral symbol index is richer but for the eval we just need
+    deterministic 'does this symbol exist in this file' answers.
+    """
+    import re
+
+    out: list[tuple[str, str]] = []
+    py_def = re.compile(r"^(?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", re.MULTILINE)
+    py_class = re.compile(r"^class\s+([a-zA-Z_][a-zA-Z0-9_]*)", re.MULTILINE)
+    py_method = re.compile(r"^    (?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", re.MULTILINE)
+    ts_func = re.compile(
+        r"^(?:export\s+)?(?:async\s+)?function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)",
+        re.MULTILINE,
+    )
+    ts_class = re.compile(r"^(?:export\s+)?class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)", re.MULTILINE)
+
+    for file_path, content in files.items():
+        if file_path.endswith((".py",)):
+            for m in py_def.finditer(content):
+                out.append((file_path, m.group(1)))
+            for cls in py_class.finditer(content):
+                cls_name = cls.group(1)
+                out.append((file_path, cls_name))
+                # Methods inside this class — capture as Class.method
+                tail = content[cls.end() :]
+                next_cls = py_class.search(tail)
+                cls_body = tail[: next_cls.start()] if next_cls else tail
+                for meth in py_method.finditer(cls_body):
+                    out.append((file_path, f"{cls_name}.{meth.group(1)}"))
+        elif file_path.endswith((".ts", ".js")):
+            for m in ts_func.finditer(content):
+                out.append((file_path, m.group(1)))
+            for m in ts_class.finditer(content):
+                out.append((file_path, m.group(1)))
+    return out
+
+
+def _execute_tool(
+    tool_name: str,
+    tool_input: dict,
+    *,
+    repo_files: dict[str, str],
+    symbol_index: list[tuple[str, str]],
+) -> str:
+    """Deterministic tool dispatch — no real handler call, just fixture lookup.
+
+    The eval measures whether the LLM picks the right (file, symbol) given
+    the available evidence; we don't need the production bicameral handler
+    in the loop. Cleaner cache + faster CI.
+    """
+    if tool_name == "read_file":
+        path = tool_input.get("file_path", "")
+        if path in repo_files:
+            return repo_files[path]
+        return f"ERROR: file not found: {path!r} (try one of: {sorted(repo_files.keys())[:5]}…)"
+
+    if tool_name == "validate_symbols":
+        candidates = tool_input.get("candidates") or []
+        matches: list[dict] = []
+        for cand in candidates:
+            for fp, sym in symbol_index:
+                if cand == sym or cand == sym.split(".")[-1]:
+                    matches.append({"file_path": fp, "symbol_name": sym, "candidate": cand})
+        return json.dumps({"matches": matches}, indent=2)
+
+    return f"ERROR: unknown tool {tool_name!r}"
+
+
+def _cache_path(model: str, skill_sha: str, repo_sha: str, input_sha: str) -> Path:
+    key = f"{model}|{skill_sha}|{repo_sha}|{input_sha}"
+    return CACHE_DIR / f"{_sha(key)}.json"
+
+
+def _build_system_prompt(skill_md: str) -> str:
+    return f"""\
+You are a caller-LLM running the bicameral-bind skill against a synthetic
+fixture repo. Your job: read the decision text, identify the right
+(file, symbol) it should bind to, then call `submit_binding`.
+
+Apply the skill's contract verbatim — especially the mandatory pre-bind
+verification (Read at least one candidate file end-to-end, confirm via
+`validate_symbols`, abort on weak evidence). The handler-side rejection
+contract from #280 means your binding is verified against the same
+symbol index `validate_symbols` queries; submitting a hallucinated
+symbol will be rejected.
+
+You have at most {MAX_TURNS} turns to gather evidence and submit. Do
+not respond in plain text — every turn must be a tool_use block.
+
+──── Skill contract (verbatim from skills/bicameral-bind/SKILL.md) ────
+
+{skill_md}
+"""
+
+
+def _build_user_prompt(decision_description: str, repo_files: dict[str, str]) -> str:
+    file_list = "\n".join(f"  - {p}" for p in sorted(repo_files.keys()))
+    return f"""\
+Decision to bind:
+
+  {decision_description}
+
+Fixture repo files available (use `read_file` to view contents):
+
+{file_list}
+
+Identify the right (file, symbol) for this decision and submit via
+`submit_binding`. If after gathering evidence you can't point at a
+specific function or class body that implements the decision, submit
+with abort=true.
+"""
+
+
+def _call_messages_api(
+    *,
+    model: str,
+    system_prompt: str,
+    messages: list[dict],
+    api_key: str,
+) -> dict:
+    headers = {
+        "anthropic-version": ANTHROPIC_API_VERSION,
+        "content-type": "application/json",
+        "x-api-key": api_key,
+    }
+    payload: dict[str, Any] = {
+        "model": model,
+        "max_tokens": MAX_OUTPUT_TOKENS,
+        "temperature": 0,
+        "system": [
+            {
+                "type": "text",
+                "text": system_prompt,
+                "cache_control": {"type": "ephemeral"},
+            }
+        ],
+        "messages": messages,
+        "tools": TOOLS,
+    }
+    with httpx.Client(timeout=REQUEST_TIMEOUT_S) as client:
+        resp = client.post(ANTHROPIC_API_URL, headers=headers, json=payload)
+        if resp.status_code >= 400:
+            raise RuntimeError(f"Anthropic API error {resp.status_code}: {resp.text[:500]}")
+        return resp.json()
+
+
+# ── Public entrypoint ───────────────────────────────────────────────────────
+
+
+def run_bind_judgment(
+    *,
+    case_id: str,
+    decision_description: str,
+    repo_root: Path,
+    model: str | None = None,
+    api_key: str | None = None,
+    use_cache: bool = True,
+) -> BindJudgment:
+    """Drive the bicameral-bind skill against the fixture repo for one case.
+
+    Returns a `BindJudgment` capturing the LLM's outcome (binding or abort)
+    along with token + turn telemetry.
+
+    Caching: response is cached to ``tests/eval/fixtures/bind_judge/`` keyed
+    on (model, SKILL.md SHA, repo SHA, decision SHA). Set
+    ``BICAMERAL_GROUNDING_EVAL_RECORD=1`` to bypass cache and re-record.
+    """
+    chosen_model: str = model or os.getenv("BICAMERAL_GROUNDING_EVAL_MODEL") or DEFAULT_MODEL
+    skill_md = _load_skill_md()
+    skill_sha = _sha(skill_md)
+
+    repo_files = _scan_repo(repo_root)
+    symbol_index = _index_symbols(repo_files)
+
+    canonical_repo = json.dumps(repo_files, sort_keys=True, ensure_ascii=False)
+    repo_sha = _sha(canonical_repo)
+    input_sha = _sha(decision_description)
+
+    cache_file = _cache_path(chosen_model, skill_sha, repo_sha, input_sha)
+    force_record = os.getenv("BICAMERAL_GROUNDING_EVAL_RECORD", "").strip() in {"1", "true", "yes"}
+    if use_cache and not force_record and cache_file.exists():
+        cached: dict[str, Any] = json.loads(cache_file.read_text(encoding="utf-8"))
+        return BindJudgment(
+            case_id=case_id,
+            aborted=bool(cached["aborted"]),
+            bound_file=cached.get("bound_file"),
+            bound_symbol=cached.get("bound_symbol"),
+            abort_reason=cached.get("abort_reason"),
+            reasoning=str(cached.get("reasoning") or ""),
+            turns=int(cached.get("turns") or 0),
+            tokens_in=int(cached.get("tokens_in") or 0),
+            tokens_out=int(cached.get("tokens_out") or 0),
+        )
+
+    chosen_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
+    if not chosen_key.strip():
+        raise RuntimeError(
+            "ANTHROPIC_API_KEY missing and no cached fixture exists for "
+            f"(model={chosen_model}, case={case_id}, skill={skill_sha[:8]}, "
+            f"repo={repo_sha[:8]}, input={input_sha[:8]})."
+        )
+
+    system_prompt = _build_system_prompt(skill_md)
+    messages: list[dict] = [
+        {"role": "user", "content": _build_user_prompt(decision_description, repo_files)}
+    ]
+
+    tokens_in = 0
+    tokens_out = 0
+    bound_file: str | None = None
+    bound_symbol: str | None = None
+    aborted = False
+    abort_reason: str | None = None
+    reasoning = ""
+    turn = 0
+
+    for turn in range(1, MAX_TURNS + 1):  # noqa: B007 — `turn` is read after the loop for telemetry
+        data = _call_messages_api(
+            model=chosen_model,
+            system_prompt=system_prompt,
+            messages=messages,
+            api_key=chosen_key,
+        )
+        usage = data.get("usage") or {}
+        tokens_in += int(usage.get("input_tokens", 0))
+        tokens_out += int(usage.get("output_tokens", 0))
+
+        content = data.get("content") or []
+        # Append assistant turn to history regardless of tool/no-tool
+        messages.append({"role": "assistant", "content": content})
+
+        tool_uses = [b for b in content if b.get("type") == "tool_use"]
+        if not tool_uses:
+            # No tool call — agent gave up without submitting. Treat as abort.
+            aborted = True
+            abort_reason = "agent did not call any tool"
+            reasoning = next((b.get("text", "") for b in content if b.get("type") == "text"), "")
+            break
+
+        # Submit the tool results, but check first if any tool_use is submit_binding.
+        submit_call = next((tu for tu in tool_uses if tu.get("name") == "submit_binding"), None)
+        if submit_call is not None:
+            inp = submit_call.get("input") or {}
+            reasoning = str(inp.get("reasoning") or "")
+            if inp.get("abort"):
+                aborted = True
+                abort_reason = str(inp.get("abort_reason") or "")
+            else:
+                bound_file = str(inp.get("file_path") or "") or None
+                bound_symbol = str(inp.get("symbol_name") or "") or None
+            break
+
+        # Otherwise execute read_file / validate_symbols and continue the loop.
+        tool_results: list[dict] = []
+        for tu in tool_uses:
+            result = _execute_tool(
+                tu.get("name", ""),
+                tu.get("input") or {},
+                repo_files=repo_files,
+                symbol_index=symbol_index,
+            )
+            tool_results.append(
+                {
+                    "type": "tool_result",
+                    "tool_use_id": tu.get("id"),
+                    "content": result,
+                }
+            )
+        messages.append({"role": "user", "content": tool_results})
+    else:
+        # Loop exhausted without submit_binding.
+        aborted = True
+        abort_reason = f"hit MAX_TURNS={MAX_TURNS} without submitting a binding"
+
+    judgment_payload: dict[str, Any] = {
+        "aborted": aborted,
+        "bound_file": bound_file,
+        "bound_symbol": bound_symbol,
+        "abort_reason": abort_reason,
+        "reasoning": reasoning,
+        "turns": turn,
+        "tokens_in": tokens_in,
+        "tokens_out": tokens_out,
+    }
+
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    cache_file.write_text(
+        json.dumps(judgment_payload, indent=2, ensure_ascii=False), encoding="utf-8"
+    )
+    return BindJudgment(
+        case_id=case_id,
+        aborted=aborted,
+        bound_file=bound_file,
+        bound_symbol=bound_symbol,
+        abort_reason=abort_reason,
+        reasoning=reasoning,
+        turns=turn,
+        tokens_in=tokens_in,
+        tokens_out=tokens_out,
+    )
+
+
+def fixture_exists(
+    *, case_id: str, decision_description: str, repo_root: Path, model: str | None = None
+) -> bool:
+    """True if a cached fixture exists for these inputs (used to skip when
+    no API key + no cache)."""
+    chosen_model: str = model or os.getenv("BICAMERAL_GROUNDING_EVAL_MODEL") or DEFAULT_MODEL
+    skill_md = _load_skill_md()
+    skill_sha = _sha(skill_md)
+    repo_files = _scan_repo(repo_root)
+    canonical_repo = json.dumps(repo_files, sort_keys=True, ensure_ascii=False)
+    repo_sha = _sha(canonical_repo)
+    input_sha = _sha(decision_description)
+    return _cache_path(chosen_model, skill_sha, repo_sha, input_sha).exists()
diff --git a/tests/eval_grounding_recall.py b/tests/eval_grounding_recall.py
new file mode 100644
index 00000000..03f21d29
--- /dev/null
+++ b/tests/eval_grounding_recall.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""M2 grounding-recall eval — measures caller-LLM bind precision/recall (#280 PR-2).
+
+Drives the bicameral-bind skill against a synthetic fixture (≥ 21
+decisions across same-name-different-module / similar-intent /
+cross-language cases), captures what each judgment bound vs. ground
+truth, and emits precision / recall / abort-rate.
+
+Three measurement axes (the split matters for diagnosis):
+  - Precision = correct / (correct + wrong_symbol + wrong_file)
+                of the bindings the agent committed, what fraction were right
+  - Recall    = correct / total_rows
+                of the ground-truth bindings, how many the agent got right
+                (aborts and wrong bindings BOTH count against)
+  - Abort rate = aborted / total_rows
+                 first-class signal because the bind-skill makes 'abort on
+                 weak evidence' an explicit contract — high abort rate
+                 means the skill is too conservative
+
+Usage:
+    .venv/bin/python tests/eval_grounding_recall.py
+        --model claude-haiku-4-5-20251001
+        --gate-mode warn
+        -o test-results/m2-grounding-recall.json
+
+Flags:
+    --case-filter        only run cases of this type (debug)
+    --case-id            run a single case by id (debug)
+    --model              override BICAMERAL_GROUNDING_EVAL_MODEL
+    --min-recall         gate threshold (default 0.80, per #280)
+    --min-precision      gate threshold (default 0.85, per #280)
+    --max-abort-rate     gate threshold (default 0.30 — agent too cautious)
+    --gate-mode          'warn' (advisory, default) | 'hard' (exit non-zero on miss)
+    -o / --output        write JSON report
+    --verbose            print per-case rows
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# tests/ has no __init__.py; import siblings via dir on sys.path.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "tests" / "eval"))
+sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall"))
+
+from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment  # type: ignore[import-not-found]  # noqa: E402, I001
+from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type  # type: ignore[import-not-found]  # noqa: E402, I001
+
+FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo"
+
+
+# ── Outcome classification ──────────────────────────────────────────────────
+
+
+def _classify(case: GroundingCase, judgment: BindJudgment) -> str:
+    """Map (case, judgment) → outcome category for metrics."""
+    if judgment.aborted:
+        return "aborted"
+    if judgment.bound_file == case.intended_file and judgment.bound_symbol == case.intended_symbol:
+        return "correct"
+    if judgment.bound_file == case.intended_file:
+        return "wrong_symbol"
+    return "wrong_file"
+
+
+# ── Report shape ────────────────────────────────────────────────────────────
+
+
+def _per_case_row(case: GroundingCase, judgment: BindJudgment, outcome: str) -> dict[str, Any]:
+    return {
+        "case_id": case.case_id,
+        "case_type": case.case_type,
+        "intended_file": case.intended_file,
+        "intended_symbol": case.intended_symbol,
+        "bound_file": judgment.bound_file,
+        "bound_symbol": judgment.bound_symbol,
+        "outcome": outcome,
+        "aborted": judgment.aborted,
+        "abort_reason": judgment.abort_reason,
+        "reasoning": judgment.reasoning,
+        "turns": judgment.turns,
+        "tokens_in": judgment.tokens_in,
+        "tokens_out": judgment.tokens_out,
+    }
+
+
+def _aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    total = len(rows)
+    by_outcome: dict[str, int] = defaultdict(int)
+    by_type_total: dict[str, int] = defaultdict(int)
+    by_type_correct: dict[str, int] = defaultdict(int)
+    tokens_in = 0
+    tokens_out = 0
+    turns_sum = 0
+
+    for r in rows:
+        by_outcome[r["outcome"]] += 1
+        by_type_total[r["case_type"]] += 1
+        if r["outcome"] == "correct":
+            by_type_correct[r["case_type"]] += 1
+        tokens_in += r["tokens_in"]
+        tokens_out += r["tokens_out"]
+        turns_sum += r["turns"]
+
+    correct = by_outcome["correct"]
+    wrong_symbol = by_outcome["wrong_symbol"]
+    wrong_file = by_outcome["wrong_file"]
+    aborted = by_outcome["aborted"]
+
+    submitted = correct + wrong_symbol + wrong_file
+    precision = correct / submitted if submitted > 0 else 0.0
+    recall = correct / total if total > 0 else 0.0
+    abort_rate = aborted / total if total > 0 else 0.0
+
+    per_type = {
+        t: {
+            "total": by_type_total[t],
+            "correct": by_type_correct[t],
+            "recall": (by_type_correct[t] / by_type_total[t]) if by_type_total[t] else 0.0,
+        }
+        for t in sorted(by_type_total)
+    }
+
+    return {
+        "total_cases": total,
+        "outcomes": dict(by_outcome),
+        "precision": round(precision, 4),
+        "recall": round(recall, 4),
+        "abort_rate": round(abort_rate, 4),
+        "per_case_type": per_type,
+        "tokens_in_total": tokens_in,
+        "tokens_out_total": tokens_out,
+        "avg_turns": round(turns_sum / total, 2) if total else 0.0,
+    }
+
+
+# ── Runner ──────────────────────────────────────────────────────────────────
+
+
+async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]:
+    cases: list[GroundingCase] = list(ALL_CASES)
+    if args.case_filter:
+        cases = cases_by_type(args.case_filter)
+        if not cases:
+            print(f"no cases match --case-filter {args.case_filter!r}", file=sys.stderr)
+            return {}, 2
+    if args.case_id:
+        cases = [c for c in cases if c.case_id == args.case_id]
+        if not cases:
+            print(f"no case matches --case-id {args.case_id!r}", file=sys.stderr)
+            return {}, 2
+
+    rows: list[dict[str, Any]] = []
+    for case in cases:
+        if args.skip_missing_fixtures and not fixture_exists(
+            case_id=case.case_id,
+            decision_description=case.description,
+            repo_root=FIXTURE_REPO,
+            model=args.model,
+        ):
+            if args.verbose:
+                print(f"SKIP {case.case_id}: no cached fixture and --skip-missing-fixtures")
+            continue
+        try:
+            judgment = run_bind_judgment(
+                case_id=case.case_id,
+                decision_description=case.description,
+                repo_root=FIXTURE_REPO,
+                model=args.model,
+            )
+        except RuntimeError as exc:
+            print(f"ERROR on {case.case_id}: {exc}", file=sys.stderr)
+            if args.gate_mode == "hard":
+                return {}, 3
+            continue
+
+        outcome = _classify(case, judgment)
+        row = _per_case_row(case, judgment, outcome)
+        rows.append(row)
+        if args.verbose:
+            print(
+                f"  {case.case_id:<35}  {outcome:<14}  "
+                f"bound=({judgment.bound_file or '—'}::{judgment.bound_symbol or '—'})"
+            )
+
+    summary = _aggregate(rows)
+    summary["generator_version"] = GENERATOR_VERSION
+    summary["model"] = args.model or "default"
+    summary["gate_mode"] = args.gate_mode
+
+    # Gate enforcement
+    breaches: list[str] = []
+    if summary["recall"] < args.min_recall:
+        breaches.append(f"recall {summary['recall']:.3f} < {args.min_recall}")
+    if summary["precision"] < args.min_precision:
+        breaches.append(f"precision {summary['precision']:.3f} < {args.min_precision}")
+    if summary["abort_rate"] > args.max_abort_rate:
+        breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}")
+    summary["gate_breaches"] = breaches
+
+    report = {"summary": summary, "rows": rows}
+
+    if args.output:
+        out = Path(args.output)
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+        print(f"wrote {args.output}")
+
+    print()
+    print(f"M2 grounding-recall eval — {summary['total_cases']} cases")
+    print(f"  precision  : {summary['precision']:.3f}  (gate ≥ {args.min_precision})")
+    print(f"  recall     : {summary['recall']:.3f}  (gate ≥ {args.min_recall})")
+    print(f"  abort_rate : {summary['abort_rate']:.3f}  (gate ≤ {args.max_abort_rate})")
+    print(f"  avg_turns  : {summary['avg_turns']}")
+    print(f"  tokens     : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out")
+    if breaches:
+        print(f"  ⚠ gate breaches: {'; '.join(breaches)}")
+    else:
+        print("  ✓ all gates pass")
+
+    if breaches and args.gate_mode == "hard":
+        return report, 1
+    return report, 0
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__.split("\n")[0] if __doc__ else None)
+    p.add_argument(
+        "--case-filter", choices=("same_name_different_module", "similar_intent", "cross_language")
+    )
+    p.add_argument("--case-id", help="run a single case by id (debug)")
+    p.add_argument("--model", help="override BICAMERAL_GROUNDING_EVAL_MODEL")
+    p.add_argument("--min-recall", type=float, default=0.80)
+    p.add_argument("--min-precision", type=float, default=0.85)
+    p.add_argument("--max-abort-rate", type=float, default=0.30)
+    p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn")
+    p.add_argument(
+        "--skip-missing-fixtures",
+        action="store_true",
+        help="skip cases without a cached fixture (no API call) — useful for offline runs",
+    )
+    p.add_argument("-o", "--output", help="write JSON report to this path")
+    p.add_argument("--verbose", action="store_true")
+    args = p.parse_args()
+
+    _, exit_code = asyncio.run(run(args))
+    return exit_code
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/fixtures/grounding_recall/dataset.py b/tests/fixtures/grounding_recall/dataset.py
new file mode 100644
index 00000000..6372ee30
--- /dev/null
+++ b/tests/fixtures/grounding_recall/dataset.py
@@ -0,0 +1,398 @@
+"""Synthetic grounding-recall dataset for the M2 caller-LLM eval (#280).
+
+Each row is a `GroundingCase` describing a decision text plus the
+ground-truth (file, symbol) it should bind to. The fixture repo at
+``tests/fixtures/grounding_recall/repo/`` contains the intended symbols
+plus deliberate distractors; the LLM is given access to the repo and
+must pick the right symbol to clear the precision gate.
+
+`GENERATOR_VERSION` invalidates the eval cache when bumped — change the
+value if a row is added/edited/removed so the next CI run re-records.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+GENERATOR_VERSION = "1"
+
+
+@dataclass(frozen=True)
+class GroundingCase:
+    """One row in the M2 grounding-recall fixture."""
+
+    case_id: str
+    case_type: str  # "same_name_different_module" | "similar_intent" | "cross_language"
+    description: str  # the decision text the LLM reads
+    intended_file: str  # ground-truth file (relative to repo root)
+    intended_symbol: str  # ground-truth symbol name
+    distractors: tuple[tuple[str, str], ...] = field(default_factory=tuple)
+    # ↑ list of (file, symbol) plausible-but-wrong candidates that exist
+    #   in the fixture repo. Used by the runner to verify the wrong-bind
+    #   path actually has somewhere to land (i.e. the LLM could plausibly
+    #   pick a distractor — measures real precision, not "no other option").
+
+
+# ─── Case A: same-name-different-module ─────────────────────────────────────
+
+CASES_A: list[GroundingCase] = [
+    GroundingCase(
+        case_id="A1_process_order_checkout",
+        case_type="same_name_different_module",
+        description=(
+            "Customer checkout flow caps payment retries at 3 per Stripe contract — "
+            "after 3 declines the user sees a hard error and the cart unlocks."
+        ),
+        intended_file="src/checkout/orders.py",
+        intended_symbol="process_order",
+        distractors=(
+            ("src/admin/orders.py", "process_order"),
+            ("src/billing/refunds.py", "process_order"),
+        ),
+    ),
+    GroundingCase(
+        case_id="A2_process_order_admin",
+        case_type="same_name_different_module",
+        description=(
+            "Support team manually replays finance-flagged orders through the "
+            "refund pipeline — runs under elevated permissions, never touches "
+            "the customer payment-auth path."
+        ),
+        intended_file="src/admin/orders.py",
+        intended_symbol="process_order",
+        distractors=(
+            ("src/checkout/orders.py", "process_order"),
+            ("src/billing/refunds.py", "process_order"),
+        ),
+    ),
+    GroundingCase(
+        case_id="A3_process_order_billing",
+        case_type="same_name_different_module",
+        description=(
+            "Bulk-refund batch job + chargeback webhook handler push refund "
+            "requests through the billing pipeline (credit-to-source + "
+            "accounting reconciliation). Distinct from manual admin replay "
+            "and from customer checkout."
+        ),
+        intended_file="src/billing/refunds.py",
+        intended_symbol="process_order",
+        distractors=(
+            ("src/checkout/orders.py", "process_order"),
+            ("src/admin/orders.py", "process_order"),
+        ),
+    ),
+    GroundingCase(
+        case_id="A4_cancel_order_checkout",
+        case_type="same_name_different_module",
+        description=(
+            "User-initiated order cancellation from the storefront — refunds "
+            "within the 24-hour cancellation window, otherwise blocks with a "
+            "clear error to the customer."
+        ),
+        intended_file="src/checkout/orders.py",
+        intended_symbol="cancel_order",
+        distractors=(("src/billing/refunds.py", "cancel_order"),),
+    ),
+    GroundingCase(
+        case_id="A5_cancel_order_billing",
+        case_type="same_name_different_module",
+        description=(
+            "Stripe chargeback webhook + failed-renewal subscription job both "
+            "trigger billing-side cancellation — bypasses the user-facing "
+            "24-hour window because the trigger is external, records a "
+            "billing cancellation event, refunds to source of funds."
+        ),
+        intended_file="src/billing/refunds.py",
+        intended_symbol="cancel_order",
+        distractors=(("src/checkout/orders.py", "cancel_order"),),
+    ),
+]
+
+
+# ─── Case B: similar-intent-different-symbol ────────────────────────────────
+
+CASES_B: list[GroundingCase] = [
+    GroundingCase(
+        case_id="B1_tenant_rate_limit",
+        case_type="similar_intent",
+        description=(
+            "Enterprise contract clause: each tenant gets 1000 req/min on "
+            "checkout endpoints. Identity is the tenant_id from auth claims, "
+            "scope is the checkout-endpoint family. Implements the tenant-"
+            "scoped SLA, NOT the cluster-wide protective ceiling."
+        ),
+        intended_file="src/middleware/tenant_rate_limit.py",
+        intended_symbol="TenantCheckoutRateLimiter.check",
+        distractors=(
+            ("src/middleware/global_rate_limit.py", "GlobalRateLimiter.check"),
+            ("src/checkout/throttle.py", "throttle_checkout"),
+        ),
+    ),
+    GroundingCase(
+        case_id="B2_retry_cap",
+        case_type="similar_intent",
+        description=(
+            "Stripe contract clause: never retry a declined authorization "
+            "more than 3 times within 24 hours. Enforced before each retry "
+            "by raising MaxRetriesExceeded when the attempt counter hits 3. "
+            "Distinct from the backoff-delay configuration."
+        ),
+        intended_file="src/checkout/retry.py",
+        intended_symbol="CheckoutRetryGuard.check_cap",
+        distractors=(
+            ("src/checkout/retry.py", "StripeRetryPolicy.delay_for"),
+            ("src/checkout/orders.py", "process_order"),
+        ),
+    ),
+    GroundingCase(
+        case_id="B3_throttle_double_submit",
+        case_type="similar_intent",
+        description=(
+            "Prevent double-submit on the 'pay now' button by adding a "
+            "30-second cooldown per session. This is session-level cooldown, "
+            "NOT per-tenant rate limit — the trigger is the user's recent "
+            "click history, not the tenant's request volume."
+        ),
+        intended_file="src/checkout/throttle.py",
+        intended_symbol="throttle_checkout",
+        distractors=(
+            ("src/middleware/tenant_rate_limit.py", "TenantCheckoutRateLimiter.check"),
+            ("src/middleware/global_rate_limit.py", "GlobalRateLimiter.check"),
+        ),
+    ),
+    GroundingCase(
+        case_id="B4_validate_jwt",
+        case_type="similar_intent",
+        description=(
+            "API gateway middleware validates JWT signature, expiry, and "
+            "audience on every inbound request. Stateless bearer credentials — "
+            "the gateway does NOT consult any session store. Used on the "
+            "API surface, not the web UI."
+        ),
+        intended_file="src/auth/tokens.py",
+        intended_symbol="validate_token",
+        distractors=(("src/auth/session.py", "validate_session"),),
+    ),
+    GroundingCase(
+        case_id="B5_validate_session",
+        case_type="similar_intent",
+        description=(
+            "Web UI middleware validates the server-side session cookie "
+            "before serving authenticated pages. Sessions support invalidation "
+            "(unlike JWTs); a request carries a session cookie OR a JWT, "
+            "never both, and this function handles the cookie path."
+        ),
+        intended_file="src/auth/session.py",
+        intended_symbol="validate_session",
+        distractors=(("src/auth/tokens.py", "validate_token"),),
+    ),
+    GroundingCase(
+        case_id="B6_refresh_jwt",
+        case_type="similar_intent",
+        description=(
+            "Exchange a long-lived refresh token for a fresh access token. "
+            "Refresh tokens are JWTs marked with type='refresh' and have "
+            "their own expiry separate from access tokens. Mints a new "
+            "access JWT for the same subject."
+        ),
+        intended_file="src/auth/tokens.py",
+        intended_symbol="refresh_token",
+        distractors=(("src/auth/session.py", "refresh_session"),),
+    ),
+    GroundingCase(
+        case_id="B7_refresh_session",
+        case_type="similar_intent",
+        description=(
+            "Extend the server-side session's expiry on every authenticated "
+            "UI page-load — slides the idle-timeout window forward, but caps "
+            "at the absolute 24-hour creation timeout regardless of activity. "
+            "Server-side state, not a token mint."
+        ),
+        intended_file="src/auth/session.py",
+        intended_symbol="refresh_session",
+        distractors=(("src/auth/tokens.py", "refresh_token"),),
+    ),
+    GroundingCase(
+        case_id="B8_verify_webhook_python",
+        case_type="similar_intent",
+        description=(
+            "Verify HMAC-SHA256 signature on inbound webhook payloads before "
+            "any processing. Computed over the raw body using the per-source "
+            "shared secret. Constant-time comparison via hmac.compare_digest. "
+            "Implemented in the Python webhook ingress, not the auth path."
+        ),
+        intended_file="src/webhooks/verify.py",
+        intended_symbol="verify_webhook_signature",
+        distractors=(
+            ("src/auth/tokens.py", "validate_token"),
+            ("src/auth/session.py", "validate_session"),
+        ),
+    ),
+    GroundingCase(
+        case_id="B9_request_metrics",
+        case_type="similar_intent",
+        description=(
+            "API gateway emits per-request metrics — latency histogram, "
+            "status counter, route + tenant tags — after the response is "
+            "generated. One metric emission per inbound HTTP request, "
+            "tagged with tenant_id for SLA dashboards."
+        ),
+        intended_file="src/metrics/collect.py",
+        intended_symbol="collect_request_metrics",
+        distractors=(("src/metrics/collect.py", "collect_handler_metrics"),),
+    ),
+    GroundingCase(
+        case_id="B10_handler_metrics",
+        case_type="similar_intent",
+        description=(
+            "Each handler invocation emits its own latency + outcome metrics — "
+            "finer grain than per-request, since one inbound request fans out "
+            "to many handler invocations through the middleware chain. Used "
+            "to attribute regressions to specific handlers."
+        ),
+        intended_file="src/metrics/collect.py",
+        intended_symbol="collect_handler_metrics",
+        distractors=(("src/metrics/collect.py", "collect_request_metrics"),),
+    ),
+]
+
+
+# ─── Case C: cross-language ─────────────────────────────────────────────────
+
+CASES_C: list[GroundingCase] = [
+    GroundingCase(
+        case_id="C1_verify_python",
+        case_type="cross_language",
+        description=(
+            "Python API service verifies HMAC-SHA256 signatures on inbound "
+            "webhook requests using the per-source shared secret with "
+            "constant-time comparison. Runtime: CPython on the API workers."
+        ),
+        intended_file="src/webhooks/verify.py",
+        intended_symbol="verify_webhook_signature",
+        distractors=(("src/webhooks/verify.ts", "verifyWebhookSignature"),),
+    ),
+    GroundingCase(
+        case_id="C2_verify_typescript",
+        case_type="cross_language",
+        description=(
+            "TypeScript edge-worker verifies HMAC-SHA256 webhook signatures "
+            "before forwarding to the origin. Same security contract as the "
+            "Python API path, but runs at the CDN edge so verification "
+            "happens before the request hits the origin."
+        ),
+        intended_file="src/webhooks/verify.ts",
+        intended_symbol="verifyWebhookSignature",
+        distractors=(("src/webhooks/verify.py", "verify_webhook_signature"),),
+    ),
+    GroundingCase(
+        case_id="C3_dispatch_python",
+        case_type="cross_language",
+        description=(
+            "Python webhook ingress routes verified events to the subscriber "
+            "handler chain — after signature verification, fans out to each "
+            "handler with per-handler retry. Errors in one handler do not "
+            "abort the rest. Runs in the Python ingestion path."
+        ),
+        intended_file="src/webhooks/dispatch.py",
+        intended_symbol="dispatch_event",
+        distractors=(("src/webhooks/dispatch.ts", "dispatchEvent"),),
+    ),
+    GroundingCase(
+        case_id="C4_dispatch_typescript",
+        case_type="cross_language",
+        description=(
+            "TypeScript runtime routes verified webhook events to subscriber "
+            "handlers, fanning out with per-handler retry. Same routing "
+            "contract as the Python sibling, but runs in the TS ingress (e.g. "
+            "the edge worker after sig-verify)."
+        ),
+        intended_file="src/webhooks/dispatch.ts",
+        intended_symbol="dispatchEvent",
+        distractors=(("src/webhooks/dispatch.py", "dispatch_event"),),
+    ),
+    GroundingCase(
+        case_id="C5_enqueue_python",
+        case_type="cross_language",
+        description=(
+            "Python webhook handler queues events for asynchronous dispatch "
+            "when the inbound request must respond inside Stripe's 5-second "
+            "window — the queue lives in-process, drained by a background "
+            "worker. Used only on the Python path."
+        ),
+        intended_file="src/webhooks/dispatch.py",
+        intended_symbol="enqueue_dispatch",
+        distractors=(("src/webhooks/dispatch.ts", "enqueueDispatch"),),
+    ),
+    GroundingCase(
+        case_id="C6_enqueue_typescript",
+        case_type="cross_language",
+        description=(
+            "TypeScript edge worker queues webhook events for async dispatch "
+            "to satisfy Stripe's 5-second response budget. Same async-queue "
+            "contract as the Python sibling, runs in the TS runtime where "
+            "edge workers must respond fast."
+        ),
+        intended_file="src/webhooks/dispatch.ts",
+        intended_symbol="enqueueDispatch",
+        distractors=(("src/webhooks/dispatch.py", "enqueue_dispatch"),),
+    ),
+    GroundingCase(
+        case_id="C7_metrics_python",
+        case_type="cross_language",
+        description=(
+            "Python API gateway emits per-request latency and status metrics "
+            "tagged with tenant_id. Runs in-process on the Python workers, "
+            "fires after the response is generated by the gateway middleware."
+        ),
+        intended_file="src/metrics/collect.py",
+        intended_symbol="collect_request_metrics",
+        distractors=(("src/metrics/collect.ts", "collectRequestMetrics"),),
+    ),
+    GroundingCase(
+        case_id="C8_metrics_typescript",
+        case_type="cross_language",
+        description=(
+            "TypeScript edge proxy emits per-request latency and status "
+            "metrics tagged with tenant_id. Runs in the TS edge runtime, "
+            "fires after each proxied response. Same metrics contract as "
+            "the Python gateway, different runtime."
+        ),
+        intended_file="src/metrics/collect.ts",
+        intended_symbol="collectRequestMetrics",
+        distractors=(("src/metrics/collect.py", "collect_request_metrics"),),
+    ),
+]
+
+
+ALL_CASES: list[GroundingCase] = CASES_A + CASES_B + CASES_C
+
+
+def cases_by_type(case_type: str) -> list[GroundingCase]:
+    return [c for c in ALL_CASES if c.case_type == case_type]
+
+
+def case_by_id(case_id: str) -> GroundingCase:
+    for c in ALL_CASES:
+        if c.case_id == case_id:
+            return c
+    raise KeyError(f"unknown case_id: {case_id}")
+
+
+# Sanity check at import time — fail loud if the dataset shape regresses.
+def _validate_dataset() -> None:
+    seen_ids: set[str] = set()
+    for c in ALL_CASES:
+        if c.case_id in seen_ids:
+            raise AssertionError(f"duplicate case_id: {c.case_id}")
+        seen_ids.add(c.case_id)
+        if c.case_type not in ("same_name_different_module", "similar_intent", "cross_language"):
+            raise AssertionError(f"{c.case_id}: invalid case_type {c.case_type!r}")
+        if not c.intended_file or not c.intended_symbol:
+            raise AssertionError(f"{c.case_id}: intended_file/symbol must be non-empty")
+        for df, ds in c.distractors:
+            if (df, ds) == (c.intended_file, c.intended_symbol):
+                raise AssertionError(f"{c.case_id}: distractor matches intended")
+
+
+_validate_dataset()
diff --git a/tests/fixtures/grounding_recall/repo/src/admin/orders.py b/tests/fixtures/grounding_recall/repo/src/admin/orders.py
new file mode 100644
index 00000000..a2ee40c7
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/admin/orders.py
@@ -0,0 +1,39 @@
+"""Admin tools for finance/support — refunds, manual review, reporting.
+
+Runs under elevated permissions; never in the customer checkout path.
+"""
+
+
+def process_order(order):
+    """Admin replay of a finance-flagged order — refunds only.
+
+    Used by the support team to manually replay a flagged order through
+    the refund pipeline. Does NOT exercise the customer checkout path
+    (no payment auth, no inventory holds). Distinct from
+    checkout/orders.py:process_order which is customer-facing.
+    """
+    if not order.flagged_for_refund:
+        raise NotEligibleForAdminReplayError(order.id)
+    return _replay_refund(order)
+
+
+def report_orders(start, end):
+    """Generate the daily ops report for finance — flagged orders only."""
+    rows = _load_flagged_in_range(start, end)
+    return _format_report(rows)
+
+
+def _replay_refund(order):
+    raise NotImplementedError
+
+
+def _load_flagged_in_range(start, end):
+    raise NotImplementedError
+
+
+def _format_report(rows):
+    raise NotImplementedError
+
+
+class NotEligibleForAdminReplayError(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/auth/session.py b/tests/fixtures/grounding_recall/repo/src/auth/session.py
new file mode 100644
index 00000000..bf7a7723
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/auth/session.py
@@ -0,0 +1,70 @@
+"""Server-side session lifecycle — distinct from JWT tokens.
+
+Sessions are stored server-side and identified by an opaque cookie;
+JWTs (tokens.py) are stateless bearer credentials. Don't confuse the
+two — sessions support invalidation, JWTs don't.
+"""
+
+
+def validate_session(session_id):
+    """Look up a session by id and confirm it's not expired or revoked.
+
+    Returns the session dict on success; raises SessionExpired or
+    SessionNotFound otherwise. Used by the web UI middleware (NOT the
+    API gateway, which uses validate_token). Mutually exclusive with
+    JWT validation — a request carries one or the other.
+    """
+    session = _load_session(session_id)
+    if session is None:
+        raise SessionNotFound(session_id)
+    if session.expires_at < _now():
+        raise SessionExpired(session_id)
+    return session
+
+
+def refresh_session(session_id):
+    """Extend the session's expiry by another idle-timeout window.
+
+    Called on every authenticated UI page-load. Caps at the absolute
+    timeout (24h from creation) regardless of activity.
+    """
+    session = validate_session(session_id)
+    session.expires_at = min(_now() + _idle_timeout(), session.created_at + _absolute_timeout())
+    _store_session(session)
+
+
+def revoke_session(session_id):
+    """Logout — marks the session as revoked, future loads will fail."""
+    _delete_session(session_id)
+
+
+def _load_session(session_id):
+    raise NotImplementedError
+
+
+def _store_session(session):
+    raise NotImplementedError
+
+
+def _delete_session(session_id):
+    raise NotImplementedError
+
+
+def _now():
+    raise NotImplementedError
+
+
+def _idle_timeout():
+    return 1800  # 30 min
+
+
+def _absolute_timeout():
+    return 86400  # 24 h
+
+
+class SessionNotFound(Exception):
+    pass
+
+
+class SessionExpired(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/auth/tokens.py b/tests/fixtures/grounding_recall/repo/src/auth/tokens.py
new file mode 100644
index 00000000..b45f7fe1
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/auth/tokens.py
@@ -0,0 +1,62 @@
+"""JWT token lifecycle — validation, refresh, revocation."""
+
+
+def validate_token(token):
+    """Validate a JWT's signature, expiry, and audience.
+
+    Returns the parsed claims dict if valid; raises TokenInvalid on
+    signature mismatch, TokenExpired on stale exp, TokenWrongAudience
+    on aud mismatch. Used by the API gateway middleware on every
+    inbound request.
+    """
+    claims = _decode_jwt(token)
+    if claims["exp"] < _now():
+        raise TokenExpired
+    if claims["aud"] != _expected_audience():
+        raise TokenWrongAudience
+    return claims
+
+
+def refresh_token(refresh_token):
+    """Exchange a long-lived refresh token for a fresh access token."""
+    claims = _decode_jwt(refresh_token)
+    if claims.get("type") != "refresh":
+        raise TokenInvalid
+    return _mint_access_token(claims["sub"])
+
+
+def revoke_token(token):
+    """Add a token to the revocation list — checked on every validate."""
+    claims = _decode_jwt(token)
+    _revocation_store.add(claims["jti"], claims["exp"])
+
+
+def _decode_jwt(token):
+    raise NotImplementedError
+
+
+def _now():
+    raise NotImplementedError
+
+
+def _expected_audience():
+    return "bicameral-api"
+
+
+def _mint_access_token(subject):
+    raise NotImplementedError
+
+
+_revocation_store = None
+
+
+class TokenInvalid(Exception):
+    pass
+
+
+class TokenExpired(Exception):
+    pass
+
+
+class TokenWrongAudience(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/billing/refunds.py b/tests/fixtures/grounding_recall/repo/src/billing/refunds.py
new file mode 100644
index 00000000..c8f1eefa
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/billing/refunds.py
@@ -0,0 +1,53 @@
+"""Billing-side refund processing — distinct from admin replay.
+
+Refunds initiated through the billing system flow through this module:
+batch processing, source-of-funds tracking, accounting reconciliation.
+Admin tools (admin/orders.py:process_order) replay individual flagged
+orders manually; this is the bulk path for normal refund volume.
+"""
+
+
+def cancel_order(order_id, reason):
+    """Refund-driven cancellation — used when the billing system
+    initiates the cancel (e.g. failed renewal, chargeback).
+
+    Distinct from checkout/orders.py:cancel_order which is user-initiated
+    and respects the 24h window. Billing cancellations bypass the window
+    because they originate from external triggers (Stripe chargeback
+    webhooks, subscription failures).
+    """
+    order = _load(order_id)
+    _record_billing_cancellation(order, reason)
+    return _refund_to_source_of_funds(order)
+
+
+def process_order(refund_request):
+    """Run a refund through the billing pipeline — credit + accounting.
+
+    Used by the bulk-refund batch job and by the chargeback webhook
+    handler. Distinct from admin/orders.py:process_order (manual replay
+    of finance-flagged orders) and checkout/orders.py:process_order
+    (customer payment auth + fulfillment).
+    """
+    _credit_to_source(refund_request.amount, refund_request.source)
+    _record_accounting_entry(refund_request)
+
+
+def _load(order_id):
+    raise NotImplementedError
+
+
+def _record_billing_cancellation(order, reason):
+    raise NotImplementedError
+
+
+def _refund_to_source_of_funds(order):
+    raise NotImplementedError
+
+
+def _credit_to_source(amount, source):
+    raise NotImplementedError
+
+
+def _record_accounting_entry(refund):
+    raise NotImplementedError
diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/orders.py b/tests/fixtures/grounding_recall/repo/src/checkout/orders.py
new file mode 100644
index 00000000..acac03c0
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/checkout/orders.py
@@ -0,0 +1,50 @@
+"""Customer-facing checkout flow.
+
+Stripe contract: max 3 retries per payment intent, then hard error.
+"""
+
+
+def process_order(order):
+    """Run a checkout order through payment auth + fulfillment.
+
+    Caps payment retries at 3 per Stripe contract — after 3 declines the
+    user sees a hard error and the cart unlocks. This is the customer
+    checkout path, distinct from admin/orders.py which handles refunds.
+    """
+    for _attempt in range(3):
+        result = _attempt_payment(order)
+        if result.success:
+            return _fulfill(order)
+    raise PaymentDeclinedError(order.id)
+
+
+def cancel_order(order_id):
+    """User-initiated cancellation — refunds within 24h, otherwise blocks."""
+    order = _load(order_id)
+    if order.age_hours > 24:
+        raise CancellationWindowClosedError(order_id)
+    return _refund_to_source(order)
+
+
+def _attempt_payment(order):
+    raise NotImplementedError
+
+
+def _fulfill(order):
+    raise NotImplementedError
+
+
+def _load(order_id):
+    raise NotImplementedError
+
+
+def _refund_to_source(order):
+    raise NotImplementedError
+
+
+class PaymentDeclinedError(Exception):
+    pass
+
+
+class CancellationWindowClosedError(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/retry.py b/tests/fixtures/grounding_recall/repo/src/checkout/retry.py
new file mode 100644
index 00000000..c4b7dfec
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/checkout/retry.py
@@ -0,0 +1,28 @@
+"""Checkout retry semantics — caps the number of payment-auth attempts."""
+
+
+class CheckoutRetryGuard:
+    """Enforces the per-checkout retry ceiling required by Stripe contract."""
+
+    MAX_ATTEMPTS = 3
+
+    def check_cap(self, attempt_count):
+        """Raise MaxRetriesExceeded if attempt_count >= 3.
+
+        Called from checkout/orders.py:process_order before each retry.
+        Implements the contract clause: "merchants must not retry a
+        declined authorization more than 3 times within 24 hours."
+        """
+        if attempt_count >= self.MAX_ATTEMPTS:
+            raise MaxRetriesExceeded(attempt_count)
+
+
+class StripeRetryPolicy:
+    """Configures backoff between retries — exponential w/ 2s base."""
+
+    def delay_for(self, attempt):
+        return 2**attempt
+
+
+class MaxRetriesExceeded(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py b/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py
new file mode 100644
index 00000000..2f3c46a7
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py
@@ -0,0 +1,40 @@
+"""Per-session checkout throttling — distinct from rate limiting.
+
+Throttle = "cooldown after a triggering action" (e.g. wait 30s after
+pressing 'pay now' before allowing it again). Rate limit = "max N
+requests per window per identity."
+"""
+
+
+def throttle_checkout(session_id, action):
+    """Apply session-level cooldown to a checkout action.
+
+    Reads the session's last-action timestamp and rejects with
+    ThrottledError if the cooldown window hasn't elapsed. Does NOT
+    enforce per-tenant or per-IP rate limits — that's the middleware
+    layer's job.
+    """
+    last_ts = _last_action_timestamp(session_id, action)
+    if last_ts and _seconds_since(last_ts) < _cooldown_for(action):
+        raise ThrottledError(session_id, action)
+    _record_action(session_id, action)
+
+
+def _last_action_timestamp(session_id, action):
+    raise NotImplementedError
+
+
+def _seconds_since(ts):
+    raise NotImplementedError
+
+
+def _cooldown_for(action):
+    return {"pay_now": 30, "apply_coupon": 5}.get(action, 0)
+
+
+def _record_action(session_id, action):
+    raise NotImplementedError
+
+
+class ThrottledError(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/metrics/collect.py b/tests/fixtures/grounding_recall/repo/src/metrics/collect.py
new file mode 100644
index 00000000..01cb9de6
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/metrics/collect.py
@@ -0,0 +1,36 @@
+"""Request and handler metric collection (Python runtime).
+
+Emits to the metrics backend on every inbound request and every
+handler invocation. Sibling of collect.ts for the TypeScript runtime.
+"""
+
+
+def collect_request_metrics(request, response, latency_ms):
+    """Emit per-request metrics: latency, status, route, tenant.
+
+    Fired by the API gateway middleware on every inbound request, after
+    the response is generated. Tagged with tenant_id for SLA tracking.
+    Distinct from collect_handler_metrics which is per-handler-invocation
+    granularity (one request can fan out to many handlers).
+    """
+    _emit_counter("requests_total", tags={"route": request.path, "status": response.status})
+    _emit_histogram("request_latency_ms", latency_ms, tags={"tenant": request.auth.tenant_id})
+
+
+def collect_handler_metrics(handler_name, outcome, duration_ms):
+    """Emit per-handler-invocation metrics — finer grain than request-level.
+
+    One request → many handler invocations (middleware chain + business
+    logic). This captures each handler's own latency + outcome so we can
+    spot which handler is responsible for a regression.
+    """
+    _emit_counter("handler_invocations_total", tags={"handler": handler_name, "outcome": outcome})
+    _emit_histogram("handler_duration_ms", duration_ms, tags={"handler": handler_name})
+
+
+def _emit_counter(name, tags):
+    raise NotImplementedError
+
+
+def _emit_histogram(name, value, tags):
+    raise NotImplementedError
diff --git a/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts b/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts
new file mode 100644
index 00000000..5a4fa5cf
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts
@@ -0,0 +1,36 @@
+// Request metric collection (TypeScript runtime).
+// Sibling of collect.py — same contract, different runtime.
+
+interface Request {
+  path: string;
+  auth: { tenantId: string };
+}
+
+interface Response {
+  status: number;
+}
+
+export function collectRequestMetrics(
+  request: Request,
+  response: Response,
+  latencyMs: number,
+): void {
+  // Emit per-request metrics: latency, status, route, tenant.
+  // Fired by the API gateway middleware on every inbound request,
+  // after the response is generated. TS sibling of
+  // collect.py:collect_request_metrics.
+  emitCounter("requests_total", { route: request.path, status: String(response.status) });
+  emitHistogram("request_latency_ms", latencyMs, { tenant: request.auth.tenantId });
+}
+
+function emitCounter(_name: string, _tags: Record<string, string>): void {
+  throw new Error("not implemented");
+}
+
+function emitHistogram(
+  _name: string,
+  _value: number,
+  _tags: Record<string, string>,
+): void {
+  throw new Error("not implemented");
+}
diff --git a/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py b/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py
new file mode 100644
index 00000000..9349c1a0
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py
@@ -0,0 +1,30 @@
+"""Service-wide rate limiting — protects against abuse, not SLA.
+
+Distinct from tenant_rate_limit.py which enforces the Enterprise
+contract. This caps total req/sec across the whole service to keep
+the cluster within capacity headroom.
+"""
+
+
+class GlobalRateLimiter:
+    """Service-wide ceiling — 50_000 req/sec total across all tenants."""
+
+    GLOBAL_LIMIT_PER_SEC = 50_000
+
+    def __init__(self, store):
+        self._store = store
+
+    def check(self, request):
+        """Cluster-level cap — not tied to any contract clause.
+
+        Trips long before any tenant could; mostly defensive against
+        runaway crawlers and mis-tuned client retry storms. Tenant-scoped
+        SLA enforcement lives in tenant_rate_limit.py:TenantCheckoutRateLimiter.
+        """
+        if self._store.count_in_window("__global__", window_seconds=1) >= self.GLOBAL_LIMIT_PER_SEC:
+            raise GlobalRateLimitExceeded
+        self._store.increment("__global__")
+
+
+class GlobalRateLimitExceeded(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py b/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py
new file mode 100644
index 00000000..7ba6b5f1
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py
@@ -0,0 +1,34 @@
+"""Per-tenant rate limiting for checkout endpoints.
+
+Enterprise SLA: 1000 req/min per tenant. Keyed on tenant_id resolved
+from the request's auth claims. Distinct from global_rate_limit.py
+which protects the whole service.
+"""
+
+
+class TenantCheckoutRateLimiter:
+    """Token bucket rate limiter scoped to (tenant_id, checkout_endpoint)."""
+
+    LIMIT_PER_MIN = 1000
+
+    def __init__(self, store):
+        self._store = store
+
+    def check(self, request):
+        """Return None if under cap; raise RateLimitExceeded if over.
+
+        Implements the Enterprise SLA commitment: 1000 req/min per tenant
+        per checkout endpoint. Tenant id is read from request.auth.tenant_id.
+        Used by middleware/checkout_pipeline.py before reaching the
+        order handler.
+        """
+        tenant_id = request.auth.tenant_id
+        endpoint = request.path
+        key = f"{tenant_id}:{endpoint}"
+        if self._store.count_in_window(key, window_seconds=60) >= self.LIMIT_PER_MIN:
+            raise RateLimitExceeded(key)
+        self._store.increment(key)
+
+
+class RateLimitExceeded(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py
new file mode 100644
index 00000000..c3619394
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py
@@ -0,0 +1,38 @@
+"""Inbound webhook event router (Python runtime).
+
+After signature verification, dispatch the event to the right
+subscriber chain. Sibling of dispatch.ts for the TypeScript runtime.
+"""
+
+
+def dispatch_event(event):
+    """Route a verified webhook event to its subscriber handlers.
+
+    Looks up the event type → handler list mapping, fans out to each
+    handler with a per-handler retry policy. Errors in one handler do
+    not abort the rest. Used by the webhook ingress after
+    verify_webhook_signature passes.
+    """
+    handlers = _subscribers_for(event.type)
+    for handler in handlers:
+        try:
+            handler(event)
+        except Exception as exc:
+            _record_handler_failure(event, handler, exc)
+
+
+def enqueue_dispatch(event):
+    """Queue an event for asynchronous dispatch — used when the
+    inbound request must respond immediately (Stripe 5s window)."""
+    _queue.put(event)
+
+
+def _subscribers_for(event_type):
+    raise NotImplementedError
+
+
+def _record_handler_failure(event, handler, exc):
+    raise NotImplementedError
+
+
+_queue = None
diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts
new file mode 100644
index 00000000..ea03812e
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts
@@ -0,0 +1,44 @@
+// Inbound webhook event router (TypeScript runtime).
+// Sibling of dispatch.py — same routing contract, different runtime.
+
+export interface WebhookEvent {
+  type: string;
+  payload: unknown;
+}
+
+type Handler = (event: WebhookEvent) => Promise<void> | void;
+
+export async function dispatchEvent(event: WebhookEvent): Promise<void> {
+  // Route a verified webhook event to its subscriber handlers.
+  // Looks up the event type → handler list mapping, fans out to each
+  // handler with a per-handler retry policy. Errors in one handler do
+  // not abort the rest. TS sibling of dispatch.py:dispatch_event.
+  const handlers = subscribersFor(event.type);
+  for (const handler of handlers) {
+    try {
+      await handler(event);
+    } catch (exc) {
+      recordHandlerFailure(event, handler, exc);
+    }
+  }
+}
+
+export function enqueueDispatch(event: WebhookEvent): void {
+  // Queue an event for asynchronous dispatch — used when the inbound
+  // request must respond immediately (Stripe 5s window).
+  queue.push(event);
+}
+
+function subscribersFor(_eventType: string): Handler[] {
+  throw new Error("not implemented");
+}
+
+function recordHandlerFailure(
+  _event: WebhookEvent,
+  _handler: Handler,
+  _exc: unknown,
+): void {
+  throw new Error("not implemented");
+}
+
+const queue: WebhookEvent[] = [];
diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py
new file mode 100644
index 00000000..01eab8cb
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py
@@ -0,0 +1,28 @@
+"""HMAC-SHA256 webhook signature verification (Python runtime)."""
+
+import hashlib
+import hmac
+
+
+def verify_webhook_signature(body, signature_header, secret):
+    """Validate an incoming webhook's HMAC signature before processing.
+
+    Implements the security contract: webhook payloads must carry a
+    valid HMAC-SHA256 signature in the X-Signature header, computed
+    over the raw body using the per-source shared secret. Constant-time
+    comparison via hmac.compare_digest. Distinct from verify.ts which
+    is the TypeScript-runtime sibling.
+    """
+    expected = hmac.new(secret, body, hashlib.sha256).hexdigest()
+    if not hmac.compare_digest(expected, signature_header):
+        raise InvalidWebhookSignature
+    return True
+
+
+def extract_signature_header(headers):
+    """Pull the signature from X-Signature, fallback to legacy X-Sig."""
+    return headers.get("X-Signature") or headers.get("X-Sig")
+
+
+class InvalidWebhookSignature(Exception):
+    pass
diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts
new file mode 100644
index 00000000..5bf1cbf5
--- /dev/null
+++ b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts
@@ -0,0 +1,33 @@
+// HMAC-SHA256 webhook signature verification (TypeScript runtime).
+// Sibling of verify.py — same security contract, different runtime.
+
+import { createHmac, timingSafeEqual } from "node:crypto";
+
+export class InvalidWebhookSignature extends Error {}
+
+export function verifyWebhookSignature(
+  body: Buffer,
+  signatureHeader: string,
+  secret: string,
+): true {
+  // Validate an incoming webhook's HMAC signature before processing.
+  // Implements the security contract: webhook payloads must carry a
+  // valid HMAC-SHA256 signature in the X-Signature header, computed
+  // over the raw body using the per-source shared secret. Constant-time
+  // comparison via timingSafeEqual. Sibling of verify.py — same
+  // contract for the TypeScript runtime.
+  const expected = createHmac("sha256", secret).update(body).digest("hex");
+  const a = Buffer.from(expected);
+  const b = Buffer.from(signatureHeader);
+  if (a.length !== b.length || !timingSafeEqual(a, b)) {
+    throw new InvalidWebhookSignature();
+  }
+  return true;
+}
+
+export function extractSignatureHeader(
+  headers: Record<string, string>,
+): string | undefined {
+  // Pull the signature from X-Signature, fallback to legacy X-Sig.
+  return headers["X-Signature"] ?? headers["X-Sig"];
+}