diff --git a/.github/workflows/test-mcp-regression.yml b/.github/workflows/test-mcp-regression.yml index 19cb25b0..9537a0fe 100644 --- a/.github/workflows/test-mcp-regression.yml +++ b/.github/workflows/test-mcp-regression.yml @@ -129,6 +129,25 @@ jobs: --skill-variant from-skill-md -o test-results/m1-adversarial.json + # ── M2 grounding-recall eval (warn-only, #280 PR-2) ──────────── + # Drives the bicameral-bind skill against tests/fixtures/grounding_recall/ + # — synthetic fixture with 23 decisions across same-name-different-module, + # similar-intent, and cross-language cases. Cache hits at + # tests/eval/fixtures/bind_judge/ keep CI cost ~$0 unless the dataset, + # fixture repo, or skill change. Warn-only initially per #280's gating- + # is-observability framing — we ship the measurement, observe the + # baseline, then ratchet to --gate-mode hard once the signal is stable. + - name: M2 grounding-recall eval (warn-only) + if: matrix.os == 'ubuntu-latest' + continue-on-error: true + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BICAMERAL_GROUNDING_EVAL_MODEL: claude-haiku-4-5-20251001 + run: > + python tests/eval_grounding_recall.py + --gate-mode warn + -o test-results/m2-grounding-recall.json + # ── Generate rich E2E report from artifacts ──────────────────── # Ubuntu-only: the script consumes the medusa adversarial corpus # (cloned only on Ubuntu above) plus the Phase 3 E2E artifacts diff --git a/CHANGELOG.md b/CHANGELOG.md index de577922..2b5bb8dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ All notable changes to bicameral-mcp are tracked here. Format loosely follows - **`skills/bicameral-bind/SKILL.md` (#280).** New skill that extracts the bind contract out of `skills/bicameral-ingest/SKILL.md` §2 and tightens it from advisory to mandatory: the agent must Read at least one candidate file end-to-end, confirm the symbol via `validate_symbols`, and abort on weak evidence. Documents the handler-side rejection contract added in this release. `bicameral-ingest` §2 now points at the new skill instead of duplicating the verification rules. +- **`tests/eval_grounding_recall.py` — M2 grounding-recall eval harness (#280 PR-2).** Synthetic-fixture benchmark that drives the `bicameral-bind` skill end-to-end and measures three axes: precision (of bindings the agent committed, what fraction were correct), recall (of ground-truth bindings, how many the agent got right), and abort rate (first-class signal because the bind skill makes "abort on weak evidence" an explicit contract). Dataset at `tests/fixtures/grounding_recall/dataset.py` with 23 cases across same-name-different-module (5), similar-intent-different-symbol (10), and cross-language (8) — fixture repo at `tests/fixtures/grounding_recall/repo/`. Headless caller-LLM driver at `tests/eval/_bind_judge.py` (modeled on `_skill_judge.py`) drives a multi-turn `read_file` / `validate_symbols` / `submit_binding` tool-use loop with response caching at `tests/eval/fixtures/bind_judge/` keyed on SHA(model | skill | repo | decision). Default gates: recall ≥ 0.80, precision ≥ 0.85, abort_rate ≤ 0.30 per #280 acceptance. New CI step is **warn-only initially** (`continue-on-error: true`, mirrors the M1 step) — gather a baseline first, ratchet to `--gate-mode hard` once the signal is stable. + ### Changed - **`code_locator/tools/validate_symbols.py`: dropped unused `self._db` field.** The retention comment ("Retained so `code_locator.adapter.ground_mappings()` can reach `db.lookup_by_file()`") referenced a path deleted in v0.6.0; the field had zero readers. diff --git a/tests/eval/_bind_judge.py b/tests/eval/_bind_judge.py new file mode 100644 index 00000000..fd0ca1e3 --- /dev/null +++ b/tests/eval/_bind_judge.py @@ -0,0 +1,491 @@ +"""Headless caller-LLM driver for the bicameral-bind skill (#280 PR-2). + +Drives `skills/bicameral-bind/SKILL.md` end-to-end against a synthetic +fixture repo: the LLM gets `read_file`, `validate_symbols`, and +`submit_binding` tools; we run a multi-turn tool-use loop until it +either submits a binding or aborts on weak evidence. + +Modeled on tests/eval/_skill_judge.py — same x-api-key auth, same +fixture-cache keyed on SHA(model | skill_sha | repo_sha | input_sha). +Cache hits keep CI cost ~$0 unless the dataset, fixture repo, or skill +change. + +Environment: + ANTHROPIC_API_KEY required for live calls + BICAMERAL_GROUNDING_EVAL_MODEL default "claude-haiku-4-5-20251001" + BICAMERAL_GROUNDING_EVAL_RECORD=1 force-bypass cache, re-record +""" + +from __future__ import annotations + +import hashlib +import json +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import httpx + +REPO_ROOT = Path(__file__).resolve().parents[2] +SKILL_MD_PATH = REPO_ROOT / "skills" / "bicameral-bind" / "SKILL.md" +CACHE_DIR = Path(__file__).resolve().parent / "fixtures" / "bind_judge" + +ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages" +ANTHROPIC_API_VERSION = "2023-06-01" +DEFAULT_MODEL = "claude-haiku-4-5-20251001" +MAX_OUTPUT_TOKENS = 2048 +MAX_TURNS = 8 +REQUEST_TIMEOUT_S = 90.0 + + +# ── Tool schemas exposed to the LLM ───────────────────────────────────────── + + +READ_FILE_TOOL: dict[str, Any] = { + "name": "read_file", + "description": ( + "Read the full contents of a file in the fixture repo. Use this to " + "confirm a candidate symbol's body actually implements the decision's " + "intent before submitting a binding. Path is repo-relative " + "(e.g. 'src/checkout/orders.py')." + ), + "input_schema": { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "Repo-relative path to the file.", + } + }, + "required": ["file_path"], + }, +} + + +VALIDATE_SYMBOLS_TOOL: dict[str, Any] = { + "name": "validate_symbols", + "description": ( + "Confirm one or more candidate symbol names exist in the fixture " + "repo's symbol index. Returns the list of (file_path, symbol_name) " + "pairs that match. Use this before submitting a binding — the " + "handler verifies symbols against the same index." + ), + "input_schema": { + "type": "object", + "properties": { + "candidates": { + "type": "array", + "items": {"type": "string"}, + "description": ( + "Symbol-name hypotheses (e.g. ['process_order', " + "'CheckoutRetryGuard.check_cap'])." + ), + } + }, + "required": ["candidates"], + }, +} + + +SUBMIT_BINDING_TOOL: dict[str, Any] = { + "name": "submit_binding", + "description": ( + "Submit your final binding decision. Call this exactly once. " + "Either provide (file_path, symbol_name) for the bind, or set " + "abort=true with abort_reason if the evidence is too weak to bind " + "(per skills/bicameral-bind/SKILL.md 'abort on weak evidence')." + ), + "input_schema": { + "type": "object", + "properties": { + "abort": { + "type": "boolean", + "description": "True if aborting on weak evidence; false if submitting a real binding.", + }, + "file_path": { + "type": "string", + "description": "Repo-relative file the symbol lives in. Required when abort=false.", + }, + "symbol_name": { + "type": "string", + "description": "The symbol to bind to. Required when abort=false.", + }, + "abort_reason": { + "type": "string", + "description": "One sentence why the evidence was too weak. Required when abort=true.", + }, + "reasoning": { + "type": "string", + "description": "One sentence explaining the choice (or the abort).", + }, + }, + "required": ["abort", "reasoning"], + }, +} + + +TOOLS = [READ_FILE_TOOL, VALIDATE_SYMBOLS_TOOL, SUBMIT_BINDING_TOOL] + + +# ── Public result shape ───────────────────────────────────────────────────── + + +@dataclass(frozen=True) +class BindJudgment: + case_id: str + aborted: bool + bound_file: str | None + bound_symbol: str | None + abort_reason: str | None + reasoning: str + turns: int + tokens_in: int + tokens_out: int + + +# ── Helpers ───────────────────────────────────────────────────────────────── + + +def _sha(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def _load_skill_md() -> str: + if not SKILL_MD_PATH.exists(): + raise FileNotFoundError(f"SKILL.md not found at {SKILL_MD_PATH}") + return SKILL_MD_PATH.read_text(encoding="utf-8") + + +def _scan_repo(repo_root: Path) -> dict[str, str]: + """Walk the fixture repo and return {repo_relative_path: content}.""" + out: dict[str, str] = {} + for path in sorted(repo_root.rglob("*")): + if not path.is_file(): + continue + if path.suffix not in {".py", ".ts", ".js", ".go", ".rs"}: + continue + rel = path.relative_to(repo_root).as_posix() + out[rel] = path.read_text(encoding="utf-8") + return out + + +def _index_symbols(files: dict[str, str]) -> list[tuple[str, str]]: + """Build a flat (file, symbol) index from the fixture repo. + + Lightweight — extracts top-level def/class names and one level of + methods. Good enough for the synthetic fixture; the production + bicameral symbol index is richer but for the eval we just need + deterministic 'does this symbol exist in this file' answers. + """ + import re + + out: list[tuple[str, str]] = [] + py_def = re.compile(r"^(?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", re.MULTILINE) + py_class = re.compile(r"^class\s+([a-zA-Z_][a-zA-Z0-9_]*)", re.MULTILINE) + py_method = re.compile(r"^ (?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", re.MULTILINE) + ts_func = re.compile( + r"^(?:export\s+)?(?:async\s+)?function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)", + re.MULTILINE, + ) + ts_class = re.compile(r"^(?:export\s+)?class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)", re.MULTILINE) + + for file_path, content in files.items(): + if file_path.endswith((".py",)): + for m in py_def.finditer(content): + out.append((file_path, m.group(1))) + for cls in py_class.finditer(content): + cls_name = cls.group(1) + out.append((file_path, cls_name)) + # Methods inside this class — capture as Class.method + tail = content[cls.end() :] + next_cls = py_class.search(tail) + cls_body = tail[: next_cls.start()] if next_cls else tail + for meth in py_method.finditer(cls_body): + out.append((file_path, f"{cls_name}.{meth.group(1)}")) + elif file_path.endswith((".ts", ".js")): + for m in ts_func.finditer(content): + out.append((file_path, m.group(1))) + for m in ts_class.finditer(content): + out.append((file_path, m.group(1))) + return out + + +def _execute_tool( + tool_name: str, + tool_input: dict, + *, + repo_files: dict[str, str], + symbol_index: list[tuple[str, str]], +) -> str: + """Deterministic tool dispatch — no real handler call, just fixture lookup. + + The eval measures whether the LLM picks the right (file, symbol) given + the available evidence; we don't need the production bicameral handler + in the loop. Cleaner cache + faster CI. + """ + if tool_name == "read_file": + path = tool_input.get("file_path", "") + if path in repo_files: + return repo_files[path] + return f"ERROR: file not found: {path!r} (try one of: {sorted(repo_files.keys())[:5]}…)" + + if tool_name == "validate_symbols": + candidates = tool_input.get("candidates") or [] + matches: list[dict] = [] + for cand in candidates: + for fp, sym in symbol_index: + if cand == sym or cand == sym.split(".")[-1]: + matches.append({"file_path": fp, "symbol_name": sym, "candidate": cand}) + return json.dumps({"matches": matches}, indent=2) + + return f"ERROR: unknown tool {tool_name!r}" + + +def _cache_path(model: str, skill_sha: str, repo_sha: str, input_sha: str) -> Path: + key = f"{model}|{skill_sha}|{repo_sha}|{input_sha}" + return CACHE_DIR / f"{_sha(key)}.json" + + +def _build_system_prompt(skill_md: str) -> str: + return f"""\ +You are a caller-LLM running the bicameral-bind skill against a synthetic +fixture repo. Your job: read the decision text, identify the right +(file, symbol) it should bind to, then call `submit_binding`. + +Apply the skill's contract verbatim — especially the mandatory pre-bind +verification (Read at least one candidate file end-to-end, confirm via +`validate_symbols`, abort on weak evidence). The handler-side rejection +contract from #280 means your binding is verified against the same +symbol index `validate_symbols` queries; submitting a hallucinated +symbol will be rejected. + +You have at most {MAX_TURNS} turns to gather evidence and submit. Do +not respond in plain text — every turn must be a tool_use block. + +──── Skill contract (verbatim from skills/bicameral-bind/SKILL.md) ──── + +{skill_md} +""" + + +def _build_user_prompt(decision_description: str, repo_files: dict[str, str]) -> str: + file_list = "\n".join(f" - {p}" for p in sorted(repo_files.keys())) + return f"""\ +Decision to bind: + + {decision_description} + +Fixture repo files available (use `read_file` to view contents): + +{file_list} + +Identify the right (file, symbol) for this decision and submit via +`submit_binding`. If after gathering evidence you can't point at a +specific function or class body that implements the decision, submit +with abort=true. +""" + + +def _call_messages_api( + *, + model: str, + system_prompt: str, + messages: list[dict], + api_key: str, +) -> dict: + headers = { + "anthropic-version": ANTHROPIC_API_VERSION, + "content-type": "application/json", + "x-api-key": api_key, + } + payload: dict[str, Any] = { + "model": model, + "max_tokens": MAX_OUTPUT_TOKENS, + "temperature": 0, + "system": [ + { + "type": "text", + "text": system_prompt, + "cache_control": {"type": "ephemeral"}, + } + ], + "messages": messages, + "tools": TOOLS, + } + with httpx.Client(timeout=REQUEST_TIMEOUT_S) as client: + resp = client.post(ANTHROPIC_API_URL, headers=headers, json=payload) + if resp.status_code >= 400: + raise RuntimeError(f"Anthropic API error {resp.status_code}: {resp.text[:500]}") + return resp.json() + + +# ── Public entrypoint ─────────────────────────────────────────────────────── + + +def run_bind_judgment( + *, + case_id: str, + decision_description: str, + repo_root: Path, + model: str | None = None, + api_key: str | None = None, + use_cache: bool = True, +) -> BindJudgment: + """Drive the bicameral-bind skill against the fixture repo for one case. + + Returns a `BindJudgment` capturing the LLM's outcome (binding or abort) + along with token + turn telemetry. + + Caching: response is cached to ``tests/eval/fixtures/bind_judge/`` keyed + on (model, SKILL.md SHA, repo SHA, decision SHA). Set + ``BICAMERAL_GROUNDING_EVAL_RECORD=1`` to bypass cache and re-record. + """ + chosen_model: str = model or os.getenv("BICAMERAL_GROUNDING_EVAL_MODEL") or DEFAULT_MODEL + skill_md = _load_skill_md() + skill_sha = _sha(skill_md) + + repo_files = _scan_repo(repo_root) + symbol_index = _index_symbols(repo_files) + + canonical_repo = json.dumps(repo_files, sort_keys=True, ensure_ascii=False) + repo_sha = _sha(canonical_repo) + input_sha = _sha(decision_description) + + cache_file = _cache_path(chosen_model, skill_sha, repo_sha, input_sha) + force_record = os.getenv("BICAMERAL_GROUNDING_EVAL_RECORD", "").strip() in {"1", "true", "yes"} + if use_cache and not force_record and cache_file.exists(): + cached: dict[str, Any] = json.loads(cache_file.read_text(encoding="utf-8")) + return BindJudgment( + case_id=case_id, + aborted=bool(cached["aborted"]), + bound_file=cached.get("bound_file"), + bound_symbol=cached.get("bound_symbol"), + abort_reason=cached.get("abort_reason"), + reasoning=str(cached.get("reasoning") or ""), + turns=int(cached.get("turns") or 0), + tokens_in=int(cached.get("tokens_in") or 0), + tokens_out=int(cached.get("tokens_out") or 0), + ) + + chosen_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "") + if not chosen_key.strip(): + raise RuntimeError( + "ANTHROPIC_API_KEY missing and no cached fixture exists for " + f"(model={chosen_model}, case={case_id}, skill={skill_sha[:8]}, " + f"repo={repo_sha[:8]}, input={input_sha[:8]})." + ) + + system_prompt = _build_system_prompt(skill_md) + messages: list[dict] = [ + {"role": "user", "content": _build_user_prompt(decision_description, repo_files)} + ] + + tokens_in = 0 + tokens_out = 0 + bound_file: str | None = None + bound_symbol: str | None = None + aborted = False + abort_reason: str | None = None + reasoning = "" + turn = 0 + + for turn in range(1, MAX_TURNS + 1): # noqa: B007 — `turn` is read after the loop for telemetry + data = _call_messages_api( + model=chosen_model, + system_prompt=system_prompt, + messages=messages, + api_key=chosen_key, + ) + usage = data.get("usage") or {} + tokens_in += int(usage.get("input_tokens", 0)) + tokens_out += int(usage.get("output_tokens", 0)) + + content = data.get("content") or [] + # Append assistant turn to history regardless of tool/no-tool + messages.append({"role": "assistant", "content": content}) + + tool_uses = [b for b in content if b.get("type") == "tool_use"] + if not tool_uses: + # No tool call — agent gave up without submitting. Treat as abort. + aborted = True + abort_reason = "agent did not call any tool" + reasoning = next((b.get("text", "") for b in content if b.get("type") == "text"), "") + break + + # Submit the tool results, but check first if any tool_use is submit_binding. + submit_call = next((tu for tu in tool_uses if tu.get("name") == "submit_binding"), None) + if submit_call is not None: + inp = submit_call.get("input") or {} + reasoning = str(inp.get("reasoning") or "") + if inp.get("abort"): + aborted = True + abort_reason = str(inp.get("abort_reason") or "") + else: + bound_file = str(inp.get("file_path") or "") or None + bound_symbol = str(inp.get("symbol_name") or "") or None + break + + # Otherwise execute read_file / validate_symbols and continue the loop. + tool_results: list[dict] = [] + for tu in tool_uses: + result = _execute_tool( + tu.get("name", ""), + tu.get("input") or {}, + repo_files=repo_files, + symbol_index=symbol_index, + ) + tool_results.append( + { + "type": "tool_result", + "tool_use_id": tu.get("id"), + "content": result, + } + ) + messages.append({"role": "user", "content": tool_results}) + else: + # Loop exhausted without submit_binding. + aborted = True + abort_reason = f"hit MAX_TURNS={MAX_TURNS} without submitting a binding" + + judgment_payload: dict[str, Any] = { + "aborted": aborted, + "bound_file": bound_file, + "bound_symbol": bound_symbol, + "abort_reason": abort_reason, + "reasoning": reasoning, + "turns": turn, + "tokens_in": tokens_in, + "tokens_out": tokens_out, + } + + CACHE_DIR.mkdir(parents=True, exist_ok=True) + cache_file.write_text( + json.dumps(judgment_payload, indent=2, ensure_ascii=False), encoding="utf-8" + ) + return BindJudgment( + case_id=case_id, + aborted=aborted, + bound_file=bound_file, + bound_symbol=bound_symbol, + abort_reason=abort_reason, + reasoning=reasoning, + turns=turn, + tokens_in=tokens_in, + tokens_out=tokens_out, + ) + + +def fixture_exists( + *, case_id: str, decision_description: str, repo_root: Path, model: str | None = None +) -> bool: + """True if a cached fixture exists for these inputs (used to skip when + no API key + no cache).""" + chosen_model: str = model or os.getenv("BICAMERAL_GROUNDING_EVAL_MODEL") or DEFAULT_MODEL + skill_md = _load_skill_md() + skill_sha = _sha(skill_md) + repo_files = _scan_repo(repo_root) + canonical_repo = json.dumps(repo_files, sort_keys=True, ensure_ascii=False) + repo_sha = _sha(canonical_repo) + input_sha = _sha(decision_description) + return _cache_path(chosen_model, skill_sha, repo_sha, input_sha).exists() diff --git a/tests/eval_grounding_recall.py b/tests/eval_grounding_recall.py new file mode 100644 index 00000000..03f21d29 --- /dev/null +++ b/tests/eval_grounding_recall.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""M2 grounding-recall eval — measures caller-LLM bind precision/recall (#280 PR-2). + +Drives the bicameral-bind skill against a synthetic fixture (≥ 21 +decisions across same-name-different-module / similar-intent / +cross-language cases), captures what each judgment bound vs. ground +truth, and emits precision / recall / abort-rate. + +Three measurement axes (the split matters for diagnosis): + - Precision = correct / (correct + wrong_symbol + wrong_file) + of the bindings the agent committed, what fraction were right + - Recall = correct / total_rows + of the ground-truth bindings, how many the agent got right + (aborts and wrong bindings BOTH count against) + - Abort rate = aborted / total_rows + first-class signal because the bind-skill makes 'abort on + weak evidence' an explicit contract — high abort rate + means the skill is too conservative + +Usage: + .venv/bin/python tests/eval_grounding_recall.py + --model claude-haiku-4-5-20251001 + --gate-mode warn + -o test-results/m2-grounding-recall.json + +Flags: + --case-filter only run cases of this type (debug) + --case-id run a single case by id (debug) + --model override BICAMERAL_GROUNDING_EVAL_MODEL + --min-recall gate threshold (default 0.80, per #280) + --min-precision gate threshold (default 0.85, per #280) + --max-abort-rate gate threshold (default 0.30 — agent too cautious) + --gate-mode 'warn' (advisory, default) | 'hard' (exit non-zero on miss) + -o / --output write JSON report + --verbose print per-case rows +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +# tests/ has no __init__.py; import siblings via dir on sys.path. +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(REPO_ROOT / "tests" / "eval")) +sys.path.insert(0, str(REPO_ROOT / "tests" / "fixtures" / "grounding_recall")) + +from _bind_judge import BindJudgment, fixture_exists, run_bind_judgment # type: ignore[import-not-found] # noqa: E402, I001 +from dataset import ALL_CASES, GENERATOR_VERSION, GroundingCase, cases_by_type # type: ignore[import-not-found] # noqa: E402, I001 + +FIXTURE_REPO = REPO_ROOT / "tests" / "fixtures" / "grounding_recall" / "repo" + + +# ── Outcome classification ────────────────────────────────────────────────── + + +def _classify(case: GroundingCase, judgment: BindJudgment) -> str: + """Map (case, judgment) → outcome category for metrics.""" + if judgment.aborted: + return "aborted" + if judgment.bound_file == case.intended_file and judgment.bound_symbol == case.intended_symbol: + return "correct" + if judgment.bound_file == case.intended_file: + return "wrong_symbol" + return "wrong_file" + + +# ── Report shape ──────────────────────────────────────────────────────────── + + +def _per_case_row(case: GroundingCase, judgment: BindJudgment, outcome: str) -> dict[str, Any]: + return { + "case_id": case.case_id, + "case_type": case.case_type, + "intended_file": case.intended_file, + "intended_symbol": case.intended_symbol, + "bound_file": judgment.bound_file, + "bound_symbol": judgment.bound_symbol, + "outcome": outcome, + "aborted": judgment.aborted, + "abort_reason": judgment.abort_reason, + "reasoning": judgment.reasoning, + "turns": judgment.turns, + "tokens_in": judgment.tokens_in, + "tokens_out": judgment.tokens_out, + } + + +def _aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]: + total = len(rows) + by_outcome: dict[str, int] = defaultdict(int) + by_type_total: dict[str, int] = defaultdict(int) + by_type_correct: dict[str, int] = defaultdict(int) + tokens_in = 0 + tokens_out = 0 + turns_sum = 0 + + for r in rows: + by_outcome[r["outcome"]] += 1 + by_type_total[r["case_type"]] += 1 + if r["outcome"] == "correct": + by_type_correct[r["case_type"]] += 1 + tokens_in += r["tokens_in"] + tokens_out += r["tokens_out"] + turns_sum += r["turns"] + + correct = by_outcome["correct"] + wrong_symbol = by_outcome["wrong_symbol"] + wrong_file = by_outcome["wrong_file"] + aborted = by_outcome["aborted"] + + submitted = correct + wrong_symbol + wrong_file + precision = correct / submitted if submitted > 0 else 0.0 + recall = correct / total if total > 0 else 0.0 + abort_rate = aborted / total if total > 0 else 0.0 + + per_type = { + t: { + "total": by_type_total[t], + "correct": by_type_correct[t], + "recall": (by_type_correct[t] / by_type_total[t]) if by_type_total[t] else 0.0, + } + for t in sorted(by_type_total) + } + + return { + "total_cases": total, + "outcomes": dict(by_outcome), + "precision": round(precision, 4), + "recall": round(recall, 4), + "abort_rate": round(abort_rate, 4), + "per_case_type": per_type, + "tokens_in_total": tokens_in, + "tokens_out_total": tokens_out, + "avg_turns": round(turns_sum / total, 2) if total else 0.0, + } + + +# ── Runner ────────────────────────────────────────────────────────────────── + + +async def run(args: argparse.Namespace) -> tuple[dict[str, Any], int]: + cases: list[GroundingCase] = list(ALL_CASES) + if args.case_filter: + cases = cases_by_type(args.case_filter) + if not cases: + print(f"no cases match --case-filter {args.case_filter!r}", file=sys.stderr) + return {}, 2 + if args.case_id: + cases = [c for c in cases if c.case_id == args.case_id] + if not cases: + print(f"no case matches --case-id {args.case_id!r}", file=sys.stderr) + return {}, 2 + + rows: list[dict[str, Any]] = [] + for case in cases: + if args.skip_missing_fixtures and not fixture_exists( + case_id=case.case_id, + decision_description=case.description, + repo_root=FIXTURE_REPO, + model=args.model, + ): + if args.verbose: + print(f"SKIP {case.case_id}: no cached fixture and --skip-missing-fixtures") + continue + try: + judgment = run_bind_judgment( + case_id=case.case_id, + decision_description=case.description, + repo_root=FIXTURE_REPO, + model=args.model, + ) + except RuntimeError as exc: + print(f"ERROR on {case.case_id}: {exc}", file=sys.stderr) + if args.gate_mode == "hard": + return {}, 3 + continue + + outcome = _classify(case, judgment) + row = _per_case_row(case, judgment, outcome) + rows.append(row) + if args.verbose: + print( + f" {case.case_id:<35} {outcome:<14} " + f"bound=({judgment.bound_file or '—'}::{judgment.bound_symbol or '—'})" + ) + + summary = _aggregate(rows) + summary["generator_version"] = GENERATOR_VERSION + summary["model"] = args.model or "default" + summary["gate_mode"] = args.gate_mode + + # Gate enforcement + breaches: list[str] = [] + if summary["recall"] < args.min_recall: + breaches.append(f"recall {summary['recall']:.3f} < {args.min_recall}") + if summary["precision"] < args.min_precision: + breaches.append(f"precision {summary['precision']:.3f} < {args.min_precision}") + if summary["abort_rate"] > args.max_abort_rate: + breaches.append(f"abort_rate {summary['abort_rate']:.3f} > {args.max_abort_rate}") + summary["gate_breaches"] = breaches + + report = {"summary": summary, "rows": rows} + + if args.output: + out = Path(args.output) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"wrote {args.output}") + + print() + print(f"M2 grounding-recall eval — {summary['total_cases']} cases") + print(f" precision : {summary['precision']:.3f} (gate ≥ {args.min_precision})") + print(f" recall : {summary['recall']:.3f} (gate ≥ {args.min_recall})") + print(f" abort_rate : {summary['abort_rate']:.3f} (gate ≤ {args.max_abort_rate})") + print(f" avg_turns : {summary['avg_turns']}") + print(f" tokens : {summary['tokens_in_total']} in / {summary['tokens_out_total']} out") + if breaches: + print(f" ⚠ gate breaches: {'; '.join(breaches)}") + else: + print(" ✓ all gates pass") + + if breaches and args.gate_mode == "hard": + return report, 1 + return report, 0 + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__.split("\n")[0] if __doc__ else None) + p.add_argument( + "--case-filter", choices=("same_name_different_module", "similar_intent", "cross_language") + ) + p.add_argument("--case-id", help="run a single case by id (debug)") + p.add_argument("--model", help="override BICAMERAL_GROUNDING_EVAL_MODEL") + p.add_argument("--min-recall", type=float, default=0.80) + p.add_argument("--min-precision", type=float, default=0.85) + p.add_argument("--max-abort-rate", type=float, default=0.30) + p.add_argument("--gate-mode", choices=("warn", "hard"), default="warn") + p.add_argument( + "--skip-missing-fixtures", + action="store_true", + help="skip cases without a cached fixture (no API call) — useful for offline runs", + ) + p.add_argument("-o", "--output", help="write JSON report to this path") + p.add_argument("--verbose", action="store_true") + args = p.parse_args() + + _, exit_code = asyncio.run(run(args)) + return exit_code + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/fixtures/grounding_recall/dataset.py b/tests/fixtures/grounding_recall/dataset.py new file mode 100644 index 00000000..6372ee30 --- /dev/null +++ b/tests/fixtures/grounding_recall/dataset.py @@ -0,0 +1,398 @@ +"""Synthetic grounding-recall dataset for the M2 caller-LLM eval (#280). + +Each row is a `GroundingCase` describing a decision text plus the +ground-truth (file, symbol) it should bind to. The fixture repo at +``tests/fixtures/grounding_recall/repo/`` contains the intended symbols +plus deliberate distractors; the LLM is given access to the repo and +must pick the right symbol to clear the precision gate. + +`GENERATOR_VERSION` invalidates the eval cache when bumped — change the +value if a row is added/edited/removed so the next CI run re-records. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +GENERATOR_VERSION = "1" + + +@dataclass(frozen=True) +class GroundingCase: + """One row in the M2 grounding-recall fixture.""" + + case_id: str + case_type: str # "same_name_different_module" | "similar_intent" | "cross_language" + description: str # the decision text the LLM reads + intended_file: str # ground-truth file (relative to repo root) + intended_symbol: str # ground-truth symbol name + distractors: tuple[tuple[str, str], ...] = field(default_factory=tuple) + # ↑ list of (file, symbol) plausible-but-wrong candidates that exist + # in the fixture repo. Used by the runner to verify the wrong-bind + # path actually has somewhere to land (i.e. the LLM could plausibly + # pick a distractor — measures real precision, not "no other option"). + + +# ─── Case A: same-name-different-module ───────────────────────────────────── + +CASES_A: list[GroundingCase] = [ + GroundingCase( + case_id="A1_process_order_checkout", + case_type="same_name_different_module", + description=( + "Customer checkout flow caps payment retries at 3 per Stripe contract — " + "after 3 declines the user sees a hard error and the cart unlocks." + ), + intended_file="src/checkout/orders.py", + intended_symbol="process_order", + distractors=( + ("src/admin/orders.py", "process_order"), + ("src/billing/refunds.py", "process_order"), + ), + ), + GroundingCase( + case_id="A2_process_order_admin", + case_type="same_name_different_module", + description=( + "Support team manually replays finance-flagged orders through the " + "refund pipeline — runs under elevated permissions, never touches " + "the customer payment-auth path." + ), + intended_file="src/admin/orders.py", + intended_symbol="process_order", + distractors=( + ("src/checkout/orders.py", "process_order"), + ("src/billing/refunds.py", "process_order"), + ), + ), + GroundingCase( + case_id="A3_process_order_billing", + case_type="same_name_different_module", + description=( + "Bulk-refund batch job + chargeback webhook handler push refund " + "requests through the billing pipeline (credit-to-source + " + "accounting reconciliation). Distinct from manual admin replay " + "and from customer checkout." + ), + intended_file="src/billing/refunds.py", + intended_symbol="process_order", + distractors=( + ("src/checkout/orders.py", "process_order"), + ("src/admin/orders.py", "process_order"), + ), + ), + GroundingCase( + case_id="A4_cancel_order_checkout", + case_type="same_name_different_module", + description=( + "User-initiated order cancellation from the storefront — refunds " + "within the 24-hour cancellation window, otherwise blocks with a " + "clear error to the customer." + ), + intended_file="src/checkout/orders.py", + intended_symbol="cancel_order", + distractors=(("src/billing/refunds.py", "cancel_order"),), + ), + GroundingCase( + case_id="A5_cancel_order_billing", + case_type="same_name_different_module", + description=( + "Stripe chargeback webhook + failed-renewal subscription job both " + "trigger billing-side cancellation — bypasses the user-facing " + "24-hour window because the trigger is external, records a " + "billing cancellation event, refunds to source of funds." + ), + intended_file="src/billing/refunds.py", + intended_symbol="cancel_order", + distractors=(("src/checkout/orders.py", "cancel_order"),), + ), +] + + +# ─── Case B: similar-intent-different-symbol ──────────────────────────────── + +CASES_B: list[GroundingCase] = [ + GroundingCase( + case_id="B1_tenant_rate_limit", + case_type="similar_intent", + description=( + "Enterprise contract clause: each tenant gets 1000 req/min on " + "checkout endpoints. Identity is the tenant_id from auth claims, " + "scope is the checkout-endpoint family. Implements the tenant-" + "scoped SLA, NOT the cluster-wide protective ceiling." + ), + intended_file="src/middleware/tenant_rate_limit.py", + intended_symbol="TenantCheckoutRateLimiter.check", + distractors=( + ("src/middleware/global_rate_limit.py", "GlobalRateLimiter.check"), + ("src/checkout/throttle.py", "throttle_checkout"), + ), + ), + GroundingCase( + case_id="B2_retry_cap", + case_type="similar_intent", + description=( + "Stripe contract clause: never retry a declined authorization " + "more than 3 times within 24 hours. Enforced before each retry " + "by raising MaxRetriesExceeded when the attempt counter hits 3. " + "Distinct from the backoff-delay configuration." + ), + intended_file="src/checkout/retry.py", + intended_symbol="CheckoutRetryGuard.check_cap", + distractors=( + ("src/checkout/retry.py", "StripeRetryPolicy.delay_for"), + ("src/checkout/orders.py", "process_order"), + ), + ), + GroundingCase( + case_id="B3_throttle_double_submit", + case_type="similar_intent", + description=( + "Prevent double-submit on the 'pay now' button by adding a " + "30-second cooldown per session. This is session-level cooldown, " + "NOT per-tenant rate limit — the trigger is the user's recent " + "click history, not the tenant's request volume." + ), + intended_file="src/checkout/throttle.py", + intended_symbol="throttle_checkout", + distractors=( + ("src/middleware/tenant_rate_limit.py", "TenantCheckoutRateLimiter.check"), + ("src/middleware/global_rate_limit.py", "GlobalRateLimiter.check"), + ), + ), + GroundingCase( + case_id="B4_validate_jwt", + case_type="similar_intent", + description=( + "API gateway middleware validates JWT signature, expiry, and " + "audience on every inbound request. Stateless bearer credentials — " + "the gateway does NOT consult any session store. Used on the " + "API surface, not the web UI." + ), + intended_file="src/auth/tokens.py", + intended_symbol="validate_token", + distractors=(("src/auth/session.py", "validate_session"),), + ), + GroundingCase( + case_id="B5_validate_session", + case_type="similar_intent", + description=( + "Web UI middleware validates the server-side session cookie " + "before serving authenticated pages. Sessions support invalidation " + "(unlike JWTs); a request carries a session cookie OR a JWT, " + "never both, and this function handles the cookie path." + ), + intended_file="src/auth/session.py", + intended_symbol="validate_session", + distractors=(("src/auth/tokens.py", "validate_token"),), + ), + GroundingCase( + case_id="B6_refresh_jwt", + case_type="similar_intent", + description=( + "Exchange a long-lived refresh token for a fresh access token. " + "Refresh tokens are JWTs marked with type='refresh' and have " + "their own expiry separate from access tokens. Mints a new " + "access JWT for the same subject." + ), + intended_file="src/auth/tokens.py", + intended_symbol="refresh_token", + distractors=(("src/auth/session.py", "refresh_session"),), + ), + GroundingCase( + case_id="B7_refresh_session", + case_type="similar_intent", + description=( + "Extend the server-side session's expiry on every authenticated " + "UI page-load — slides the idle-timeout window forward, but caps " + "at the absolute 24-hour creation timeout regardless of activity. " + "Server-side state, not a token mint." + ), + intended_file="src/auth/session.py", + intended_symbol="refresh_session", + distractors=(("src/auth/tokens.py", "refresh_token"),), + ), + GroundingCase( + case_id="B8_verify_webhook_python", + case_type="similar_intent", + description=( + "Verify HMAC-SHA256 signature on inbound webhook payloads before " + "any processing. Computed over the raw body using the per-source " + "shared secret. Constant-time comparison via hmac.compare_digest. " + "Implemented in the Python webhook ingress, not the auth path." + ), + intended_file="src/webhooks/verify.py", + intended_symbol="verify_webhook_signature", + distractors=( + ("src/auth/tokens.py", "validate_token"), + ("src/auth/session.py", "validate_session"), + ), + ), + GroundingCase( + case_id="B9_request_metrics", + case_type="similar_intent", + description=( + "API gateway emits per-request metrics — latency histogram, " + "status counter, route + tenant tags — after the response is " + "generated. One metric emission per inbound HTTP request, " + "tagged with tenant_id for SLA dashboards." + ), + intended_file="src/metrics/collect.py", + intended_symbol="collect_request_metrics", + distractors=(("src/metrics/collect.py", "collect_handler_metrics"),), + ), + GroundingCase( + case_id="B10_handler_metrics", + case_type="similar_intent", + description=( + "Each handler invocation emits its own latency + outcome metrics — " + "finer grain than per-request, since one inbound request fans out " + "to many handler invocations through the middleware chain. Used " + "to attribute regressions to specific handlers." + ), + intended_file="src/metrics/collect.py", + intended_symbol="collect_handler_metrics", + distractors=(("src/metrics/collect.py", "collect_request_metrics"),), + ), +] + + +# ─── Case C: cross-language ───────────────────────────────────────────────── + +CASES_C: list[GroundingCase] = [ + GroundingCase( + case_id="C1_verify_python", + case_type="cross_language", + description=( + "Python API service verifies HMAC-SHA256 signatures on inbound " + "webhook requests using the per-source shared secret with " + "constant-time comparison. Runtime: CPython on the API workers." + ), + intended_file="src/webhooks/verify.py", + intended_symbol="verify_webhook_signature", + distractors=(("src/webhooks/verify.ts", "verifyWebhookSignature"),), + ), + GroundingCase( + case_id="C2_verify_typescript", + case_type="cross_language", + description=( + "TypeScript edge-worker verifies HMAC-SHA256 webhook signatures " + "before forwarding to the origin. Same security contract as the " + "Python API path, but runs at the CDN edge so verification " + "happens before the request hits the origin." + ), + intended_file="src/webhooks/verify.ts", + intended_symbol="verifyWebhookSignature", + distractors=(("src/webhooks/verify.py", "verify_webhook_signature"),), + ), + GroundingCase( + case_id="C3_dispatch_python", + case_type="cross_language", + description=( + "Python webhook ingress routes verified events to the subscriber " + "handler chain — after signature verification, fans out to each " + "handler with per-handler retry. Errors in one handler do not " + "abort the rest. Runs in the Python ingestion path." + ), + intended_file="src/webhooks/dispatch.py", + intended_symbol="dispatch_event", + distractors=(("src/webhooks/dispatch.ts", "dispatchEvent"),), + ), + GroundingCase( + case_id="C4_dispatch_typescript", + case_type="cross_language", + description=( + "TypeScript runtime routes verified webhook events to subscriber " + "handlers, fanning out with per-handler retry. Same routing " + "contract as the Python sibling, but runs in the TS ingress (e.g. " + "the edge worker after sig-verify)." + ), + intended_file="src/webhooks/dispatch.ts", + intended_symbol="dispatchEvent", + distractors=(("src/webhooks/dispatch.py", "dispatch_event"),), + ), + GroundingCase( + case_id="C5_enqueue_python", + case_type="cross_language", + description=( + "Python webhook handler queues events for asynchronous dispatch " + "when the inbound request must respond inside Stripe's 5-second " + "window — the queue lives in-process, drained by a background " + "worker. Used only on the Python path." + ), + intended_file="src/webhooks/dispatch.py", + intended_symbol="enqueue_dispatch", + distractors=(("src/webhooks/dispatch.ts", "enqueueDispatch"),), + ), + GroundingCase( + case_id="C6_enqueue_typescript", + case_type="cross_language", + description=( + "TypeScript edge worker queues webhook events for async dispatch " + "to satisfy Stripe's 5-second response budget. Same async-queue " + "contract as the Python sibling, runs in the TS runtime where " + "edge workers must respond fast." + ), + intended_file="src/webhooks/dispatch.ts", + intended_symbol="enqueueDispatch", + distractors=(("src/webhooks/dispatch.py", "enqueue_dispatch"),), + ), + GroundingCase( + case_id="C7_metrics_python", + case_type="cross_language", + description=( + "Python API gateway emits per-request latency and status metrics " + "tagged with tenant_id. Runs in-process on the Python workers, " + "fires after the response is generated by the gateway middleware." + ), + intended_file="src/metrics/collect.py", + intended_symbol="collect_request_metrics", + distractors=(("src/metrics/collect.ts", "collectRequestMetrics"),), + ), + GroundingCase( + case_id="C8_metrics_typescript", + case_type="cross_language", + description=( + "TypeScript edge proxy emits per-request latency and status " + "metrics tagged with tenant_id. Runs in the TS edge runtime, " + "fires after each proxied response. Same metrics contract as " + "the Python gateway, different runtime." + ), + intended_file="src/metrics/collect.ts", + intended_symbol="collectRequestMetrics", + distractors=(("src/metrics/collect.py", "collect_request_metrics"),), + ), +] + + +ALL_CASES: list[GroundingCase] = CASES_A + CASES_B + CASES_C + + +def cases_by_type(case_type: str) -> list[GroundingCase]: + return [c for c in ALL_CASES if c.case_type == case_type] + + +def case_by_id(case_id: str) -> GroundingCase: + for c in ALL_CASES: + if c.case_id == case_id: + return c + raise KeyError(f"unknown case_id: {case_id}") + + +# Sanity check at import time — fail loud if the dataset shape regresses. +def _validate_dataset() -> None: + seen_ids: set[str] = set() + for c in ALL_CASES: + if c.case_id in seen_ids: + raise AssertionError(f"duplicate case_id: {c.case_id}") + seen_ids.add(c.case_id) + if c.case_type not in ("same_name_different_module", "similar_intent", "cross_language"): + raise AssertionError(f"{c.case_id}: invalid case_type {c.case_type!r}") + if not c.intended_file or not c.intended_symbol: + raise AssertionError(f"{c.case_id}: intended_file/symbol must be non-empty") + for df, ds in c.distractors: + if (df, ds) == (c.intended_file, c.intended_symbol): + raise AssertionError(f"{c.case_id}: distractor matches intended") + + +_validate_dataset() diff --git a/tests/fixtures/grounding_recall/repo/src/admin/orders.py b/tests/fixtures/grounding_recall/repo/src/admin/orders.py new file mode 100644 index 00000000..a2ee40c7 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/admin/orders.py @@ -0,0 +1,39 @@ +"""Admin tools for finance/support — refunds, manual review, reporting. + +Runs under elevated permissions; never in the customer checkout path. +""" + + +def process_order(order): + """Admin replay of a finance-flagged order — refunds only. + + Used by the support team to manually replay a flagged order through + the refund pipeline. Does NOT exercise the customer checkout path + (no payment auth, no inventory holds). Distinct from + checkout/orders.py:process_order which is customer-facing. + """ + if not order.flagged_for_refund: + raise NotEligibleForAdminReplayError(order.id) + return _replay_refund(order) + + +def report_orders(start, end): + """Generate the daily ops report for finance — flagged orders only.""" + rows = _load_flagged_in_range(start, end) + return _format_report(rows) + + +def _replay_refund(order): + raise NotImplementedError + + +def _load_flagged_in_range(start, end): + raise NotImplementedError + + +def _format_report(rows): + raise NotImplementedError + + +class NotEligibleForAdminReplayError(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/auth/session.py b/tests/fixtures/grounding_recall/repo/src/auth/session.py new file mode 100644 index 00000000..bf7a7723 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/auth/session.py @@ -0,0 +1,70 @@ +"""Server-side session lifecycle — distinct from JWT tokens. + +Sessions are stored server-side and identified by an opaque cookie; +JWTs (tokens.py) are stateless bearer credentials. Don't confuse the +two — sessions support invalidation, JWTs don't. +""" + + +def validate_session(session_id): + """Look up a session by id and confirm it's not expired or revoked. + + Returns the session dict on success; raises SessionExpired or + SessionNotFound otherwise. Used by the web UI middleware (NOT the + API gateway, which uses validate_token). Mutually exclusive with + JWT validation — a request carries one or the other. + """ + session = _load_session(session_id) + if session is None: + raise SessionNotFound(session_id) + if session.expires_at < _now(): + raise SessionExpired(session_id) + return session + + +def refresh_session(session_id): + """Extend the session's expiry by another idle-timeout window. + + Called on every authenticated UI page-load. Caps at the absolute + timeout (24h from creation) regardless of activity. + """ + session = validate_session(session_id) + session.expires_at = min(_now() + _idle_timeout(), session.created_at + _absolute_timeout()) + _store_session(session) + + +def revoke_session(session_id): + """Logout — marks the session as revoked, future loads will fail.""" + _delete_session(session_id) + + +def _load_session(session_id): + raise NotImplementedError + + +def _store_session(session): + raise NotImplementedError + + +def _delete_session(session_id): + raise NotImplementedError + + +def _now(): + raise NotImplementedError + + +def _idle_timeout(): + return 1800 # 30 min + + +def _absolute_timeout(): + return 86400 # 24 h + + +class SessionNotFound(Exception): + pass + + +class SessionExpired(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/auth/tokens.py b/tests/fixtures/grounding_recall/repo/src/auth/tokens.py new file mode 100644 index 00000000..b45f7fe1 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/auth/tokens.py @@ -0,0 +1,62 @@ +"""JWT token lifecycle — validation, refresh, revocation.""" + + +def validate_token(token): + """Validate a JWT's signature, expiry, and audience. + + Returns the parsed claims dict if valid; raises TokenInvalid on + signature mismatch, TokenExpired on stale exp, TokenWrongAudience + on aud mismatch. Used by the API gateway middleware on every + inbound request. + """ + claims = _decode_jwt(token) + if claims["exp"] < _now(): + raise TokenExpired + if claims["aud"] != _expected_audience(): + raise TokenWrongAudience + return claims + + +def refresh_token(refresh_token): + """Exchange a long-lived refresh token for a fresh access token.""" + claims = _decode_jwt(refresh_token) + if claims.get("type") != "refresh": + raise TokenInvalid + return _mint_access_token(claims["sub"]) + + +def revoke_token(token): + """Add a token to the revocation list — checked on every validate.""" + claims = _decode_jwt(token) + _revocation_store.add(claims["jti"], claims["exp"]) + + +def _decode_jwt(token): + raise NotImplementedError + + +def _now(): + raise NotImplementedError + + +def _expected_audience(): + return "bicameral-api" + + +def _mint_access_token(subject): + raise NotImplementedError + + +_revocation_store = None + + +class TokenInvalid(Exception): + pass + + +class TokenExpired(Exception): + pass + + +class TokenWrongAudience(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/billing/refunds.py b/tests/fixtures/grounding_recall/repo/src/billing/refunds.py new file mode 100644 index 00000000..c8f1eefa --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/billing/refunds.py @@ -0,0 +1,53 @@ +"""Billing-side refund processing — distinct from admin replay. + +Refunds initiated through the billing system flow through this module: +batch processing, source-of-funds tracking, accounting reconciliation. +Admin tools (admin/orders.py:process_order) replay individual flagged +orders manually; this is the bulk path for normal refund volume. +""" + + +def cancel_order(order_id, reason): + """Refund-driven cancellation — used when the billing system + initiates the cancel (e.g. failed renewal, chargeback). + + Distinct from checkout/orders.py:cancel_order which is user-initiated + and respects the 24h window. Billing cancellations bypass the window + because they originate from external triggers (Stripe chargeback + webhooks, subscription failures). + """ + order = _load(order_id) + _record_billing_cancellation(order, reason) + return _refund_to_source_of_funds(order) + + +def process_order(refund_request): + """Run a refund through the billing pipeline — credit + accounting. + + Used by the bulk-refund batch job and by the chargeback webhook + handler. Distinct from admin/orders.py:process_order (manual replay + of finance-flagged orders) and checkout/orders.py:process_order + (customer payment auth + fulfillment). + """ + _credit_to_source(refund_request.amount, refund_request.source) + _record_accounting_entry(refund_request) + + +def _load(order_id): + raise NotImplementedError + + +def _record_billing_cancellation(order, reason): + raise NotImplementedError + + +def _refund_to_source_of_funds(order): + raise NotImplementedError + + +def _credit_to_source(amount, source): + raise NotImplementedError + + +def _record_accounting_entry(refund): + raise NotImplementedError diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/orders.py b/tests/fixtures/grounding_recall/repo/src/checkout/orders.py new file mode 100644 index 00000000..acac03c0 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/checkout/orders.py @@ -0,0 +1,50 @@ +"""Customer-facing checkout flow. + +Stripe contract: max 3 retries per payment intent, then hard error. +""" + + +def process_order(order): + """Run a checkout order through payment auth + fulfillment. + + Caps payment retries at 3 per Stripe contract — after 3 declines the + user sees a hard error and the cart unlocks. This is the customer + checkout path, distinct from admin/orders.py which handles refunds. + """ + for _attempt in range(3): + result = _attempt_payment(order) + if result.success: + return _fulfill(order) + raise PaymentDeclinedError(order.id) + + +def cancel_order(order_id): + """User-initiated cancellation — refunds within 24h, otherwise blocks.""" + order = _load(order_id) + if order.age_hours > 24: + raise CancellationWindowClosedError(order_id) + return _refund_to_source(order) + + +def _attempt_payment(order): + raise NotImplementedError + + +def _fulfill(order): + raise NotImplementedError + + +def _load(order_id): + raise NotImplementedError + + +def _refund_to_source(order): + raise NotImplementedError + + +class PaymentDeclinedError(Exception): + pass + + +class CancellationWindowClosedError(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/retry.py b/tests/fixtures/grounding_recall/repo/src/checkout/retry.py new file mode 100644 index 00000000..c4b7dfec --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/checkout/retry.py @@ -0,0 +1,28 @@ +"""Checkout retry semantics — caps the number of payment-auth attempts.""" + + +class CheckoutRetryGuard: + """Enforces the per-checkout retry ceiling required by Stripe contract.""" + + MAX_ATTEMPTS = 3 + + def check_cap(self, attempt_count): + """Raise MaxRetriesExceeded if attempt_count >= 3. + + Called from checkout/orders.py:process_order before each retry. + Implements the contract clause: "merchants must not retry a + declined authorization more than 3 times within 24 hours." + """ + if attempt_count >= self.MAX_ATTEMPTS: + raise MaxRetriesExceeded(attempt_count) + + +class StripeRetryPolicy: + """Configures backoff between retries — exponential w/ 2s base.""" + + def delay_for(self, attempt): + return 2**attempt + + +class MaxRetriesExceeded(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py b/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py new file mode 100644 index 00000000..2f3c46a7 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/checkout/throttle.py @@ -0,0 +1,40 @@ +"""Per-session checkout throttling — distinct from rate limiting. + +Throttle = "cooldown after a triggering action" (e.g. wait 30s after +pressing 'pay now' before allowing it again). Rate limit = "max N +requests per window per identity." +""" + + +def throttle_checkout(session_id, action): + """Apply session-level cooldown to a checkout action. + + Reads the session's last-action timestamp and rejects with + ThrottledError if the cooldown window hasn't elapsed. Does NOT + enforce per-tenant or per-IP rate limits — that's the middleware + layer's job. + """ + last_ts = _last_action_timestamp(session_id, action) + if last_ts and _seconds_since(last_ts) < _cooldown_for(action): + raise ThrottledError(session_id, action) + _record_action(session_id, action) + + +def _last_action_timestamp(session_id, action): + raise NotImplementedError + + +def _seconds_since(ts): + raise NotImplementedError + + +def _cooldown_for(action): + return {"pay_now": 30, "apply_coupon": 5}.get(action, 0) + + +def _record_action(session_id, action): + raise NotImplementedError + + +class ThrottledError(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/metrics/collect.py b/tests/fixtures/grounding_recall/repo/src/metrics/collect.py new file mode 100644 index 00000000..01cb9de6 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/metrics/collect.py @@ -0,0 +1,36 @@ +"""Request and handler metric collection (Python runtime). + +Emits to the metrics backend on every inbound request and every +handler invocation. Sibling of collect.ts for the TypeScript runtime. +""" + + +def collect_request_metrics(request, response, latency_ms): + """Emit per-request metrics: latency, status, route, tenant. + + Fired by the API gateway middleware on every inbound request, after + the response is generated. Tagged with tenant_id for SLA tracking. + Distinct from collect_handler_metrics which is per-handler-invocation + granularity (one request can fan out to many handlers). + """ + _emit_counter("requests_total", tags={"route": request.path, "status": response.status}) + _emit_histogram("request_latency_ms", latency_ms, tags={"tenant": request.auth.tenant_id}) + + +def collect_handler_metrics(handler_name, outcome, duration_ms): + """Emit per-handler-invocation metrics — finer grain than request-level. + + One request → many handler invocations (middleware chain + business + logic). This captures each handler's own latency + outcome so we can + spot which handler is responsible for a regression. + """ + _emit_counter("handler_invocations_total", tags={"handler": handler_name, "outcome": outcome}) + _emit_histogram("handler_duration_ms", duration_ms, tags={"handler": handler_name}) + + +def _emit_counter(name, tags): + raise NotImplementedError + + +def _emit_histogram(name, value, tags): + raise NotImplementedError diff --git a/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts b/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts new file mode 100644 index 00000000..5a4fa5cf --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/metrics/collect.ts @@ -0,0 +1,36 @@ +// Request metric collection (TypeScript runtime). +// Sibling of collect.py — same contract, different runtime. + +interface Request { + path: string; + auth: { tenantId: string }; +} + +interface Response { + status: number; +} + +export function collectRequestMetrics( + request: Request, + response: Response, + latencyMs: number, +): void { + // Emit per-request metrics: latency, status, route, tenant. + // Fired by the API gateway middleware on every inbound request, + // after the response is generated. TS sibling of + // collect.py:collect_request_metrics. + emitCounter("requests_total", { route: request.path, status: String(response.status) }); + emitHistogram("request_latency_ms", latencyMs, { tenant: request.auth.tenantId }); +} + +function emitCounter(_name: string, _tags: Record): void { + throw new Error("not implemented"); +} + +function emitHistogram( + _name: string, + _value: number, + _tags: Record, +): void { + throw new Error("not implemented"); +} diff --git a/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py b/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py new file mode 100644 index 00000000..9349c1a0 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/middleware/global_rate_limit.py @@ -0,0 +1,30 @@ +"""Service-wide rate limiting — protects against abuse, not SLA. + +Distinct from tenant_rate_limit.py which enforces the Enterprise +contract. This caps total req/sec across the whole service to keep +the cluster within capacity headroom. +""" + + +class GlobalRateLimiter: + """Service-wide ceiling — 50_000 req/sec total across all tenants.""" + + GLOBAL_LIMIT_PER_SEC = 50_000 + + def __init__(self, store): + self._store = store + + def check(self, request): + """Cluster-level cap — not tied to any contract clause. + + Trips long before any tenant could; mostly defensive against + runaway crawlers and mis-tuned client retry storms. Tenant-scoped + SLA enforcement lives in tenant_rate_limit.py:TenantCheckoutRateLimiter. + """ + if self._store.count_in_window("__global__", window_seconds=1) >= self.GLOBAL_LIMIT_PER_SEC: + raise GlobalRateLimitExceeded + self._store.increment("__global__") + + +class GlobalRateLimitExceeded(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py b/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py new file mode 100644 index 00000000..7ba6b5f1 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/middleware/tenant_rate_limit.py @@ -0,0 +1,34 @@ +"""Per-tenant rate limiting for checkout endpoints. + +Enterprise SLA: 1000 req/min per tenant. Keyed on tenant_id resolved +from the request's auth claims. Distinct from global_rate_limit.py +which protects the whole service. +""" + + +class TenantCheckoutRateLimiter: + """Token bucket rate limiter scoped to (tenant_id, checkout_endpoint).""" + + LIMIT_PER_MIN = 1000 + + def __init__(self, store): + self._store = store + + def check(self, request): + """Return None if under cap; raise RateLimitExceeded if over. + + Implements the Enterprise SLA commitment: 1000 req/min per tenant + per checkout endpoint. Tenant id is read from request.auth.tenant_id. + Used by middleware/checkout_pipeline.py before reaching the + order handler. + """ + tenant_id = request.auth.tenant_id + endpoint = request.path + key = f"{tenant_id}:{endpoint}" + if self._store.count_in_window(key, window_seconds=60) >= self.LIMIT_PER_MIN: + raise RateLimitExceeded(key) + self._store.increment(key) + + +class RateLimitExceeded(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py new file mode 100644 index 00000000..c3619394 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.py @@ -0,0 +1,38 @@ +"""Inbound webhook event router (Python runtime). + +After signature verification, dispatch the event to the right +subscriber chain. Sibling of dispatch.ts for the TypeScript runtime. +""" + + +def dispatch_event(event): + """Route a verified webhook event to its subscriber handlers. + + Looks up the event type → handler list mapping, fans out to each + handler with a per-handler retry policy. Errors in one handler do + not abort the rest. Used by the webhook ingress after + verify_webhook_signature passes. + """ + handlers = _subscribers_for(event.type) + for handler in handlers: + try: + handler(event) + except Exception as exc: + _record_handler_failure(event, handler, exc) + + +def enqueue_dispatch(event): + """Queue an event for asynchronous dispatch — used when the + inbound request must respond immediately (Stripe 5s window).""" + _queue.put(event) + + +def _subscribers_for(event_type): + raise NotImplementedError + + +def _record_handler_failure(event, handler, exc): + raise NotImplementedError + + +_queue = None diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts new file mode 100644 index 00000000..ea03812e --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/webhooks/dispatch.ts @@ -0,0 +1,44 @@ +// Inbound webhook event router (TypeScript runtime). +// Sibling of dispatch.py — same routing contract, different runtime. + +export interface WebhookEvent { + type: string; + payload: unknown; +} + +type Handler = (event: WebhookEvent) => Promise | void; + +export async function dispatchEvent(event: WebhookEvent): Promise { + // Route a verified webhook event to its subscriber handlers. + // Looks up the event type → handler list mapping, fans out to each + // handler with a per-handler retry policy. Errors in one handler do + // not abort the rest. TS sibling of dispatch.py:dispatch_event. + const handlers = subscribersFor(event.type); + for (const handler of handlers) { + try { + await handler(event); + } catch (exc) { + recordHandlerFailure(event, handler, exc); + } + } +} + +export function enqueueDispatch(event: WebhookEvent): void { + // Queue an event for asynchronous dispatch — used when the inbound + // request must respond immediately (Stripe 5s window). + queue.push(event); +} + +function subscribersFor(_eventType: string): Handler[] { + throw new Error("not implemented"); +} + +function recordHandlerFailure( + _event: WebhookEvent, + _handler: Handler, + _exc: unknown, +): void { + throw new Error("not implemented"); +} + +const queue: WebhookEvent[] = []; diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py new file mode 100644 index 00000000..01eab8cb --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.py @@ -0,0 +1,28 @@ +"""HMAC-SHA256 webhook signature verification (Python runtime).""" + +import hashlib +import hmac + + +def verify_webhook_signature(body, signature_header, secret): + """Validate an incoming webhook's HMAC signature before processing. + + Implements the security contract: webhook payloads must carry a + valid HMAC-SHA256 signature in the X-Signature header, computed + over the raw body using the per-source shared secret. Constant-time + comparison via hmac.compare_digest. Distinct from verify.ts which + is the TypeScript-runtime sibling. + """ + expected = hmac.new(secret, body, hashlib.sha256).hexdigest() + if not hmac.compare_digest(expected, signature_header): + raise InvalidWebhookSignature + return True + + +def extract_signature_header(headers): + """Pull the signature from X-Signature, fallback to legacy X-Sig.""" + return headers.get("X-Signature") or headers.get("X-Sig") + + +class InvalidWebhookSignature(Exception): + pass diff --git a/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts new file mode 100644 index 00000000..5bf1cbf5 --- /dev/null +++ b/tests/fixtures/grounding_recall/repo/src/webhooks/verify.ts @@ -0,0 +1,33 @@ +// HMAC-SHA256 webhook signature verification (TypeScript runtime). +// Sibling of verify.py — same security contract, different runtime. + +import { createHmac, timingSafeEqual } from "node:crypto"; + +export class InvalidWebhookSignature extends Error {} + +export function verifyWebhookSignature( + body: Buffer, + signatureHeader: string, + secret: string, +): true { + // Validate an incoming webhook's HMAC signature before processing. + // Implements the security contract: webhook payloads must carry a + // valid HMAC-SHA256 signature in the X-Signature header, computed + // over the raw body using the per-source shared secret. Constant-time + // comparison via timingSafeEqual. Sibling of verify.py — same + // contract for the TypeScript runtime. + const expected = createHmac("sha256", secret).update(body).digest("hex"); + const a = Buffer.from(expected); + const b = Buffer.from(signatureHeader); + if (a.length !== b.length || !timingSafeEqual(a, b)) { + throw new InvalidWebhookSignature(); + } + return true; +} + +export function extractSignatureHeader( + headers: Record, +): string | undefined { + // Pull the signature from X-Signature, fallback to legacy X-Sig. + return headers["X-Signature"] ?? headers["X-Sig"]; +}