diff --git a/.claude/settings.json b/.claude/settings.json index 2b7f98a7..36c4619f 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -6,7 +6,16 @@ "hooks": [ { "type": "command", - "command": "python3 -c \"import json,sys,re; d=json.load(sys.stdin); c=d.get('tool_input',{}).get('command',''); ops=('git commit','git merge ','git pull','git rebase --continue'); [print('bicameral: git write-op detected — call bicameral.link_commit(commit_hash=\\'HEAD\\') now to sync the decision ledger') for _ in [1] if any(op in c for op in ops)]\"" + "command": "python3 scripts/hooks/post_commit_sync_reminder.py" + } + ] + }, + { + "matcher": "mcp__bicameral__bicameral_preflight", + "hooks": [ + { + "type": "command", + "command": "python3 scripts/hooks/post_preflight_capture_reminder.py" } ] } @@ -16,7 +25,17 @@ "hooks": [ { "type": "command", - "command": "[ -d .bicameral ] && claude -p '/bicameral:capture-corrections' || true" + "command": "[ -d .bicameral ] && [ -z \"$BICAMERAL_SESSION_END_RUNNING\" ] && BICAMERAL_SESSION_END_RUNNING=1 claude -p '/bicameral:capture-corrections --auto-ingest' || true" + } + ] + } + ], + "UserPromptSubmit": [ + { + "hooks": [ + { + "type": "command", + "command": "python3 scripts/hooks/preflight_reminder.py" } ] } diff --git a/.github/workflows/label-merged-to-dev.yml b/.github/workflows/label-merged-to-dev.yml new file mode 100644 index 00000000..c97c99a8 --- /dev/null +++ b/.github/workflows/label-merged-to-dev.yml @@ -0,0 +1,70 @@ +name: Apply merged-to-dev label + +on: + pull_request: + branches: [dev] + types: [closed] + +jobs: + label: + name: Label closed-by-PR issues + runs-on: ubuntu-latest + if: github.event.pull_request.merged == true + permissions: + issues: write + pull-requests: read + steps: + - name: Apply merged-to-dev label + uses: actions/github-script@v7 + with: + script: | + // Workflow caveat: this job needs Settings -> Actions -> + // General -> Workflow permissions set to "Read and write + // permissions" 
at the repo level. The job-level + // `permissions: issues: write` block can only NARROW what + // the repo allows, never expand it. If the repo default + // is read-only, addLabels returns 403 "Resource not + // accessible by integration" regardless of the job-level + // grant. + // + // See: #115 (root cause + symptoms) and #104 (admin-side + // fix tracked alongside branch-protection setup). + // + // GitHub close keywords (case-insensitive): close, closes, + // closed, fix, fixes, fixed, resolve, resolves, resolved. + const pr = context.payload.pull_request; + const body = pr.body || ""; + const closeRegex = /(?:close[sd]?|fix(?:es|ed)?|resolve[sd]?)\s+#(\d+)/gi; + const matches = [...body.matchAll(closeRegex)]; + const issues = [...new Set(matches.map(m => parseInt(m[1])))]; + + const failed = []; + for (const num of issues) { + try { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: num, + labels: ["merged-to-dev"] + }); + console.log(`Labeled #${num}`); + } catch (e) { + console.log(`Failed to label #${num}: ${e.message}`); + failed.push({ num, message: e.message }); + } + } + + if (failed.length > 0) { + const summary = failed + .map(f => ` - #${f.num}: ${f.message}`) + .join("\n"); + throw new Error( + `merged-to-dev labeller could not label ${failed.length} ` + + `issue(s) referenced by PR #${pr.number}:\n${summary}\n\n` + + `Most likely cause: repo Settings -> Actions -> General -> ` + + `Workflow permissions is set to read-only. ` + + `Job-level "permissions: issues: write" cannot expand a ` + + `read-only repo default. 
See #104 (admin fix) and ` + + `#115 (root cause).` + ); + } diff --git a/.github/workflows/lint-and-typecheck.yml b/.github/workflows/lint-and-typecheck.yml new file mode 100644 index 00000000..a8f8bd5d --- /dev/null +++ b/.github/workflows/lint-and-typecheck.yml @@ -0,0 +1,24 @@ +name: Lint & Type Check + +on: + pull_request: + branches: [main, dev] + +jobs: + lint: + name: ruff + mypy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + - name: Install + run: pip install -e ".[test]" + - name: Ruff check + run: ruff check . + - name: Ruff format check + run: ruff format --check . + - name: Mypy + run: mypy . diff --git a/.github/workflows/secret-scan.yml b/.github/workflows/secret-scan.yml new file mode 100644 index 00000000..7a04f54f --- /dev/null +++ b/.github/workflows/secret-scan.yml @@ -0,0 +1,24 @@ +name: Secret Scan + +on: + pull_request: + branches: [main, dev] + +# gitleaks-action@v2 requires a paid license for organizations +# (https://github.com/gitleaks/gitleaks-action#-announcement). +# We use trufflehog instead — free for all repos, equally capable +# detector ruleset, and faster cold-start than spinning up a +# gitleaks container. 
+jobs: + trufflehog: + name: TruffleHog + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # full history so trufflehog can scan the diff range + - uses: trufflesecurity/trufflehog@main + with: + base: ${{ github.event.pull_request.base.sha }} + head: ${{ github.event.pull_request.head.sha }} + extra_args: --only-verified diff --git a/.github/workflows/test-mcp-regression.yml b/.github/workflows/test-mcp-regression.yml index 4336950e..fdcacc0e 100644 --- a/.github/workflows/test-mcp-regression.yml +++ b/.github/workflows/test-mcp-regression.yml @@ -2,7 +2,7 @@ name: MCP Regression Tests on: pull_request: - branches: [main] + branches: [main, dev] env: PYTHON_VERSION: '3.11' @@ -10,7 +10,12 @@ env: jobs: mcp-tests: name: MCP Regression Suite - runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} + timeout-minutes: 20 # Needed so ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} / ANTHROPIC_API_KEY # (environment secrets scoped to `ci-test`) is injected into the M1 # extraction step. The env is gate-free so this does not block @@ -47,7 +52,10 @@ jobs: # ── Clone OSS repos for eval ground truth ──────────────────────── # Only medusa is needed — saleor/vendure were used by eval_code_locator.py # which was removed in v0.6.4 when search_code was nuked. + # Ubuntu-only: bash function syntax + medusa corpus consumed by + # the Linux-only M1 adversarial eval and E2E report below. - name: Clone eval repos (shallow, pinned commits) + if: matrix.os == 'ubuntu-latest' run: | clone_at_commit() { local repo_url=$1 dest=$2 commit=$3 @@ -80,6 +88,7 @@ jobs: # "secret is not set" from "secret is set to empty string" from # "secret is set correctly" without ever exposing the key. 
- name: M1 secret visibility probe + if: matrix.os == 'ubuntu-latest' run: | set +e if [ -n "${ANTHROPIC_API_KEY}" ]; then @@ -109,6 +118,7 @@ jobs: # as a red "M1 adversarial" step in the job without failing the # whole build, so the rest of the regression suite still reports. - name: M1 adversarial corpus eval (warn-only) + if: matrix.os == 'ubuntu-latest' continue-on-error: true env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} @@ -120,8 +130,12 @@ jobs: -o test-results/m1-adversarial.json # ── Generate rich E2E report from artifacts ──────────────────── + # Ubuntu-only: the script consumes the medusa adversarial corpus + # (cloned only on Ubuntu above) plus the Phase 3 E2E artifacts + # the report builds. Windows runs the unit + integration suite + # for cross-platform coverage but skips the corpus-driven E2E. - name: Generate E2E report - if: always() + if: always() && matrix.os == 'ubuntu-latest' run: python tests/generate_e2e_report.py # ── Generate step summary from JUnit XML ─────────────────────── @@ -137,6 +151,6 @@ jobs: uses: actions/upload-artifact@v4 if: always() with: - name: mcp-test-results + name: mcp-test-results-${{ matrix.os }} path: test-results/ retention-days: 30 diff --git a/.github/workflows/v0-user-flow-e2e.yml b/.github/workflows/v0-user-flow-e2e.yml new file mode 100644 index 00000000..28b47492 --- /dev/null +++ b/.github/workflows/v0-user-flow-e2e.yml @@ -0,0 +1,219 @@ +name: v0 user flow e2e + +# End-to-end validation of BicameralAI/bicameral#108's six canonical user +# flows via real Claude Code CLI sessions with bicameral-mcp registered. +# See tests/e2e/README.md for the design. +# +# Two-stage workflow: +# 1. assertions — always runs (PR + dispatch), no manual gate. Validates +# MCP tool callability + surfaces agentic-layer advisories. +# 2. recording — manual dispatch only, gated by an environment with +# required reviewers (`recording-approval`). 
Produces +# split-screen demo MP4s; expensive (~30-45 min wall + +# API spend), so worth gating behind explicit approval. +# +# Note: when this workflow file lands, it will not run on the PR that +# adds it — pull_request workflows execute the version on the base +# branch (main). First execution is on the next qualifying PR after merge. + +on: + pull_request: + branches: [main, dev] + paths: + - 'tests/e2e/**' + - 'handlers/**' + - 'ledger/**' + - 'contracts.py' + - 'skills/bicameral-**' + - 'server.py' + - 'pyproject.toml' + - '.github/workflows/v0-user-flow-e2e.yml' + workflow_dispatch: + +env: + PYTHON_VERSION: '3.11' + NODE_VERSION: '20' + # Pinned commit of github.com/desktop/desktop. Bump when the roadmap.md + # shape drifts in ways that break prompts, or when bind targets change. + DESKTOP_PINNED_COMMIT: 'e6c50fb028171e9cec03594273c8116bb135847e' + DESKTOP_REPO_PATH: /tmp/desktop-clone + +jobs: + # ── Stage 1: assertions — always runs ─────────────────────────────── + assertions: + name: e2e assertions (auto) + runs-on: ubuntu-latest + # production env provides CLAUDE_CODE_OAUTH_TOKEN. No required reviewers + # on this env → PR triggers flow through automatically. 
+ environment: production + timeout-minutes: 25 + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Setup Node.js (for Claude Code CLI) + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + + - name: Install bicameral-mcp + test deps + run: pip install -e ".[test]" + + - name: Install Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Verify CLI tooling on PATH + run: | + which claude && claude --version + which bicameral-mcp + + - name: Clone desktop/desktop at pinned commit + run: | + mkdir -p ${{ env.DESKTOP_REPO_PATH }} + cd ${{ env.DESKTOP_REPO_PATH }} + git init -q + git remote add origin https://github.com/desktop/desktop + git fetch --depth 1 origin "${DESKTOP_PINNED_COMMIT}" + git checkout FETCH_HEAD + git checkout -b main + git config user.email ci@bicameral.test + git config user.name CI + test -f docs/process/roadmap.md + test -f app/src/lib/git/cherry-pick.ts + + - name: Claude Code OAuth token visibility probe + run: | + set +e + if [ -n "${CLAUDE_CODE_OAUTH_TOKEN}" ]; then + echo "CLAUDE_CODE_OAUTH_TOKEN: present (length=${#CLAUDE_CODE_OAUTH_TOKEN})" + else + echo "CLAUDE_CODE_OAUTH_TOKEN: EMPTY or UNSET" + echo " secret expression non-empty: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN != '' }}" + exit 1 + fi + env: + CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + + - name: Run v0 user flow e2e (assertion-only, blocking) + env: + CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + run: python tests/e2e/run_e2e_flows.py + + - name: Upload e2e transcripts + if: always() + uses: actions/upload-artifact@v4 + with: + name: v0-user-flow-e2e-transcripts + path: test-results/e2e/ + retention-days: 30 + + # ── Stage 2: recording — manual approval required ──────────────────── + recording: + name: split-screen demo recording (manual approval) + # No `needs:` — runs in parallel 
with `assertions`. Advisory failures + # in the assertion harness must NOT block recording: the demo is + # meant to showcase the agentic gap as well as the wins, and the two + # paths have independent value (assertion = MCP-tool callability, + # recording = visual validation of the agentic layer). + # + # The `recording-approval` environment's required-reviewers rule is + # the SOLE gate. No `if:` predicate — adding one would skip the job + # on PR triggers (or on dispatch without an extra input toggle), so + # reviewers would never see the approval prompt. Letting the job + # always queue means it sits in "Waiting" until someone with reviewer + # permission clicks Approve in the Actions UI. + runs-on: ubuntu-latest + # `recording-approval` env should have required reviewers configured + # in repo settings → that's the manual gate. Inherits OAuth token from + # the same env (or repo-level secrets). + environment: recording-approval + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Setup Node.js (for Claude Code CLI) + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + + - name: Install bicameral-mcp + test deps + run: pip install -e ".[test]" + + - name: Install Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Verify CLI tooling on PATH + run: | + which claude && claude --version + which bicameral-mcp + + - name: Clone desktop/desktop at pinned commit + run: | + mkdir -p ${{ env.DESKTOP_REPO_PATH }} + cd ${{ env.DESKTOP_REPO_PATH }} + git init -q + git remote add origin https://github.com/desktop/desktop + git fetch --depth 1 origin "${DESKTOP_PINNED_COMMIT}" + git checkout FETCH_HEAD + git checkout -b main + git config user.email ci@bicameral.test + git config user.name CI + test -f docs/process/roadmap.md + test -f app/src/lib/git/cherry-pick.ts + + # NOTE: do NOT install 
`chromium-browser` here — on Ubuntu 22.04+ the + # apt package is a snap-store wrapper that hangs the runner. GitHub's + # ubuntu-latest image ships google-chrome-stable pre-installed; + # record_demo.sh auto-detects it. + - name: Install recording dependencies (Xvfb + ffmpeg + xterm + tmux) + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + xvfb fluxbox xterm ffmpeg tmux fonts-dejavu + command -v google-chrome-stable || command -v google-chrome || \ + command -v chromium || command -v chromium-browser || \ + { echo "ERROR: no chromium-compatible browser found on PATH" >&2; exit 1; } + + # ANTHROPIC_API_KEY (NOT CLAUDE_CODE_OAUTH_TOKEN) — interactive `claude` + # ignores the OAuth env var (verified against 2.1.126; matches GH issue + # #32463). The assertions job's `claude -p` path keeps using OAuth. + - name: Anthropic API key visibility probe + run: | + set +e + if [ -n "${ANTHROPIC_API_KEY}" ]; then + echo "ANTHROPIC_API_KEY: present (length=${#ANTHROPIC_API_KEY})" + else + echo "ANTHROPIC_API_KEY: EMPTY or UNSET" + echo " secret expression non-empty: ${{ secrets.ANTHROPIC_API_KEY != '' }}" + exit 1 + fi + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + + # continue-on-error: a recording flake should not propagate as a hard + # failure. The artifact upload below preserves whatever was captured. + # Uses the interactive (tmux-driven real claude TUI) path; legacy + # `tests/e2e/record_demo.sh` is retained as a fallback. 
+ - name: Record demo videos (split-screen, interactive TUI) + continue-on-error: true + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: bash tests/e2e/record_demo_interactive.sh + + - name: Upload demo videos + if: always() + uses: actions/upload-artifact@v4 + with: + name: v0-user-flow-e2e-demos + path: docs/demos/v0-userflow-e2e/*.mp4 + retention-days: 90 + if-no-files-found: warn diff --git a/.gitignore b/.gitignore index c32c25b9..fea06007 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,11 @@ test-results/ # Bicameral MCP local data (history stored in parent repo) .bicameral/ +# Demo MP4s — generated by the optional `record_demo` workflow path. +# Path-tracked under docs/demos/v0-userflow-e2e/ but binaries are +# distributed via the GitHub Actions artifact, not git. +docs/demos/**/*.mp4 + # QOR governance (process-only — not part of the published artifact) .agent/ .failsafe/ diff --git a/CHANGELOG.md b/CHANGELOG.md index f0486be6..68c0bf6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,30 @@ All notable changes to bicameral-mcp are tracked here. Format loosely follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +<<<<<<< triage-from-dev +## [Unreleased] + +### Added + +- `handlers/preflight.py` — `_region_anchored_preflight` now expands caller-supplied `file_paths` by 1 hop along the code-locator graph's **import edges** before the `binds_to` lookup. Lifts the strict exact-match recall ceiling so a decision bound to `app/src/lib/git/reorder.ts` surfaces when the caller passes the structurally-near `app/src/ui/multi-commit-operation/reorder.tsx`. Decisions reached only via expansion carry `confidence=0.7` (vs `0.9` for direct pins). `sources_chained` includes `"graph"` (alongside `"region"`) when expansion contributed at least one hit. Bounded per #64: ≤10 input seeds × `max_neighbors_per_result` neighbors per seed. Closes #173 (and supersedes #64). 
+- `adapters/code_locator.py::RealCodeLocatorAdapter.expand_file_paths_via_graph` — public method backing the expansion. Filters to ``imports`` edges only (file-level structural dependency); ``invokes`` / ``inherits`` / ``contains`` are symbol-level edges that over-broaden the file-level expansion. Returns `(expanded, added)` so callers can mark provenance. +- `skills/bicameral-preflight/SKILL.md` Step 2 — documents the imports-only expansion + caller-side `confidence` and `sources_chained` semantics. +- `tests/eval/preflight_dataset.jsonl` — M6 row flipped from XFAIL → live. Setup updated to specify graph-neighbor topology (`graph_neighbors`) and pinned-decision targets (`region_decisions_pinned_to`); the asserter now tests true graph-expansion semantics rather than mock-returns-decision-regardless-of-input. +- `tests/eval/run_preflight_eval.py` — `_apply_setup` extended with `region_decisions_pinned_to` (path-aware decision lookup) and `graph_neighbors` (stub code_graph) so M6-style scenarios can be expressed in the dataset. + +### Changed + +- `skills/bicameral-preflight/SKILL.md` Step 5.6 — judgment for contradiction-capture moves from the agent to the user via `AskUserQuestion` (Step 5.6.1). The agent no longer infers whether the prompt contradicts a surfaced decision; it asks the user (`supersede` / `keep_both` / `unrelated`) and acts mechanically on the answer (Step 5.6.2 — ingest + resolve_collision). The PostToolUse hook reminder now templates the disambiguation question rather than the bare ingest+resolve_collision sequence. Closes #175. +- `tests/e2e/run_e2e_flows.py::assert_flow_2a` — pass criterion changed from "ingest+resolve_collision fired" to "`AskUserQuestion` invoked with disambiguation shape after preflight surfaced ≥1 decision." The user-side response can't be driven in headless `claude -p`, so the testable signal is the question invocation. 
The mechanical capture (Step 5.6.2) only fires after a human answers and is exercised in interactive Claude Code sessions, not CI. + +### Fixed + +### Schema + +### Security + +======= +>>>>>>> main ## v0.13.6 — Triage: dashboard tooltip + capture-corrections source fix + #108 sim — built via [QorLogic SDLC](https://github.com/MythologIQ-Labs-LLC/qor-logic) Triage release per [DEV_CYCLE.md §10.5](DEV_CYCLE.md). Forwards three diff --git a/adapters/code_locator.py b/adapters/code_locator.py index ceb88624..64011d4b 100644 --- a/adapters/code_locator.py +++ b/adapters/code_locator.py @@ -63,6 +63,7 @@ def _ensure_initialized(self) -> None: ) self._db = db + self._config = config self._validate_tool = ValidateSymbolsTool(db, config) self._neighbors_tool = GetNeighborsTool(db, config) self._initialized = True @@ -90,6 +91,180 @@ def get_neighbors(self, symbol_id: int) -> list[dict]: results = self._neighbors_tool.execute({"symbol_id": symbol_id}) return [r.model_dump() for r in results] + # Hard cap on the number of caller-supplied seeds we expand. Mirrors the + # contract documented in #64: ≤10 input seeds × ≤max_neighbors_per_result + # neighbors per seed, so the worst-case response is still bounded even + # when the caller passes a large file_paths list. Tightens the cost + # envelope vs the per-config-only cap. Tunable via the PR's #64 lineage + # if telemetry shows we're losing recall. + _MAX_EXPANSION_SEEDS = 10 + + def expand_file_paths_via_graph( + self, + file_paths: list[str], + hops: int = 1, + ) -> tuple[list[str], list[str]]: + """Expand caller-supplied file paths to include 1-hop *import* graph + neighbors. + + For each input file, look up its indexed symbols, fetch each + symbol's 1-hop ego graph filtered to **import edges only**, and + collect the file paths those neighbor symbols live in. The expanded + set is the union of inputs and neighbor files. 
+ + **Why imports only** (per #64): import is a *file-level* structural + dependency edge ("module A's contract is referenced by module B"), + which matches the granularity of the region-anchored decision + lookup. ``invokes`` / ``inherits`` / ``contains`` are *symbol-level* + edges that broaden the expansion to "any file whose symbols are + used by my file's symbols," which over-fires for the recall + contract this method backs. If telemetry surfaces real-world + contradictions that imports-only misses, widen the filter then — + not preemptively. + + Returns ``(expanded, added)`` where ``expanded`` is the deduped + union (preserving caller order for inputs, then appending + newly-discovered neighbor files) and ``added`` is the list of file + paths NOT in the original input — the caller uses this to mark + expanded matches with lower confidence than direct pins. + + Bounds (mirrors #64's spec): + - At most ``_MAX_EXPANSION_SEEDS`` (=10) input seeds are walked. + - For each seed, at most ``max_neighbors_per_result`` symbols are + walked; for each symbol, at most ``max_neighbors_per_result`` + neighbors are inspected. + - Global cap on the added set is the product so the worst-case + response is still bounded for hub seeds. + Falls back gracefully (returns input unchanged + empty added list) + on any exception or if the symbol index is unavailable. + + Used by ``handlers/preflight.py::_region_anchored_preflight`` to + lift the strict ``WHERE file_path IN $fps`` recall ceiling so the + contradiction-capture loop fires even when the caller picks a + structurally-near-but-not-exact file. See issue #173 (and the + superseded #64 for the imports-only design rationale). + """ + if not file_paths or hops < 1: + return list(file_paths), [] + try: + self._ensure_initialized() + except Exception: + return list(file_paths), [] + + per_symbol_cap = self._config.max_neighbors_per_result + # Cap total NEW paths added by expansion. 
With ≤10 seeds and + # ≤per_symbol_cap neighbors each, the worst case is bounded. + global_cap = max(per_symbol_cap, per_symbol_cap * self._MAX_EXPANSION_SEEDS) + + # Cap the number of input seeds we expand from. Caller can still pass + # more file_paths to the underlying ledger lookup — we just don't + # blow up the graph walk. + seeds = [fp for fp in file_paths if fp][: self._MAX_EXPANSION_SEEDS] + + original_set = {fp for fp in file_paths if fp} + added_paths: list[str] = [] + added_set: set[str] = set() + + for fp in seeds: + try: + symbols = self._db.lookup_by_file(fp) or [] + except Exception: + continue + for sym in symbols[:per_symbol_cap]: + if len(added_paths) >= global_cap: + break + sym_id = sym["id"] + try: + neighbors = self._db.get_ego_graph(sym_id, hops=hops) or [] + except Exception: + continue + for n in neighbors[:per_symbol_cap]: + if len(added_paths) >= global_cap: + break + if (n.get("edge_type") or "") != "imports": + continue + nfp = (n.get("file_path") or "").strip() + if not nfp or nfp in original_set or nfp in added_set: + continue + added_set.add(nfp) + added_paths.append(nfp) + if len(added_paths) >= global_cap: + break + + # Preserve caller order for the input prefix; append newly-added in + # discovery order. + expanded: list[str] = [] + for fp in file_paths: + if fp and fp not in expanded: + expanded.append(fp) + expanded.extend(added_paths) + return expanded, added_paths + + def neighbors_for( + self, + file_path: str, + start_line: int, + end_line: int, + ) -> tuple[str, ...]: + """Return 1-hop neighbor symbol addresses for a code span. + + Phase 3 (#60) protocol: resolve the symbol at ``(file, start, end)`` + via the existing symbol index, fetch its 1-hop neighbors, return + their addresses (``"::"``) as a sorted tuple. + Returns ``()`` when no symbol resolves to the span — matcher + gracefully degrades on the Jaccard signal. 
+ """ + self._ensure_initialized() + try: + sym_id = self._resolve_symbol_id_for_span(file_path, start_line, end_line) + if sym_id is None: + return () + neighbors = self._neighbors_tool.execute({"symbol_id": sym_id}) + except Exception: + return () + addresses = sorted( + f"{getattr(n, 'file_path', '')}::{getattr(n, 'symbol_name', '') or getattr(n, 'name', '')}" + for n in neighbors + ) + return tuple(addresses) + + def _resolve_symbol_id_for_span( + self, + file_path: str, + start_line: int, + end_line: int, + ) -> int | None: + """Look up the symbol_id whose span contains the given line range. + + Uses the already-initialized ``self._db`` (set up in + ``_ensure_initialized``) via ``lookup_by_file``, then picks the + smallest enclosing symbol (most specific match). Returns + ``None`` if no symbol's span covers the requested range — + caller treats this as "no neighbors known" and the matcher's + Jaccard signal contributes zero. + + PR #73 review history: + - Earlier draft opened a fresh ``SymbolDB(...)`` per call, + leaking SQLite handles (CodeRabbit MAJOR adapters/code_locator.py:136). + - It also referenced ``config.sqlite_db_path``, which doesn't + exist on ``CodeLocatorConfig`` — the real attribute is + ``sqlite_db``. The ``AttributeError`` was silently swallowed + by ``neighbors_for``'s broad ``except``, so the method + always returned ``()`` and the continuity Jaccard signal + was permanently zero in production (Devin CRITICAL). + Both fixed by reusing ``self._db``. 
+ """ + rows = self._db.lookup_by_file(file_path) + best_id: int | None = None + best_span: int = 1 << 30 + for r in rows: + r_start, r_end = r["start_line"], r["end_line"] + if r_start <= start_line and r_end >= end_line: + span = r_end - r_start + if span < best_span: + best_span, best_id = span, r["id"] + return best_id + async def extract_symbols(self, file_path: str) -> list[dict]: """Extract symbols from a file via tree-sitter (no LLM).""" from code_locator.indexing.symbol_extractor import extract_symbols @@ -102,12 +277,14 @@ async def extract_symbols(self, file_path: str) -> list[dict]: sym_type = rec.type if sym_type not in ("function", "class", "module", "file"): sym_type = "function" - symbols.append({ - "name": rec.qualified_name or rec.name, - "type": sym_type, - "start_line": rec.start_line, - "end_line": rec.end_line, - }) + symbols.append( + { + "name": rec.qualified_name or rec.name, + "type": sym_type, + "start_line": rec.start_line, + "end_line": rec.end_line, + } + ) return symbols def resolve_symbols(self, payload: dict) -> dict: @@ -117,10 +294,7 @@ def resolve_symbols(self, payload: dict) -> dict: if not mappings: return payload - needs_resolution = any( - m.get("symbols") and not m.get("code_regions") - for m in mappings - ) + needs_resolution = any(m.get("symbols") and not m.get("code_regions") for m in mappings) if not needs_resolution: return payload @@ -141,21 +315,27 @@ def resolve_symbols(self, payload: dict) -> dict: try: rows = db.lookup_by_name(name) except Exception as exc: - logger.warning("[resolve_symbols] lookup_by_name failed for '%s': %s", name, exc) + logger.warning( + "[resolve_symbols] lookup_by_name failed for '%s': %s", name, exc + ) rows = [] for row in rows: - code_regions.append({ - "symbol": row["qualified_name"] or row["name"], - "file_path": row["file_path"], - "start_line": row["start_line"], - "end_line": row["end_line"], - "type": row["type"], - "purpose": mapping.get("intent", ""), - }) + code_regions.append( + 
{ + "symbol": row["qualified_name"] or row["name"], + "file_path": row["file_path"], + "start_line": row["start_line"], + "end_line": row["end_line"], + "type": row["type"], + "purpose": mapping.get("intent", ""), + } + ) if code_regions: mapping = {**mapping, "code_regions": code_regions} else: - logger.debug("[resolve_symbols] no symbols found in index for: %s", symbol_names) + logger.debug( + "[resolve_symbols] no symbols found in index for: %s", symbol_names + ) resolved_mappings.append(mapping) diff --git a/adapters/ledger.py b/adapters/ledger.py index 3516d7c9..71341c5b 100644 --- a/adapters/ledger.py +++ b/adapters/ledger.py @@ -34,6 +34,7 @@ def _read_collaboration_mode(repo_path: str) -> str: return "solo" try: import yaml + config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} return config.get("mode", "solo") except Exception: @@ -66,9 +67,9 @@ def get_ledger(): mode = _read_collaboration_mode(repo_path) if mode == "team": - from events.writer import EventFileWriter, _get_git_email from events.materializer import EventMaterializer from events.team_adapter import TeamWriteAdapter + from events.writer import EventFileWriter, _get_git_email # BICAMERAL_DATA_PATH redirects all history (events + local state) # to a separate directory — typically a private parent repo when @@ -103,4 +104,5 @@ def get_drift_analyzer(): or CodeGenomeDriftAnalyzer when ready. 
""" from ledger.drift import HashDriftAnalyzer + return HashDriftAnalyzer() diff --git a/cli/__init__.py b/cli/__init__.py new file mode 100644 index 00000000..6a6f550b --- /dev/null +++ b/cli/__init__.py @@ -0,0 +1 @@ +"""Bicameral-MCP CLI utilities.""" diff --git a/code_locator/indexing/cocoindex_pipeline.py b/code_locator/indexing/cocoindex_pipeline.py index bed6b4f3..67d170f2 100644 --- a/code_locator/indexing/cocoindex_pipeline.py +++ b/code_locator/indexing/cocoindex_pipeline.py @@ -129,9 +129,7 @@ def extract_file_symbols(filename: str, content: str) -> list[dict]: def text_to_embedding( text: cocoindex.DataSlice[str], ) -> cocoindex.DataSlice[list[float]]: - return text.transform( - cocoindex.functions.SentenceTransformerEmbed(model=embedding_model) - ) + return text.transform(cocoindex.functions.SentenceTransformerEmbed(model=embedding_model)) @cocoindex.flow_def(name="CodeLocatorIndex") def code_locator_flow( @@ -175,9 +173,7 @@ def code_locator_flow( ) # Path 2: Symbol extraction - file["symbols"] = file["content"].transform( - extract_file_symbols, file["filename"] - ) + file["symbols"] = file["content"].transform(extract_file_symbols, file["filename"]) with file["symbols"].row() as sym: symbol_collector.collect( @@ -292,8 +288,10 @@ def _count_cocoindex_table(table_name: str) -> int: Falls back to 0 if the table doesn't exist or connection fails. 
""" import os + try: import psycopg2 + url = os.environ.get("COCOINDEX_DATABASE_URL", "") if not url: return 0 diff --git a/code_locator/indexing/graph_builder.py b/code_locator/indexing/graph_builder.py index 32e5bd1f..6945e32d 100644 --- a/code_locator/indexing/graph_builder.py +++ b/code_locator/indexing/graph_builder.py @@ -8,19 +8,17 @@ import os from pathlib import Path -from typing import Dict, List, Set, Tuple from .sqlite_store import SymbolDB from .symbol_extractor import ( EXTENSION_LANGUAGE, - SKIP_DIRS, _get_parser, _node_text, ) - # ── Contains edges ─────────────────────────────────────────────────── + def _build_contains_edges(db: SymbolDB) -> list[tuple[int, int, str]]: """Build parent->child edges using parent_qualified_name.""" conn = db._connect() @@ -50,6 +48,7 @@ def _build_contains_edges(db: SymbolDB) -> list[tuple[int, int, str]]: # ── Import edges ───────────────────────────────────────────────────── + def _extract_python_imports(tree, code: bytes) -> list[str]: """Extract imported names from Python import statements.""" names: list[str] = [] @@ -73,7 +72,11 @@ def walk(node): if node.type == "import_from_statement": # from foo import bar, baz for child in node.children: - if child.type == "dotted_name" and child.prev_sibling and _node_text(code, child.prev_sibling) == "import": + if ( + child.type == "dotted_name" + and child.prev_sibling + and _node_text(code, child.prev_sibling) == "import" + ): names.append(_node_text(code, child)) elif child.type == "aliased_import": alias = child.child_by_field_name("alias") @@ -198,6 +201,7 @@ def _extract_imports_for_language(language_id: str, tree, code: bytes) -> list[s # ── Invokes edges ──────────────────────────────────────────────────── + def _extract_call_names(tree, code: bytes, language_id: str) -> list[tuple[int, str]]: """Extract (line_number, called_function_name) from call expressions. 
@@ -230,6 +234,7 @@ def walk(node): # ── Main builder ───────────────────────────────────────────────────── + def build_graph(db: SymbolDB, repo_path: str) -> int: """Build dependency edges for all indexed symbols. Returns edge count.""" # Clear old edges — full rebuild is fast relative to symbol extraction @@ -250,7 +255,7 @@ def build_graph(db: SymbolDB, repo_path: str) -> int: ).fetchall() # Map: name -> list of symbol ids (multiple symbols can have the same name) - name_to_ids: Dict[str, list[int]] = {} + name_to_ids: dict[str, list[int]] = {} for sym in all_symbols: name = sym[1] if name not in name_to_ids: @@ -274,7 +279,7 @@ def build_graph(db: SymbolDB, repo_path: str) -> int: continue try: - with open(abs_path, "r", encoding="utf-8", errors="replace") as f: + with open(abs_path, encoding="utf-8", errors="replace") as f: source = f.read() except OSError: continue @@ -301,7 +306,7 @@ def build_graph(db: SymbolDB, repo_path: str) -> int: for row in file_all_symbols: all_file_sym_ids.add(row[0]) - seen_import_edges: Set[Tuple[int, int]] = set() + seen_import_edges: set[tuple[int, int]] = set() for imp_name in imported_names: target_ids = name_to_ids.get(imp_name, []) for target_id in target_ids: @@ -324,7 +329,7 @@ def build_graph(db: SymbolDB, repo_path: str) -> int: (rel_path,), ).fetchall() - seen_invoke_edges: Set[Tuple[int, int]] = set() + seen_invoke_edges: set[tuple[int, int]] = set() for func in func_symbols: func_id = func[0] func_start = func[2] diff --git a/code_locator/indexing/index_builder.py b/code_locator/indexing/index_builder.py index bf66f885..cf1e1d1c 100644 --- a/code_locator/indexing/index_builder.py +++ b/code_locator/indexing/index_builder.py @@ -93,6 +93,7 @@ def build_index(repo_path: str, db_path: str) -> IndexStats: # Build dependency graph edges from .graph_builder import build_graph + stats.edges_created = build_graph(db, repo_path) db.close() diff --git a/code_locator/indexing/sqlite_store.py 
b/code_locator/indexing/sqlite_store.py index 0f744fd9..a1a7e649 100644 --- a/code_locator/indexing/sqlite_store.py +++ b/code_locator/indexing/sqlite_store.py @@ -9,7 +9,6 @@ import sqlite3 from dataclasses import dataclass from pathlib import Path -from typing import Any @dataclass @@ -96,8 +95,16 @@ def insert_symbols_batch(self, symbols: list[SymbolRecord]) -> None: (name, qualified_name, type, file_path, start_line, end_line, signature, parent_qualified_name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", [ - (s.name, s.qualified_name, s.type, s.file_path, - s.start_line, s.end_line, s.signature, s.parent_qualified_name) + ( + s.name, + s.qualified_name, + s.type, + s.file_path, + s.start_line, + s.end_line, + s.signature, + s.parent_qualified_name, + ) for s in symbols ], ) @@ -110,21 +117,15 @@ def delete_file_symbols(self, file_path: str) -> None: def lookup_by_name(self, name: str) -> list[sqlite3.Row]: conn = self._connect() - return conn.execute( - "SELECT * FROM symbols WHERE name = ?", (name,) - ).fetchall() + return conn.execute("SELECT * FROM symbols WHERE name = ?", (name,)).fetchall() def lookup_by_file(self, file_path: str) -> list[sqlite3.Row]: conn = self._connect() - return conn.execute( - "SELECT * FROM symbols WHERE file_path = ?", (file_path,) - ).fetchall() + return conn.execute("SELECT * FROM symbols WHERE file_path = ?", (file_path,)).fetchall() def get_all_symbol_names(self) -> list[tuple[int, str, str]]: conn = self._connect() - rows = conn.execute( - "SELECT id, name, qualified_name FROM symbols" - ).fetchall() + rows = conn.execute("SELECT id, name, qualified_name FROM symbols").fetchall() return [(r[0], r[1], r[2]) for r in rows] def symbol_count(self) -> int: @@ -133,9 +134,7 @@ def symbol_count(self) -> int: def lookup_by_id(self, symbol_id: int) -> sqlite3.Row | None: conn = self._connect() - return conn.execute( - "SELECT * FROM symbols WHERE id = ?", (symbol_id,) - ).fetchone() + return conn.execute("SELECT * FROM symbols WHERE id = ?", 
(symbol_id,)).fetchone() def delete_all_edges(self) -> None: conn = self._connect() diff --git a/code_locator/indexing/symbol_extractor.py b/code_locator/indexing/symbol_extractor.py index 6b74deb5..51f246a1 100644 --- a/code_locator/indexing/symbol_extractor.py +++ b/code_locator/indexing/symbol_extractor.py @@ -6,8 +6,6 @@ from __future__ import annotations -from typing import Dict, List, Optional - from .sqlite_store import SymbolRecord # ── Language mappings ──────────────────────────────────────────────── @@ -39,14 +37,16 @@ _USE_LEGACY = False try: - from tree_sitter_languages import get_language as _legacy_get_language, get_parser as _legacy_get_parser + from tree_sitter_languages import get_language as _legacy_get_language + from tree_sitter_languages import get_parser as _legacy_get_parser + _USE_LEGACY = True except Exception: _legacy_get_language = None _legacy_get_parser = None # Individual language packages for the modern API -_LANG_MODULES: Dict[str, object] = {} +_LANG_MODULES: dict[str, object] = {} if not _USE_LEGACY: try: @@ -66,8 +66,8 @@ # ── Parser caching ─────────────────────────────────────────────────── -PARSER_CACHE: Dict[str, object] = {} -LANGUAGE_CACHE: Dict[str, object] = {} +PARSER_CACHE: dict[str, object] = {} +LANGUAGE_CACHE: dict[str, object] = {} def _get_language_obj(resolved: str): @@ -84,6 +84,7 @@ def _get_language_obj(resolved: str): if pkg_name not in _LANG_MODULES: import importlib + mod = importlib.import_module(pkg_name) _LANG_MODULES[pkg_name] = mod @@ -109,11 +110,12 @@ def _get_parser(language_id: str): # ── Helpers ────────────────────────────────────────────────────────── + def _node_text(code: bytes, node) -> str: - return code[node.start_byte:node.end_byte].decode("utf-8", errors="replace") + return code[node.start_byte : node.end_byte].decode("utf-8", errors="replace") -def _get_name_from_node(node, code: bytes) -> Optional[str]: +def _get_name_from_node(node, code: bytes) -> str | None: name_node = 
node.child_by_field_name("name") if name_node is None: return None @@ -148,10 +150,11 @@ def _make_record( # ── Python ─────────────────────────────────────────────────────────── -def _extract_python_defs(tree, code: bytes, rel_path: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] - def walk(node, class_stack: List[str]): +def _extract_python_defs(tree, code: bytes, rel_path: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] + + def walk(node, class_stack: list[str]): if node.type == "class_definition": name = _get_name_from_node(node, code) if not name: @@ -187,14 +190,15 @@ def walk(node, class_stack: List[str]): # ── JavaScript / TypeScript / JSX / TSX ────────────────────────────── -def _extract_js_ts_defs(tree, code: bytes, rel_path: str, language_id: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] + +def _extract_js_ts_defs(tree, code: bytes, rel_path: str, language_id: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] class_types = {"class_declaration"} if language_id in ("typescript", "tsx"): class_types.update({"interface_declaration", "type_alias_declaration", "enum_declaration"}) - def walk(node, class_stack: List[str]): + def walk(node, class_stack: list[str]): if node.type in class_types: name = _get_name_from_node(node, code) if not name: @@ -250,11 +254,12 @@ def walk(node, class_stack: List[str]): # ── Java ───────────────────────────────────────────────────────────── -def _extract_java_defs(tree, code: bytes, rel_path: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] + +def _extract_java_defs(tree, code: bytes, rel_path: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] class_types = {"class_declaration", "interface_declaration", "enum_declaration"} - def walk(node, class_stack: List[str]): + def walk(node, class_stack: list[str]): if node.type in class_types: name = _get_name_from_node(node, code) if not name: @@ -288,10 +293,11 @@ def walk(node, class_stack: 
List[str]): # ── Go ─────────────────────────────────────────────────────────────── -def _extract_go_defs(tree, code: bytes, rel_path: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] - def walk(node, class_stack: List[str]): +def _extract_go_defs(tree, code: bytes, rel_path: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] + + def walk(node, class_stack: list[str]): if node.type == "type_spec": type_node = node.child_by_field_name("type") if type_node is not None and type_node.type in ("struct_type", "interface_type"): @@ -326,11 +332,12 @@ def walk(node, class_stack: List[str]): # ── Rust ───────────────────────────────────────────────────────────── -def _extract_rust_defs(tree, code: bytes, rel_path: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] + +def _extract_rust_defs(tree, code: bytes, rel_path: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] class_types = {"struct_item", "enum_item", "trait_item"} - def walk(node, class_stack: List[str]): + def walk(node, class_stack: list[str]): if node.type in class_types: name = _get_name_from_node(node, code) if not name: @@ -356,11 +363,17 @@ def walk(node, class_stack: List[str]): # ── C# ─────────────────────────────────────────────────────────────── -def _extract_csharp_defs(tree, code: bytes, rel_path: str) -> List[SymbolRecord]: - records: List[SymbolRecord] = [] - class_types = {"class_declaration", "interface_declaration", "struct_declaration", "enum_declaration"} - def walk(node, class_stack: List[str]): +def _extract_csharp_defs(tree, code: bytes, rel_path: str) -> list[SymbolRecord]: + records: list[SymbolRecord] = [] + class_types = { + "class_declaration", + "interface_declaration", + "struct_declaration", + "enum_declaration", + } + + def walk(node, class_stack: list[str]): if node.type in class_types: name = _get_name_from_node(node, code) if not name: @@ -394,7 +407,8 @@ def walk(node, class_stack: List[str]): # ── Dispatch 
───────────────────────────────────────────────────────── -def _extract_definitions(language_id: str, tree, code: bytes, rel_path: str) -> List[SymbolRecord]: + +def _extract_definitions(language_id: str, tree, code: bytes, rel_path: str) -> list[SymbolRecord]: if language_id == "python": return _extract_python_defs(tree, code, rel_path) if language_id in ("javascript", "jsx", "typescript", "tsx"): @@ -412,6 +426,7 @@ def _extract_definitions(language_id: str, tree, code: bytes, rel_path: str) -> # ── Public API ─────────────────────────────────────────────────────── + def extract_symbols_from_content( content: str, language_id: str, rel_path: str ) -> list[SymbolRecord]: @@ -453,7 +468,7 @@ def extract_symbols(file_path: str, repo_root: str) -> list[SymbolRecord]: rel_path = Path(file_path).relative_to(repo_root).as_posix() - with open(file_path, "r", encoding="utf-8", errors="replace") as f: + with open(file_path, encoding="utf-8", errors="replace") as f: source = f.read() return extract_symbols_from_content(source, language_id, rel_path) diff --git a/code_locator/models.py b/code_locator/models.py index a06de85c..2a4d8a27 100644 --- a/code_locator/models.py +++ b/code_locator/models.py @@ -9,7 +9,6 @@ from pydantic import BaseModel, Field - # ── Input (from Agent A: Transcript Extractor) ────────────────────── @@ -44,12 +43,8 @@ class ValidatedSymbol(BaseModel): original_candidate: str = Field(description="What the LLM (or keyword extractor) proposed") matched_symbol: str = Field(description="The real symbol from the index that matched") - match_score: float = Field( - ge=0.0, le=100.0, description="rapidfuzz match score (0-100)" - ) - symbol_id: int | None = Field( - default=None, description="SQLite row ID of the matched symbol" - ) + match_score: float = Field(ge=0.0, le=100.0, description="rapidfuzz match score (0-100)") + symbol_id: int | None = Field(default=None, description="SQLite row ID of the matched symbol") repo: str = Field(default="", 
description="Source repo for multi-repo support") bridge_method: str = Field( default="rapidfuzz_validate", @@ -96,9 +91,7 @@ class Provenance(BaseModel): bridge_match_score: float = Field( default=0.0, description="rapidfuzz score of the bridge match" ) - bridge_method: str = Field( - default="", description="How the bridge candidate was generated" - ) + bridge_method: str = Field(default="", description="How the bridge candidate was generated") rrf_score: float = Field(default=0.0, description="Weighted RRF fusion score") @@ -112,7 +105,9 @@ class NeighborInfo(BaseModel): file_path: str = Field(description="Path relative to repo root") line_number: int = Field(default=0) edge_type: str = Field(description="Relationship: contains, imports, invokes, inherits") - direction: str = Field(description="forward (this calls neighbor) or backward (neighbor calls this)") + direction: str = Field( + description="forward (this calls neighbor) or backward (neighbor calls this)" + ) # ── Output (to Agent C: Evidence Gater) ────────────────────────────── @@ -134,5 +129,3 @@ class FoundComponent(BaseModel): neighbors: list[NeighborInfo] = Field( default_factory=list, description="1-hop structural neighbors" ) - - diff --git a/code_locator/tools/validate_symbols.py b/code_locator/tools/validate_symbols.py index c0d02707..7b1a68cb 100644 --- a/code_locator/tools/validate_symbols.py +++ b/code_locator/tools/validate_symbols.py @@ -2,10 +2,11 @@ from __future__ import annotations +from rapidfuzz import fuzz + from ..config import CodeLocatorConfig from ..indexing.sqlite_store import SymbolDB from ..models import ValidatedSymbol -from rapidfuzz import fuzz # JSON Schema for tool parameter validation TOOL_SCHEMA = { diff --git a/code_locator_runtime.py b/code_locator_runtime.py index 4dc43c57..733e2888 100644 --- a/code_locator_runtime.py +++ b/code_locator_runtime.py @@ -48,8 +48,6 @@ def ensure_runtime_env() -> None: os.environ.setdefault("CODE_LOCATOR_SQLITE_DB", str(cache_root / 
"code-graph.db")) - - def _git_stdout(repo_path: str, *args: str) -> str: try: result = subprocess.run( diff --git a/codegenome/adapter.py b/codegenome/adapter.py index 306192e2..9850ddf5 100644 --- a/codegenome/adapter.py +++ b/codegenome/adapter.py @@ -13,12 +13,23 @@ from typing import Any, Literal EvidenceType = Literal[ - "code", "test", "diff", "runtime", "doc", "decision", "agent_eval", "manual", + "code", + "test", + "diff", + "runtime", + "doc", + "decision", + "agent_eval", + "manual", ] DriftStatus = Literal[ - "reflected", "drifted", "pending", "ungrounded", - "semantically_preserved", "needs_review", + "reflected", + "drifted", + "pending", + "ungrounded", + "semantically_preserved", + "needs_review", ] diff --git a/codegenome/bind_service.py b/codegenome/bind_service.py index 0e8ea5d3..bfed2595 100644 --- a/codegenome/bind_service.py +++ b/codegenome/bind_service.py @@ -40,14 +40,23 @@ def _check_hash_parity( logger.warning( "[codegenome] identity content_hash %s != region content_hash %s " "(decision_id=%s, %s:%d-%d) — writing identity anyway", - identity.content_hash, code_region_content_hash, - decision_id, file_path, start_line, end_line, + identity.content_hash, + code_region_content_hash, + decision_id, + file_path, + start_line, + end_line, ) async def _persist_subject_and_identity( - *, ledger, identity: SubjectIdentity, - kind: str, canonical_name: str, decision_id: str, repo_ref: str, + *, + ledger, + identity: SubjectIdentity, + kind: str, + canonical_name: str, + decision_id: str, + repo_ref: str, ) -> bool: """Run the four ledger writes; return ``True`` on full success. @@ -57,13 +66,16 @@ async def _persist_subject_and_identity( that as identity-not-written. 
""" subject_id = await ledger.upsert_code_subject( - kind=kind, canonical_name=canonical_name, - current_confidence=identity.confidence, repo_ref=repo_ref, + kind=kind, + canonical_name=canonical_name, + current_confidence=identity.confidence, + repo_ref=repo_ref, ) if not subject_id: logger.warning( "[codegenome] upsert_code_subject empty id for %s/%s", - kind, canonical_name, + kind, + canonical_name, ) return False @@ -107,8 +119,12 @@ async def write_codegenome_identity( repo_ref=repo_ref, ) _check_hash_parity( - identity, code_region_content_hash, - decision_id, file_path, start_line, end_line, + identity, + code_region_content_hash, + decision_id, + file_path, + start_line, + end_line, ) persisted = await _persist_subject_and_identity( ledger=ledger, diff --git a/codegenome/confidence.py b/codegenome/confidence.py index 9345de5e..c3a23cbc 100644 --- a/codegenome/confidence.py +++ b/codegenome/confidence.py @@ -4,17 +4,16 @@ from collections.abc import Iterable, Mapping - # Default weights for the confidence model defined in the architecture # plan; referenced by Phase 3+4 callers (continuity, drift classifier). # Lives here so future phases import from one place without restructuring. 
DEFAULT_CONFIDENCE_WEIGHTS: dict[str, float] = { - "subject_resolution": 0.25, - "structural_identity": 0.20, - "content_similarity": 0.15, + "subject_resolution": 0.25, + "structural_identity": 0.20, + "content_similarity": 0.15, "call_graph_similarity": 0.15, - "test_support": 0.15, - "runtime_support": 0.10, + "test_support": 0.15, + "runtime_support": 0.10, } diff --git a/codegenome/deterministic_adapter.py b/codegenome/deterministic_adapter.py index 8773a3d1..1edb1e76 100644 --- a/codegenome/deterministic_adapter.py +++ b/codegenome/deterministic_adapter.py @@ -56,7 +56,11 @@ def compute_identity( address = f"cg:{signature_hash}" content = get_git_content( - file_path, start_line, end_line, self.repo_path, ref=repo_ref, + file_path, + start_line, + end_line, + self.repo_path, + ref=repo_ref, ) if content is None or start_line < 1 or end_line < start_line: content_hash: str | None = None diff --git a/consent.py b/consent.py index 9e5f5494..2814de00 100644 --- a/consent.py +++ b/consent.py @@ -27,9 +27,10 @@ import logging import os import sys -from datetime import datetime, timezone +from collections.abc import Callable +from datetime import UTC, datetime from pathlib import Path -from typing import Any, Callable +from typing import Any logger = logging.getLogger(__name__) @@ -70,7 +71,7 @@ def write_consent(telemetry: bool, *, via: str) -> None: record: dict[str, Any] = { "telemetry": "enabled" if telemetry else "disabled", "policy_version": POLICY_VERSION, - "acknowledged_at": datetime.now(timezone.utc).isoformat(), + "acknowledged_at": datetime.now(UTC).isoformat(), "acknowledged_via": via, } _CONSENT_FILE.parent.mkdir(parents=True, exist_ok=True) diff --git a/context.py b/context.py index e2a84fef..1d65f8a6 100644 --- a/context.py +++ b/context.py @@ -36,6 +36,7 @@ def _read_guided_mode(repo_path: str) -> bool: return False try: import yaml + config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} return bool(config.get("guided", False)) 
except Exception: @@ -93,7 +94,11 @@ def from_env(cls) -> BicameralContext: from adapters.code_locator import get_code_locator from adapters.codegenome import get_codegenome from adapters.ledger import get_drift_analyzer, get_ledger - from code_locator_runtime import detect_authoritative_ref, get_repo_index_state, resolve_ref_sha + from code_locator_runtime import ( + detect_authoritative_ref, + get_repo_index_state, + resolve_ref_sha, + ) from codegenome.config import CodeGenomeConfig repo_path = os.getenv("REPO_PATH", ".") diff --git a/contracts.py b/contracts.py index dadc8d56..c7e7b1c4 100644 --- a/contracts.py +++ b/contracts.py @@ -18,7 +18,6 @@ from pydantic import BaseModel, ConfigDict - # ── Skill telemetry diagnostic models ──────────────────────────────── # One model per skill. extra="forbid" means the handler can detect and # echo back any field names the LLM sent that don't belong here. @@ -85,13 +84,14 @@ class SyncMetrics(BaseModel): be ``None`` if that path did not run in the handler — e.g. ledger was already synced, or the handler did not take the write barrier. """ + sync_catchup_ms: float | None = None barrier_held_ms: float | None = None - class CodeRegionSummary(BaseModel): """Lean code region for MCP responses — no pipeline metadata.""" + file_path: str symbol: str lines: tuple[int, int] # (start_line, end_line) @@ -116,13 +116,15 @@ class DecisionStatusEntry(BaseModel): decision_id: str description: str status: Literal["reflected", "drifted", "pending", "ungrounded"] - signoff_state: str | None = None # proposed | ratified | rejected | collision_pending | context_pending | superseded - source_type: str # transcript | notion | document | manual | implementation_choice - source_ref: str # meeting ID, Notion page ID, etc. 
- ingested_at: str # ISO datetime + signoff_state: str | None = ( + None # proposed | ratified | rejected | collision_pending | context_pending | superseded + ) + source_type: str # transcript | notion | document | manual | implementation_choice + source_ref: str # meeting ID, Notion page ID, etc. + ingested_at: str # ISO datetime code_regions: list[CodeRegionSummary] - drift_evidence: str = "" # populated when status = "drifted" - blast_radius: list[str] = [] # symbol names of structural dependents (1-hop) + drift_evidence: str = "" # populated when status = "drifted" + blast_radius: list[str] = [] # symbol names of structural dependents (1-hop) source_excerpt: str = "" meeting_date: str = "" speakers: list[str] = [] @@ -130,9 +132,9 @@ class DecisionStatusEntry(BaseModel): class DecisionStatusResponse(BaseModel): - ref: str # git ref evaluated against - as_of: str # ISO datetime of evaluation - summary: dict[str, int] # {"reflected": N, "drifted": N, ...} + ref: str # git ref evaluated against + as_of: str # ISO datetime of evaluation + summary: dict[str, int] # {"reflected": N, "drifted": N, ...} decisions: list[DecisionStatusEntry] @@ -141,10 +143,12 @@ class DecisionStatusResponse(BaseModel): class DecisionMatch(BaseModel): decision_id: str - description: str # the original decision text + description: str # the original decision text status: Literal["reflected", "drifted", "pending", "ungrounded"] - signoff_state: str | None = None # proposed | ratified | rejected | collision_pending | context_pending | superseded - confidence: float # BM25 match score (0–1) + signoff_state: str | None = ( + None # proposed | ratified | rejected | collision_pending | context_pending | superseded + ) + confidence: float # BM25 match score (0–1) source_ref: str code_regions: list[CodeRegionSummary] drift_evidence: str = "" @@ -164,17 +168,19 @@ class ComplianceVerdict(BaseModel): this decision. Server will prune the binds_to edge and record compliance_check with pruned=true. 
""" + decision_id: str region_id: str - content_hash: str # echoed from PendingComplianceCheck.content_hash + content_hash: str # echoed from PendingComplianceCheck.content_hash verdict: Literal["compliant", "drifted", "not_relevant"] confidence: Literal["high", "medium", "low"] - explanation: str # one-sentence rationale for audit trail + explanation: str # one-sentence rationale for audit trail phase_metadata: dict = {} class ResolveComplianceRejection(BaseModel): """Structured rejection for a verdict that failed input validation.""" + decision_id: str region_id: str reason: Literal[ @@ -200,6 +206,7 @@ class ResolveComplianceResponse(BaseModel): pruned=true). Holistic status is projected via project_decision_status after all verdicts in the batch are written. """ + phase: Literal["ingest", "drift", "regrounding", "supersession", "divergence"] accepted: list[ResolveComplianceAccepted] = [] rejected: list[ResolveComplianceRejection] = [] @@ -210,19 +217,21 @@ class PendingComplianceCheck(BaseModel): v0.5.0: decision_id replaces intent_id. 
""" + phase: Literal["ingest", "drift", "regrounding"] decision_id: str region_id: str decision_description: str file_path: str symbol: str - content_hash: str # key the verdict must be written against - code_body: str = "" # extracted via tree-sitter, capped - old_code_body: str | None = None # drift-phase only + content_hash: str # key the verdict must be written against + code_body: str = "" # extracted via tree-sitter, capped + old_code_body: str | None = None # drift-phase only class LinkCommitResponse(BaseModel): """Returned by /link_commit and embedded in /search_decisions + /detect_drift.""" + commit_hash: str synced: bool reason: Literal["new_commit", "already_synced", "no_changes"] @@ -246,6 +255,7 @@ class LinkCommitResponse(BaseModel): class ActionHint(BaseModel): """Tester-mode directive appended to search/brief responses.""" + kind: Literal[ "answer_open_questions", "review_drift", @@ -262,7 +272,7 @@ class SearchDecisionsResponse(BaseModel): sync_status: LinkCommitResponse matches: list[DecisionMatch] ungrounded_count: int - suggested_review: list[str] # decision_ids of drifted/pending to review first + suggested_review: list[str] # decision_ids of drifted/pending to review first action_hints: list[ActionHint] = [] sync_metrics: SyncMetrics | None = None # V1 A3 — catch-up / barrier wall times @@ -274,7 +284,9 @@ class DriftEntry(BaseModel): decision_id: str description: str status: Literal["reflected", "drifted", "pending", "ungrounded"] - signoff_state: str | None = None # proposed | ratified | rejected | collision_pending | context_pending | superseded + signoff_state: str | None = ( + None # proposed | ratified | rejected | collision_pending | context_pending | superseded + ) symbol: str lines: tuple[int, int] drift_evidence: str = "" @@ -306,6 +318,7 @@ class ScanBranchResponse(BaseModel): Decisions are deduped by decision_id across the full set of changed files. 
""" + base_ref: str head_ref: str sweep_scope: Literal["head_only", "range_diff", "range_truncated", "branch_delta"] @@ -337,8 +350,8 @@ class DoctorLedgerSummary(BaseModel): class DoctorResponse(BaseModel): scope: Literal["file", "branch", "empty"] - file_scan: "DetectDriftResponse | None" = None - branch_scan: "ScanBranchResponse | None" = None + file_scan: DetectDriftResponse | None = None + branch_scan: ScanBranchResponse | None = None ledger_summary: DoctorLedgerSummary | None = None action_hints: list[ActionHint] = [] @@ -348,8 +361,11 @@ class DoctorResponse(BaseModel): class IngestSpan(BaseModel): """Source excerpt from a meeting, document, or manual input.""" + text: str = "" - source_type: str = "manual" # transcript | notion | document | manual | agent_session | implementation_choice + source_type: str = ( + "manual" # transcript | notion | document | manual | agent_session | implementation_choice + ) source_ref: str = "" speakers: list[str] = [] meeting_date: str = "" @@ -357,6 +373,7 @@ class IngestSpan(BaseModel): class IngestCodeRegion(BaseModel): """Pre-resolved code region for a mapping.""" + symbol: str file_path: str start_line: int = 0 @@ -367,13 +384,14 @@ class IngestCodeRegion(BaseModel): class IngestMapping(BaseModel): """One decision-to-code mapping in the internal pipeline format.""" + intent: str span: IngestSpan = IngestSpan() symbols: list[str] = [] code_regions: list[IngestCodeRegion] = [] signoff: dict | None = None feature_group: str | None = None - decision_level: str | None = None # L1 | L2 | L3 + decision_level: str | None = None # L1 | L2 | L3 parent_decision_id: str | None = None @@ -389,6 +407,7 @@ class IngestDecision(BaseModel): decisions are extracted from source, not inferred. Empty excerpts are rejected with a clear error. 
""" + id: str = "" title: str = "" description: str = "" @@ -409,11 +428,12 @@ class IngestActionItem(BaseModel): class IngestPayload(BaseModel): """Ingest input — accepts EITHER mappings (internal) or decisions (natural LLM).""" + repo: str = "" commit_hash: str = "" query: str = "" mappings: list[IngestMapping] = [] - source: str = "manual" # transcript | notion | slack | document | manual | agent_session | implementation_choice + source: str = "manual" # transcript | notion | slack | document | manual | agent_session | implementation_choice title: str = "" date: str = "" participants: list[str] = [] @@ -443,7 +463,8 @@ class ContextForCandidate(BaseModel): a decision with signoff.state='context_pending' that overlaps with the ingested span. Human confirms or rejects via bicameral.resolve_collision. """ - span_id: str # input_span record ID (e.g. 'input_span:abc123') + + span_id: str # input_span record ID (e.g. 'input_span:abc123') decision_id: str decision_description: str overlap_score: float = 0.0 # rank-position score; raw BM25 score is always 0 in v2 embedded @@ -455,9 +476,10 @@ class CreatedDecision(BaseModel): Returned in IngestResponse.created_decisions so the caller-LLM can cross-reference against bicameral.history without fuzzy text matching. 
""" + decision_id: str description: str - decision_level: str | None = None # L1 | L2 | L3 + decision_level: str | None = None # L1 | L2 | L3 class IngestResponse(BaseModel): @@ -468,10 +490,10 @@ class IngestResponse(BaseModel): stats: IngestStats created_decisions: list[CreatedDecision] = [] pending_grounding_decisions: list[dict] = [] - context_for_candidates: "list[ContextForCandidate]" = [] + context_for_candidates: list[ContextForCandidate] = [] source_cursor: SourceCursorSummary | None = None - judgment_payload: "GapJudgmentPayload | None" = None # kept for backward compat - judgment_payloads: "list[GapJudgmentPayload]" = [] # one per feature_group topic + judgment_payload: GapJudgmentPayload | None = None # kept for backward compat + judgment_payloads: list[GapJudgmentPayload] = [] # one per feature_group topic sync_status: LinkCommitResponse | None = None @@ -479,7 +501,9 @@ class BriefDecision(BaseModel): decision_id: str description: str status: Literal["reflected", "drifted", "pending", "ungrounded"] - signoff_state: str | None = None # proposed | ratified | rejected | collision_pending | context_pending | superseded + signoff_state: str | None = ( + None # proposed | ratified | rejected | collision_pending | context_pending | superseded + ) source_type: str = "" source_ref: str = "" code_regions: list[CodeRegionSummary] = [] @@ -488,7 +512,7 @@ class BriefDecision(BaseModel): source_excerpt: str = "" meeting_date: str = "" signoff: dict | None = None - decision_level: str | None = None # L1 | L2 | L3 — CodeGenome claim/identity split + decision_level: str | None = None # L1 | L2 | L3 — CodeGenome claim/identity split parent_decision_id: str | None = None # L2 → L1 parent link for evidence inheritance @@ -549,8 +573,8 @@ class PreflightResponse(BaseModel): action_hints: list[ActionHint] = [] sources_chained: list[str] = [] # v0.8.0 HITL annotations (topic-independent, ledger health) - unresolved_collisions: list[BriefDecision] = [] # collision_pending 
from prior sessions - context_pending_ready: list[BriefDecision] = [] # context_pending with ≥1 confirmed context_for + unresolved_collisions: list[BriefDecision] = [] # collision_pending from prior sessions + context_pending_ready: list[BriefDecision] = [] # context_pending with ≥1 confirmed context_for sync_metrics: SyncMetrics | None = None # V1 A3 — catch-up wall times product_stage: str | None = None # shown once per device; wait-time expectation-setting @@ -612,8 +636,9 @@ class RatifyResponse(BaseModel): Idempotent: calling ratify on an already-signed-off decision returns was_new=False and leaves the existing signoff record untouched. """ + decision_id: str - was_new: bool # True if this call set the signoff; False if already set + was_new: bool # True if this call set the signoff; False if already set signoff: dict projected_status: Literal["reflected", "drifted", "pending", "ungrounded"] @@ -628,15 +653,16 @@ class ResolveCollisionResponse(BaseModel): - collision: new_id + old_id + action ('supersede'|'keep_both') - context_for: span_id + decision_id + confirmed (bool) """ + mode: Literal["collision", "context_for"] action_taken: str - new_decision_id: str = "" # collision mode - old_decision_id: str = "" # collision mode - span_id: str = "" # context_for mode - decision_id: str = "" # context_for mode + new_decision_id: str = "" # collision mode + old_decision_id: str = "" # collision mode + span_id: str = "" # context_for mode + decision_id: str = "" # context_for mode edge_written: bool = False - new_status: str = "" # projected status of new decision after action - old_status: str = "" # projected status of old decision (supersede only) + new_status: str = "" # projected status of new decision after action + old_status: str = "" # projected status of old decision (supersede only) # ── Tool: bicameral.history ────────────────────────────────────────────────── @@ -644,45 +670,51 @@ class ResolveCollisionResponse(BaseModel): class 
HistorySource(BaseModel): """One input span that originated or updated a decision.""" - source_ref: str # e.g. "sprint-14-planning" + + source_ref: str # e.g. "sprint-14-planning" source_type: Literal["transcript", "slack", "document", "agent_session", "manual"] - date: str # ISO date + date: str # ISO date speaker: str | None = None - quote: str # verbatim excerpt from source_span.text + quote: str # verbatim excerpt from source_span.text class HistoryFulfillment(BaseModel): """Code grounding for a decision.""" + file_path: str symbol: str | None = None start_line: int end_line: int git_url: str | None = None - grounded_at_ref: str = "" # git ref when first grounded + grounded_at_ref: str = "" # git ref when first grounded baseline_hash: str | None = None current_hash: str | None = None class HistoryDecision(BaseModel): """Balance-sheet view of one decision: commitment + fulfillment + balance.""" - id: str # decision_id - summary: str # canonical decision text + + id: str # decision_id + summary: str # canonical decision text featureId: str status: Literal["reflected", "drifted", "pending", "ungrounded"] - signoff_state: str | None = None # proposed | ratified | rejected | collision_pending | context_pending | superseded - sources: list[HistorySource] = [] # 1+ input spans; empty for AI-discovered - fulfillments: list[HistoryFulfillment] = [] # all bound code regions - drift_evidence: str | None = None # human-readable delta when drifted - signoff: dict | None = None # ratification record: state, signer, ratified_at - decision_level: str | None = None # L1 | L2 | L3 — for balance-sheet display + signoff_state: str | None = ( + None # proposed | ratified | rejected | collision_pending | context_pending | superseded + ) + sources: list[HistorySource] = [] # 1+ input spans; empty for AI-discovered + fulfillments: list[HistoryFulfillment] = [] # all bound code regions + drift_evidence: str | None = None # human-readable delta when drifted + signoff: dict | None = None 
# ratification record: state, signer, ratified_at + decision_level: str | None = None # L1 | L2 | L3 — for balance-sheet display parent_decision_id: str | None = None - ephemeral: bool = False # True when current status was determined by a feature-branch commit not yet in authoritative ref + ephemeral: bool = False # True when current status was determined by a feature-branch commit not yet in authoritative ref class HistoryFeature(BaseModel): """A feature group containing related decisions.""" - id: str # feature group id (slugified name) - name: str # canonical feature_group noun phrase + + id: str # feature group id (slugified name) + name: str # canonical feature_group noun phrase decisions: list[HistoryDecision] @@ -690,7 +722,7 @@ class HistoryResponse(BaseModel): features: list[HistoryFeature] truncated: bool = False total_features: int = 0 - as_of: str = "" # git ref evaluated against + as_of: str = "" # git ref evaluated against sync_metrics: SyncMetrics | None = None # V1 A3 — catch-up wall times @@ -699,7 +731,8 @@ class HistoryResponse(BaseModel): class DashboardResponse(BaseModel): """Response from bicameral.dashboard.""" - url: str # http://localhost:{port} + + url: str # http://localhost:{port} status: Literal["started", "already_running"] port: int @@ -709,6 +742,7 @@ class DashboardResponse(BaseModel): class BindResult(BaseModel): """Result for one binding in a bicameral.bind call.""" + decision_id: str region_id: str content_hash: str @@ -718,6 +752,7 @@ class BindResult(BaseModel): class BindResponse(BaseModel): """Response envelope for bicameral.bind.""" + bindings: list[BindResult] sync_metrics: SyncMetrics | None = None # V1 A3 — write-barrier hold time @@ -727,6 +762,7 @@ class BindResponse(BaseModel): class SessionStartBanner(BaseModel): """Open-decision summary shown once per session at session start.""" + drifted_count: int = 0 ungrounded_count: int = 0 proposal_count: int = 0 diff --git a/dashboard/server.py b/dashboard/server.py index 
1d231d2b..90306ca3 100644 --- a/dashboard/server.py +++ b/dashboard/server.py @@ -17,7 +17,6 @@ import asyncio import json import logging -import os import socket from pathlib import Path from typing import Any @@ -100,11 +99,13 @@ async def stop(self) -> None: async def notify(self, ctx: Any) -> None: """Build a fresh HistoryResponse and push it to all SSE clients.""" from dashboard.sse import get_broadcaster + broadcaster = get_broadcaster() if broadcaster.subscriber_count == 0: return try: from handlers.history import handle_history + response = await handle_history(ctx) payload = json.dumps(response.model_dump(), default=str) await broadcaster.broadcast(payload) @@ -162,6 +163,7 @@ async def _serve_history(self, writer: asyncio.StreamWriter) -> None: try: ctx = self._ctx_factory() from handlers.history import handle_history + response = await handle_history(ctx) body = json.dumps(response.model_dump(), default=str).encode() except Exception as exc: @@ -171,6 +173,7 @@ async def _serve_history(self, writer: asyncio.StreamWriter) -> None: async def _serve_sse(self, writer: asyncio.StreamWriter) -> None: from dashboard.sse import get_broadcaster + broadcaster = get_broadcaster() writer.write(_HTTP_200_SSE.encode()) await writer.drain() @@ -179,6 +182,7 @@ async def _serve_sse(self, writer: asyncio.StreamWriter) -> None: try: ctx = self._ctx_factory() from handlers.history import handle_history + response = await handle_history(ctx) initial = json.dumps(response.model_dump(), default=str) writer.write(f"data: {initial}\n\n".encode()) @@ -191,7 +195,7 @@ async def _serve_sse(self, writer: asyncio.StreamWriter) -> None: while True: try: data = await asyncio.wait_for(q.get(), timeout=30.0) - except asyncio.TimeoutError: + except TimeoutError: # Keep connection alive with an SSE comment; loop and keep waiting. 
writer.write(b": keepalive\n\n") await writer.drain() diff --git a/docs/DEV_CYCLE.md b/docs/DEV_CYCLE.md new file mode 100644 index 00000000..3ece53fe --- /dev/null +++ b/docs/DEV_CYCLE.md @@ -0,0 +1,1177 @@ +# Development Cycle + +**Audience**: contributors, release managers (Jin), and anyone shipping a change +to `BicameralAI/bicameral-mcp`. This document is the contract — if you are about +to open a branch, write a PR, cut a release, or close an issue, follow what is +written here. Deviations require a META_LEDGER entry explaining why. + +**Repo topology** (as of v0.13.0, post-Phase-4): + +```text +contributor fork (e.g. Knapp-Kevin/bicameral-mcp) + │ feature branches live here + ▼ +BicameralAI/bicameral-mcp + ├── dev ← integration branch; CI green, code complete, NOT shipped + └── main ← shipped; tagged; users pull from here +``` + +Two branches, one direction of flow: **feature → dev → main**. Nothing else +merges to `main` except `dev` (and the rare hotfix — see §10). + +--- + +## 0. Workflow Feature Release Cycle + +**Audience**: anyone proposing a new agentic workflow capability — a new +skill, a new lifecycle hook, a new auto-fire trigger, a new dashboard +surface. Distinct from §6 (engineering version release): §6 covers how a +finished change reaches users; **§0 covers how a workflow idea becomes a +finished change worth releasing.** + +**Why this exists separately**: most of our P0 misses (#146 preflight +auto-fire, #147 SessionEnd capture-corrections, the e2e harness churn +across 2026-04 → 2026-05) trace back to the same root cause — we shipped +the implementation BEFORE we wrote down what success looks like and +BEFORE we had any way to observe whether it actually worked in the wild. +The fix is to put validation in front of implementation, not behind it. + +### The cycle + +``` +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ 1. │ │ 2. │ │ 3. │ │ 4. │ │ 5. │ │ 6. 
│ +│ Friction │─▶│Candidate │─▶│ Test │─▶│Functional│─▶│Telemetry │─▶│Optimized │ +│ capture │ │ workflow │ │ harness │ │ solution │ │collection│ │ solution │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ + ▲ │ + │ ◀─── feedback loop ──────┘ + │ (telemetry surfaces gaps the harness should have caught) +``` + +**Anti-pattern (the trap we keep falling into)**: jump from step 1 +directly to step 4. Build the skill. Ship it. Discover the harness can't +observe the auto-fire and telemetry surfaces nothing. Now you're +retrofitting phases 2/3/5 onto a thing already in production — every +iteration loses fidelity because the spec and the implementation are +entangled. (See: every revision history of `tests/e2e/run_e2e_flows.py`.) + +### Phases + +#### 1. Friction capture + +Observable evidence that a real user / agent / contributor stubbed their +toe on something that should "just work." Symptoms, not fixes. + +Examples: +- Slack thread from a design partner showing `claude -p '/bicameral:sync'` + exiting silently (#124). +- Dashboard footage of a mid-session constraint orphaning as a parallel + decision instead of linking to its parent. +- An e2e harness flow that fails for a reason no one can immediately + explain. + +Captured as a GitHub issue with `friction` or `desync:*` label, in the +repo where the friction was observed. Body answers: *what was the user +trying to do, what happened instead, what would "right" look like.* + +**Out of scope at this stage**: solution shape, file paths, schema +changes. Don't pre-commit to an implementation in the friction note. + +#### 2. Candidate workflow + +A short prose spec of what the new workflow should look like end-to-end, +written from the user/agent perspective, NOT the implementation +perspective. Lives in a source-of-truth issue (e.g. +`BicameralAI/bicameral#108` for the v0 user flow spec). + +Format: +- **Trigger**: what does the user do or say to enter this workflow? 
+- **Sequence**: numbered list of agent-observable steps — tool calls, + hook fires, status transitions. Reference the spec; do NOT inline + implementation details (file paths, function names, schema columns). +- **Success outcome**: what visible state proves the workflow worked? + Status flip, ledger row, dashboard panel, ratification record. +- **Failure modes**: what should the user see when each step fails, and + what's the recovery path? + +The spec is the contract for phases 3–6. If the spec is wrong, the +harness validates the wrong thing and the implementation chases the +wrong target. + +#### 3. Test harness + +A real e2e test that exercises the spec from step 2 against a real +claude session (not mocks). For bicameral-mcp this lives at +`tests/e2e/run_e2e_flows.py`. + +**Required before any implementation work begins.** The harness fails on +day one — that's the point. A failing harness with a clear assertion +message is the spec made executable. + +Harness rules: +- Assert on the spec's success outcome, not the implementation path. + ("After commit, decision X is in `pending` state" is good. "Agent + called `link_commit` then `resolve_compliance` in that order" is + brittle and couples the test to the substrate.) +- Use natural prompts — never name the tool the agent is supposed to + auto-fire. Naming the tool defeats the trigger that IS the product. +- When success isn't observable in stream-json (e.g. a SessionEnd + subprocess writes to the ledger out-of-band), validate via post-hoc + ledger query. Document the indirection in the asserter docstring. +- When a flow fails: distinguish test-harness bug from product gap. If + the asserter is wrong about the spec, fix the asserter (no GitHub + issue needed). If the spec says X happens and X doesn't happen, that's + a product gap — open or update an issue, leave the harness asserting + the spec, mark the failure as expected until the implementation lands. + +#### 4. 
Functional solution + +Implementation pass that makes the harness pass. Optimize for spec +correctness — not performance, not polish. Skill description, tool +contract, lifecycle hooks all in scope. + +Done when: +- Harness PASSes against the unmodified natural prompt from step 3. +- A real user can complete the flow end-to-end without hitting any of + the friction from step 1. +- Implementation is documented at the level needed for phase 5 telemetry + to know what to count. + +#### 5. Telemetry collection + +Instrument the new workflow with PostHog events / +`bicameral.skill_begin/end` calls / structured logs that answer: *is +this actually being used, by whom, and does it work in their hands?* + +Telemetry contract is part of the spec, not an afterthought. Each step +in the candidate workflow (phase 2) should map to a telemetry event the +dashboard can query. + +Wire telemetry BEFORE merging the implementation PR. A workflow you +can't observe in production is a workflow that's never validated in +production. + +#### 6. Optimized solution + +Iterate based on what telemetry shows: +- Drop-off after step N → step N is unclear or broken in real + conditions. Could be a description fix or a substrate change. +- Auto-fire rate <X% → trigger discipline is losing the priority race; + restate the skill description, change the trigger phrasing, or move to + a deterministic hook. +- Compliance verdict mix unexpected → either the rubric is wrong or the + user is using the workflow differently than the spec assumed. + +Optimization changes route through the same cycle: telemetry-observed +friction → updated workflow spec → updated harness → new functional +pass → new telemetry. Don't optimize without re-passing the harness. 
+ +### Audit trail + +Every workflow feature gets a short META_LEDGER entry at each phase +boundary: + +``` +2026-05-01 workflow:bicameral-capture-corrections phase=3→4 + harness PR: BicameralAI/bicameral-mcp#147 (SKIP→SETUP) + spec: BicameralAI/bicameral#108 § Flow 4 + next: implementation PR + telemetry wiring +``` + +This makes it possible to look at any open workflow feature and +immediately see which phase it's in, what's blocking the next phase, and +where the spec lives. It's also the first place to look when a feature +ships and silently regresses — phase boundaries are where the harness +should pass before/after the change. + +--- + +## 1. Lifecycle map + +``` +┌──────────┐ ┌────────┐ ┌──────────┐ ┌─────┐ ┌─────────────┐ ┌──────┐ ┌────────┐ +│ Issue │──▶│ Branch │──▶│ Feature │──▶│ dev │──▶│ Release PR │──▶│ main │──▶│ Tag │ +│ (#nnn) │ │ named │ │ PR │ │ │ │ (dev→main) │ │ │ │ vX.Y.Z │ +│ │ │ /-x │ │ → dev │ │ │ │ │ │ │ │ │ +└──────────┘ └────────┘ └──────────┘ └─────┘ └─────────────┘ └──────┘ └────────┘ + │ │ │ │ │ │ │ + │ │ Closes #nnn │ │ │ GitHub + │ │ on squash │ Bumps version, │ Release + │ │ │ │ CHANGELOG flip, │ published + │ │ ▼ │ milestone close │ │ + │ │ CI must pass │ │ │ ▼ + │ │ QOR seal in │ ▼ │ Help/training + │ │ META_LEDGER │ Squash-merge │ docs published + │ │ │ OR merge commit │ + ▼ ▼ ▼ ▼ + Milestone: Branch name: Issue auto-closed, User-facing release; + vX.Y.Z - milestone open upstream consumers + ("pending release") pull from main +``` + +**One rule of thumb**: any work that touches user-visible behavior must traverse +every box in that diagram. No back-doors to `main`. + +--- + +## 2. Issues + +### 2.1 Creating + +- **Title**: imperative, scoped. `feat(codegenome): semantic drift evaluation in resolve_compliance`, + not "add drift evaluation". **Do not** prefix with `[P0]`/`[P1]`/`[P2]` — use + the priority labels in §2.1.1 instead. 
+- **Required labels** (apply at least one of each mandatory axis): + - **Type** (mandatory): `feat`, `fix`, `docs`, `chore`, `test`, `refactor`, `perf`, `security`. + - **Surface** (mandatory): `tool`, `skill`, `ledger`, `code-locator`, `codegenome`, `infra`, `docs-only`. + - **Priority** (mandatory after triage): see §2.1.1 below. + - **State** (optional): see §2.1.2 below. +- **Milestone**: attach to the next-up release (`v0.14.0`). If you don't know + which release it lands in, attach to `vNext-triage` and let Jin re-assign. +- **Body template** (see `.github/ISSUE_TEMPLATE/`): + - **Why**: one paragraph. The product decision this serves. + - **What**: the smallest change that satisfies "Why". + - **Out of scope**: explicit exclusions. Stops scope creep at PR-review time. + - **Acceptance**: bullet list of testable conditions. CI green is implied; add + behavioural checks ("`link_commit` returns `auto_resolved_count` ≥ 0"). + +> **Risk** (`risk:L1` / `risk:L2` / `risk:L3`) lives on **PRs**, not issues — +> see §4.4. Risk is a property of the change being made, knowable only after +> design. Issues carry priority (urgency); PRs carry risk (review tier). + +#### 2.1.1 Priority labels (one per issue, mandatory after triage) + +Exactly one priority label per triaged issue. Untriaged issues carry `triage` +(see §2.1.2) until a maintainer assigns priority. + +| Label | Color | Meaning | +|---|---|---| +| `P0` | red | Critical — drop everything. Production down, data loss, security regression, ledger corruption. **Triggers an immediate response, even off-hours.** | +| `P1` | orange | High — ship this milestone. User-impacting bug or committed feature with a deadline. | +| `P2` | yellow | Medium — next milestone or two. The default for routine new feature work and non-urgent bugs. | +| `P3` | grey | Low — eventually. Nice-to-have, polish, non-load-bearing improvements. 
| + +**Calibration heuristics**: + +- *"If this stays open for the next two months, will any user be unhappy?"* + → No: `P3`. Yes: at least `P2`. +- *"Is there a workaround that's acceptable for the next milestone?"* + → Yes: `P2` or lower. No: at least `P1`. +- *"Is anyone losing data, money, or trust right now?"* + → Yes: `P0`. No: not `P0`. + +**P0 is rare.** If we have more than two open `P0` issues at any time, something +is wrong with our triage discipline — `P0` should mean *"the team stops other +work"*. Promoting too many issues to `P0` dilutes the signal. + +#### 2.1.2 State labels (optional, orthogonal to priority) + +| Label | Color | Meaning | +|---|---|---| +| `triage` | light grey | Needs assessment; no priority assigned yet. Default for newly-filed issues. | +| `blocked` | dark grey | Temporarily blocked by another issue or external dependency. Always include a comment naming the blocker. | +| `parked` | purple | Known issue, deferred indefinitely (external blocker, strategic pause, cost > benefit at current scale). Not abandoned, but not on a roadmap. **Only maintainers apply `parked`.** | + +State labels are mostly orthogonal to priority — with one exception: + +- **`triage` and `blocked` coexist with priority.** A `P1 + blocked` issue is + high-priority work waiting on a dependency; a `triage` issue gets a priority + label as soon as a maintainer assesses it. +- **`parked` supersedes priority.** Don't apply both. A parked issue is, by + definition, not on the priority axis — it's deferred indefinitely. Adding + `P3` to a `parked` issue is redundant and clutters the label list. If a + parked issue ever becomes actionable, drop `parked` and assign a real + priority at that moment. + +**Never close a `parked` issue** — keep it open as a known-deferred record +so future filers find it. + +The existing `merged-to-dev` label (post-merge status, not pre-merge state) +remains separate from this axis. See §6.8. 
+ +### 2.2 Closure + +`Closes #X` in a PR body **fires when that PR's HEAD merges into its BASE**, not +when work reaches `main`. PRs target `dev`, so issues close at the dev-merge. + +Why we keep auto-close on dev: closure tracks "the work is in code", milestones +track "the work is shipped". Two signals, two artifacts. + +### 2.3 Reopening + +If a hotfix or follow-up reveals the dev work was wrong, **reopen the original +issue** rather than filing a new one — keeps history threaded. Add a comment +linking the regression's hotfix PR. + +--- + +## 3. Branches + +### 3.1 Naming + +`<issue-number>-<slug>` from a fork. + +``` +Knapp-Kevin/codegenome-phase-4-qor ← acceptable (descriptive slug) +Knapp-Kevin/61-drift-classifier ← preferred (issue-numbered) +Knapp-Kevin/main ← never push feature work to fork's main +Knapp-Kevin/dev ← does not exist (BicameralAI/dev is canonical) +``` + +A fork's `dev` branch is **not** maintained. The integration branch is exactly +one place: `BicameralAI/dev`. + +### 3.2 Branching off + +Always branch off `BicameralAI/dev`, never `main`. `dev` is what other in-flight +work has integrated against; `main` is a moving snapshot of the last release. + +```bash +git fetch BicameralAI dev +git checkout -b 61-drift-classifier BicameralAI/dev +``` + +### 3.3 Stacking + +Stacked PRs (PR B depends on PR A's branch) are tolerated for short windows +(< 48 h). Rebase the stack onto `dev` the moment the bottom PR merges. Long +stacks compound merge-conflict risk and review fatigue. + +--- + +## 4. Pull Requests + +### 4.1 Targeting + +**All feature/fix PRs target `dev`.** The release PR (and only the release PR) +targets `main`. CI workflows enforce both: `pull_request: branches: [main, dev]`. + +#### 4.1.1 Flow labels (mandatory) + +Every PR carries exactly one `flow:` label so contributors and reviewers can +tell at a glance which lane it's in. 
The label mirrors the target branch but +disambiguates the two cases that share `main`: + +| Label | Color | Target | Meaning | +|---|---|---|---| +| `flow:feature` | green | `dev` | Standard feature/fix going through the integration branch. The default. | +| `flow:release` | blue | `main` | Periodic `dev → main` release PR opened by the release manager. Carries no new code — only the integrated `dev` HEAD. | +| `flow:hotfix` | red | `main` | Emergency fix bypassing `dev`. Sets the §10 sync-back-to-dev clock. | + +Why labels in addition to the base branch: + +- `gh pr list --base main` returns *both* release PRs and hotfix PRs — different + processes, different review tiers, different urgencies. The label + disambiguates. +- Filters like `gh pr list --label flow:hotfix --state closed` give a clean + audit trail of every emergency bypass over time. We want that visible. +- Dependabot auto-applies `flow:feature` via `.github/dependabot.yml`; nothing + arrives without a flow label. + +Reviewers can refuse to review a PR that has no `flow:` label — the contract +is "label first, review second." + +**Distinct from the post-merge `merged-to-dev` label.** That one tracks +*status* ("this work has landed on dev but not yet on main"). The `flow:` +labels track *intent* (which lane the PR is in). Both can coexist on a single +PR after merge if Jin uses `merged-to-dev` to surface his release queue. + +### 4.2 Title + +`<type>(<scope>): <imperative summary>` — the same shape as the issue title. +The squash commit message inherits this; loose PR titles produce ugly history. + +### 4.3 Body — required sections + +```markdown +## Summary +1–3 bullets, user-facing outcome. 
+ +## Linked issues +Closes #61 +Refs #60 (depends on continuity matcher landed there) + +## Plan / Audit / Seal +- Plan: docs/Planning/plan-codegenome-phase-4.md (v3, content hash sha256:911171cf…) +- Audit: META_LEDGER Entry #13, chain hash 21ac210f… — verdict PASS +- Seal: META_LEDGER Entry #14, chain hash 0ebcf69b… + +## Test plan +- [ ] `pytest tests/test_codegenome_drift_classifier.py -q` (32/32) +- [ ] `pytest tests/test_m3_benchmark.py -q` (5/5) +- [ ] regression: `pytest -q` (189/189) +``` + +The Plan/Audit/Seal section is **mandatory for any PR > 100 LOC or risk:L2+**. +Smaller PRs may use `Plan: trivial; risk:L1`. + +### 4.4 Reviewers + +- Code-owner from `CODEOWNERS` is auto-requested. +- **Risk:L3 PRs**: require a second reviewer + a security-pass note in the + description. +- **Risk:L2 PRs**: one reviewer. +- **Risk:L1 PRs** (typo, comment fixes, dep bumps from Dependabot with green + CI): owner self-merge after CI is green. + +### 4.5 CI gates + +Two-tier model: a fast set on every PR-to-`dev`, a deeper set on the release +PR (`dev` → `main`). The asymmetry is deliberate — see §4.5.3. + +#### 4.5.1 Tier 1 — PR → `dev` (fast, blocks every PR) + +The bar is *"this won't break dev for everyone else."* Target wall-clock: under +5 minutes. Red on any of these blocks merge. 
+ +| Gate | Workflow / tool | Why | +|---|---|---| +| **Lint** | `ruff` + `black --check` | Catches style drift, dead imports, unused vars before review | +| **Type check** | `mypy` (or `pyright`) | Type errors surface at runtime via Pydantic boundaries; keep them at PR-time | +| **Unit + integration tests (Linux)** | `test-mcp-regression.yml` (existing) | Core regression suite | +| **Unit + integration tests (Windows)** | matrix on `test-mcp-regression.yml` | Three of the last four bugs (#67, #68, #74) were Windows-only — manual verification is not a strategy | +| **Schema persistence smoke** | `test-schema-persistence.yml` (existing) | Schema bugs are silent killers; cheap to run | +| **Module import smoke** | `python -c "import server, telemetry, consent, ..."` | Catches missing modules / circular imports in seconds | +| **Secret scan** | `gitleaks` or `trufflehog`, fail-on-find | API keys, tokens, credentials in code or test fixtures | +| **`pip check`** | one-liner job | Detects broken dependency tree on the PR's `pip install -e .[test]` | +| **`merged-to-dev` label automation** | post-merge GitHub Action | Auto-applies the label on merge; resolves the manual labeling problem from the PR-A audit | + +#### 4.5.2 Tier 2 — Release PR (`dev` → `main`) + +The bar is *"this is releasable to users."* Inherits all Tier 1 gates plus the +following. Can run 10–20 minutes; runs less often (one release PR at a time). 
+ +| Gate | Workflow / tool | Why | +|---|---|---| +| **All Tier 1 gates** | — | Inherits dev's bar | +| **Full regression including slow markers** | `pytest -m "not bench"` | Tier 1 may exclude `alpha_flow`, `desync_scenarios`; the release run includes them | +| **Preflight eval — blocking** | `preflight-eval.yml` (currently advisory) | Currently advisory on every PR; should block release if drift precision regresses | +| **Schema migration validation against persistent DB with seed data** | bespoke job | Beyond the smoke — apply migration on a `v_(N-1)` seed, assert no row loss + roundtrip works | +| **Performance regression** | bespoke job | Drift detection p50, ingest throughput, search latency. Fail if > 15% regression vs `main`'s last successful run | +| **Security scan** | `bandit`, `pip-audit`, GitHub Dependency Review | Required before any user touches the binary | +| **CHANGELOG enforcement** | bespoke job | Reject release PR if `CHANGELOG.md` does not move `## Unreleased` content under a new `## [vX.Y.Z]` block | +| **Version monotonicity** | bespoke job | Version in `pyproject.toml` must be `>` current `main` tag | +| **MCP protocol live smoke** | bespoke job | Spawn server, call each tool over stdio, assert response shape. 
Catches handler-registration / Pydantic-boundary issues unit tests miss | +| **Issue auto-close on merge** | post-merge action | `Closes #N` fires on merge into the PR's base; on release PR merge to `main`, also strip the `merged-to-dev` label from issues whose fix is now shipped | + +#### 4.5.3 Why the split + +The asymmetry isn't arbitrary — it's about **failure cost vs velocity**: + +| Concern | dev gate | main gate | +|---|---|---| +| Style / type errors | Block dev (cheap to fix at PR time) | Inherited | +| Windows breakage | Block dev (recent bug history mandates) | Inherited | +| Eval regression | Advisory on dev (don't slow feature work for noise) | **Block main** (release quality) | +| Performance regression | Don't run (too slow per PR) | **Block main** | +| CHANGELOG / version | Don't enforce (dev work is in-flight) | **Block main** | +| Security scan | Don't run per PR (slow, noisy) | **Block main** | +| MCP protocol live smoke | Don't run (requires server boot) | **Block main** | + +#### 4.5.4 Implementation phases (current state vs target) + +A dev-cycle gate is only as strong as its branch-protection rule. Adding the +workflow file is half the job; the other half is requiring it via the GitHub +"Require status checks to pass before merging" setting on `dev` and `main`. + +**Phase 1 — biggest impact, low risk** (open as one chore PR): + +1. Add Windows test job to `test-mcp-regression.yml` matrix + (`runs-on: [ubuntu-latest, windows-latest]`). +2. Add `lint-and-typecheck.yml` (ruff + mypy) running on all PRs. +3. Add `secret-scan.yml` (gitleaks) on all PRs. +4. Add the `merged-to-dev` auto-labeller as a post-merge action on `dev`. +5. Update `dev` branch-protection to require: lint, typecheck, regression + (Linux + Windows), schema persistence, secret scan. + +**Phase 2 — release-quality gates**: + +6. Convert `preflight-eval.yml` from advisory to blocking on `main`-bound PRs + only (use `if: github.base_ref == 'main'`). +7. 
New `release-gates.yml` running only on `main`-bound PRs: CHANGELOG diff, + version monotonicity, MCP live smoke. +8. Add `bandit` + `pip-audit` to `release-gates`. +9. Performance baseline harness — capture drift detection p50 and search + latency; compare against `main`'s last successful run. +10. Update `main` branch-protection to require all Tier 1 + Tier 2 checks. + +**Phase 3 — nice to have**: + +11. Auto-close `merged-to-dev` issues when `dev` → `main` forward-merges. +12. Sticky PR-comment bot for preflight-eval results (covered by issue #49). + +Until Phase 1 ships, the documented Tier 1 list is **aspirational** — only +`test-mcp-regression`, `test-schema-persistence`, and `preflight-eval` +(advisory) actually run today. Reviewers should treat the rest as their own +responsibility (run lint locally, verify on Windows, etc.) until the gates +land. + +Red CI blocks merge. Don't ask reviewers to look at red PRs. + +### 4.6 Review feedback discipline + +CodeRabbit, Devin, and human reviewers all leave comments. The author's job: + +- **Address** every actionable comment with a commit or a reply justifying + decline. +- **Resolve** the conversation thread only after addressing. +- **Never** push `--force` on a PR with active review threads — comments lose + their line anchors. Use `--force-with-lease` only after a `git fetch`, and + call it out in a PR comment so reviewers re-fetch. + +--- + +## 5. Merging to `dev` + +### 5.1 Strategy + +**Squash merging is disabled at the repo level** (`allow_squash_merge: false`) +so the wrong choice is unavailable, not just discouraged. The reason this +matters at all — beyond style preference — is that squash collapses +multi-commit PRs into opaque blobs that cannot be cleanly cherry-picked into +the §10.5 triage lane. See §10.5.0 "Why this lane exists" for the full +rationale. 
Two options remain: + +| Merge style | When to use | Rationale | +|---|---|---| +| **Rebase and merge** *(default — covers ~all PRs)* | Single-commit PRs; multi-commit features; any PR a maintainer might backport to `triage-from-dev`; any PR with a `Triage-Cc:` trailer (see §10.5); Dependabot bumps | Preserves atomic commits as individually-cherry-pickable SHAs on `dev`. For single-commit PRs, this is the literal squash equivalent (one commit on `dev`) without the opaque-blob failure mode. GitHub's docs explicitly warn that squashing long-running branches "makes merge conflicts more likely … you'll have to resolve the same conflicts repeatedly." | +| **Merge commit (`--no-ff`)** | Multi-commit features whose grouping matters historically (e.g. coordinated multi-handler refactor); any PR you may want to revert atomically with `git revert -m 1` | Preserves both individual commits *and* the merge boundary. Use sparingly — `dev` log gets noisy fast. | + +**Author obligation, not just merger obligation.** If you write a PR that may be +triage-eligible, write atomic commits — one logical change per commit, each +individually buildable, each with a meaningful subject line. The Linux kernel's +atomic-commit discipline ([Linus on commit messages](https://yarchive.net/comp/linux/commit_messages.html)) +exists precisely so cherry-pick is mechanical, not interpretive. Reviewers may +ask you to reorganize. WIP messages like `wip`, `fix typo`, `address review` +should be squashed locally with `git rebase -i` *before* the PR is merged — +since repo-level squash is off, the rebase-and-merge button will preserve them +verbatim otherwise. 
+ +### 5.2 Pre-merge checklist (for the merger) + +- [ ] CI green +- [ ] All review threads resolved +- [ ] Milestone attached on the PR (== same milestone as the issue) +- [ ] Plan / Audit / Seal references exist for non-trivial PRs +- [ ] CHANGELOG `## [Unreleased]` updated (or PR explicitly states "no user-visible change") + +### 5.3 Post-merge + +- Issue auto-closes (via `Closes #X`). +- Milestone progress bar advances. +- Branch may be deleted (GitHub default). +- If the work shipped a new tool / new tool field / changed default, the matching + `pilot/mcp/skills/<skill-name>/SKILL.md` **must** be in the same PR — for + rebase-and-merge, in the same atomic commit; for merge-commit, in one of the + commits being merged. Project rule from `CLAUDE.md`. Reviewers reject + silently-mismatched skill contracts. + +--- + +## 6. Release cycle + +### 6.1 Cadence + +- **Minor releases** (`v0.X.0`): roughly every 2–3 weeks, when the milestone is + full and `dev` is stable. +- **Patch releases** (`v0.X.Y`): as needed for bug fixes that can't wait. +- **Major release** (`v1.0.0`): scheduled; not driven by milestone fill. + +Jin owns the call on "is `dev` ready to ship". Heuristic: milestone closed-issue +count covers the headline features, and CI on `dev` HEAD has been green for ≥ 24 h. + +### 6.2 Version selection + +Semver applies: + +- **PATCH** — bug fix only, no public-API change, no schema migration. +- **MINOR** — new tool / new tool field / new schema migration that is **additive** + with a registered `_migrate_vN_to_vN+1` and bumped `SCHEMA_COMPATIBILITY` map. +- **MAJOR** — breaking change to a tool's request/response shape, or a destructive + schema migration, or a CLI flag rename. + +If the change is borderline, round **up**. Schema-migrating PRs are never PATCH. + +### 6.3 The release PR (`dev` → `main`) + +Jin opens this PR. It targets `main`, base = `main`, head = `dev`.
+ +**Title**: `release: v0.13.0` + +**Body**: + +```markdown +## Release v0.13.0 + +### Headline +One sentence the README and Twitter post can both quote. + +### Included issues +Closes milestone v0.13.0 +- #61 — CodeGenome Phase 4 (semantic drift evaluation) +- #75 — <…> +- … + +### Schema +- Migrates ledger v13 → v14 (additive: CHANGEFEED on compliance_check, + semantic_status, evidence_refs) + +### Breaking changes +None. (or: list each.) + +### Documentation +- CHANGELOG.md — v0.13.0 section +- skills/bicameral-sync/SKILL.md — Phase 3+4 callout updated +- README.md — bumped feature list (if applicable) +- New: docs/DEV_CYCLE.md +``` + +### 6.4 Pre-release checklist + +Jin runs through this before merging the release PR. Items marked **CI** are +enforced by the Tier 2 gates in §4.5.2 once Phase 2 lands; until then they are +manual. + +- [ ] **CHANGELOG flip** — move `## [Unreleased]` content under `## [v0.13.0] - 2026-04-29`. + Add a fresh empty `## [Unreleased]` block at the top. **(CI: CHANGELOG enforcement)** +- [ ] **Version bump** — update `pyproject.toml` / `__init__.py` / wherever the + canonical version lives. **(CI: version monotonicity)** +- [ ] **`SCHEMA_COMPATIBILITY` map** — confirm the new schema version maps to the + new release version (e.g. `14: "0.13.0"`). **(CI: schema migration validation)** +- [ ] **Skill files** — every changed skill is committed in `pilot/mcp/skills/`, + not just in `.claude/skills/`. +- [ ] **Help / training docs** (see §8) — published for any feature on the + "user-touching" list. +- [ ] **Demo readiness** — at least one demo script (§12) covers each headline + feature. +- [ ] **CI on `dev` HEAD** — green for ≥ 24 h. **(CI: full regression incl. slow markers)** +- [ ] **Preflight eval** — blocking gate, no regression vs `main`'s baseline. + **(CI: preflight-eval blocking on `main`-bound)** +- [ ] **Performance** — drift detection p50, ingest throughput, search latency + within ±15 % of `main`'s last successful run.
**(CI: performance regression)** +- [ ] **Security scan** — `bandit` + `pip-audit` + GitHub Dependency Review + clean. **(CI: security scan)** +- [ ] **MCP protocol live smoke** — server boots, every registered tool returns + a shape-conformant response over stdio. **(CI: MCP protocol live smoke)** +- [ ] **Milestone** — every issue under it is closed. + +### 6.5 Merging the release PR + +**Strategy**: **merge-commit**, not squash. `main` is meant to preserve the +release boundary in history; a merge commit ("`Merge dev into main for +v0.13.0`") gives `git log main` a clean release-by-release walk. + +```bash +git checkout main +git pull +git merge --no-ff dev -m "release: v0.13.0" +git push +``` + +GitHub's UI "Create a merge commit" button does the same. + +### 6.6 Tagging + +Immediately after the merge: + +```bash +git tag -a v0.13.0 -m "Release v0.13.0 — CodeGenome Phase 4 (semantic drift)" +git push --tags +``` + +Tag format: `vMAJOR.MINOR.PATCH`. Annotated, never lightweight. The annotation +body is the headline sentence from the release PR. + +### 6.7 GitHub Release + +Create a Release object on GitHub from the tag (`gh release create v0.13.0` or +the UI): + +**Title**: `v0.13.0 — CodeGenome Phase 4 (semantic drift)` + +**Body**: copy/paste the CHANGELOG section for this version, then append: + +```markdown +--- + +## Documentation +- [Migration notes](https://…/docs/migrations/v0.13.md) — schema v13 → v14 +- [User guide for semantic drift evaluation](https://…/docs/guides/semantic-drift.md) +- [Demo: cosmetic-vs-semantic auto-resolve](https://…/docs/demos/04-drift-classifier.md) + +## Verification +Merkle seal: 0ebcf69b… +META_LEDGER entries: #11 (VETO), #12 (PASS), #13 (PASS post-rebase), #14 (seal) +``` + +**Attachments**: none for now (we ship via PyPI/source). When we ship binaries, +attach platform builds here. + +### 6.8 Post-release + +- Close the milestone. +- Open the next milestone (`v0.14.0`). 
+- Announce: README badge bump, project README "Latest" line, optional Slack / + Discord drop. Use the headline sentence verbatim. + +--- + +## 7. CHANGELOG.md conventions + +We follow [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) loosely. + +**Top of file at all times**: + +```markdown +## [Unreleased] + +### Added +- (work in flight that's already merged to dev) + +### Changed +### Fixed +### Schema +### Security +``` + +When Jin cuts a release, he replaces `[Unreleased]` with the version + date, +then prepends a fresh empty `[Unreleased]` block. + +**Section ordering** (preserve even when empty — drop a section only at release +flip): `Added`, `Changed`, `Deprecated`, `Removed`, `Fixed`, `Schema`, +`Security`. + +**One bullet per logical change**, not per file. User-facing language. Internal +governance details (chain hashes, verdicts) stay out of CHANGELOG; they live in +META_LEDGER. + +--- + +## 8. Documentation requirements per release + +Some features ship with code only. Some ship with code **plus** mandatory docs. +Use this matrix: + +| Feature class | User-touching? | Docs required | +|---|---|---| +| New MCP tool | yes | `pilot/mcp/skills/<skill-name>/SKILL.md` + entry in `README.md#tools` | +| New tool field / new status value | yes | Update every skill that renders the field | +| New schema migration | indirect | `docs/migrations/vN.md` — what changes, automatic or manual | +| New caller-facing helper (e.g. `ensure_ledger_synced`) | yes | `docs/guides/<feature-slug>.md` user guide | +| New deterministic primitive (e.g. continuity matcher) | yes | demo script in `docs/demos/` | +| Bug fix without behavior change | no | CHANGELOG entry only | +| Internal refactor | no | CHANGELOG entry only ("Changed: …") | +| Performance improvement | no, unless > 2× | CHANGELOG entry; `> 2×` adds a `docs/perf/` note | +| Security fix | yes | CHANGELOG `### Security` entry + `SECURITY.md` advisory if disclosed | + +**Help docs go in**: `docs/guides/<feature-slug>.md`.
Structure: + +```markdown +# <Feature> — User Guide + +## What it does +One paragraph. + +## When you'd use it +Bulleted scenarios. + +## Quickstart +Smallest end-to-end example. + +## Reference +Tool name, request shape, response shape, error modes. + +## See also +Links to related guides + demo script. +``` + +**Training docs** (longer-form, multi-step walkthroughs intended to teach a +concept, not just document a tool) go in `docs/training/<concept-slug>.md`. These are +optional unless the feature introduces a concept the user must internalize +(example: "what does `pending` vs `reflected` mean?" — that's training, not +reference). + +--- + +## 9. Skill file rule (project-specific, mandatory) + +From `CLAUDE.md`: + +> Any change to an MCP tool's behavior — new fields in a response, new status +> values, changed defaults, new tool calls, deprecated params — **must ship +> with a matching update to the relevant `pilot/mcp/skills/*/SKILL.md`** in the +> same commit. + +This is enforced at review time. `pilot/mcp/skills/` is canonical; +`.claude/skills/bicameral-*/SKILL.md` copies are stale and slated for deletion. + +--- + +## 10. Hotfix path (main → main → dev) + +When `main` has a bug that can't wait for the next release: + +``` + ┌──── tag v0.13.1 ────┐ +main ─────●─────────────────────────●─────────────────────●─────▶ + \ / \ + └── hotfix/0.13.1 ────┘ │ + │ merge or + │ cherry-pick + ▼ +dev ─────────────────────────────────────────────────────●─────▶ +``` + +1. Branch from `main` (not `dev`): `hotfix/0.13.1-<slug>`. +2. Smallest possible diff. No tangential cleanup. +3. PR targets `main`. Reviewer approves; CI green. +4. Merge to `main`, tag `v0.13.1`, GitHub Release. +5. **Immediately** sync to `dev`: either merge `main` into `dev` or cherry-pick + the hotfix commit. Resolve conflicts. Push. Don't let `dev` and `main` + diverge in opposite directions for more than an hour. + +Hotfixes never carry feature work — feature work goes through the normal +feature → dev → release cycle.
+ +### 10.5 Triage lane (`dev` → `triage-from-dev` → `main`) + +`triage-from-dev` is a long-lived **curated stable lane** that ships a *subset* +of `dev` to `main` between full releases. It exists for changes that should +reach users faster than the next minor release allows, but that aren't +emergency hotfixes (which use §10's path). + +#### 10.5.0 Why this lane exists + +The triage lane plus the §5.1 rebase-and-merge default (with squash disabled +at the repo level) together **allow for parallel development of feature work +on `dev` and selective incorporation into production based on live feedback**. + +That goal decomposes into three constraints the existing two-branch flow +(feature → dev → main) cannot satisfy on its own: + +- **Fast iteration on `dev` shouldn't gate user-visible delivery on `main`.** + Without a triage lane, every minor-release cycle is "ship the whole + integrated batch or wait." A bug fix that's ready in week one of a six-week + release cycle waits five weeks for a milestone full of unrelated work to + close. The triage lane lets ready-and-eligible work reach users on its own + cadence. +- **Live feedback should steer what reaches `main`, not just what reaches + `dev`.** When telemetry / a customer report / a security finding marks a + specific change as important, the maintainer needs to be able to ship that + change *without* shipping everything ahead of it on `dev`. Cherry-picking a + selected subset (under §10.5.1's eligibility rule) is that mechanism. +- **The merge style on `dev` must preserve cherry-pickability.** Squash + collapses a multi-commit PR into one opaque blob — fine for `dev`'s log, + fatal for backport. Rebase-and-merge keeps each commit as an individually + addressable SHA, which is the unit the §10.5.3 cherry-pick mechanic operates + on. §5.1's "squash disabled at the repo level" exists to make this + guarantee structural rather than aspirational. 
+ +Together these rules let the project hold two timelines: a fast-iteration +trunk where features can land in pieces and the team can change its mind, and +a slower curated trunk where users see only what's been deemed ready for +broad delivery. Neither trunk forces the other's cadence. + +``` +dev ────●────●────●────●────●────●─────▶ + \ \ \ + cherry-pick -x (selected commits only) + \ \ \ + ▼ ▼ ▼ +triage-from-dev ●────────●────●─────▶ ──── release PR ────▶ main +``` + +**Direction is one-way.** Cherry-picks flow `dev → triage-from-dev` only. Never +develop on `triage-from-dev` directly; never cherry-pick `triage-from-dev → +dev`. (Bugs introduced *only* on the triage lane get fixed on `dev` first, then +re-cherry-picked.) + +#### 10.5.1 Eligibility — what gets triaged + +Modeled after the Linux kernel's `stable` tree rules +([kernel.org stable rules](https://docs.kernel.org/process/stable-kernel-rules.html)). +A commit is triage-eligible if **all** of: + +- It is small and self-contained (rough guideline: ≤ 100 lines of context-diff, + one logical change). +- It is **obviously correct and tested** — the kernel's exact phrasing. +- It fixes one of: a real user-facing bug, a security regression, a build break + on a supported platform, a data-loss/corruption bug, or a documented + cross-platform quirk. Or it is a small additive feature whose risk surface is + isolated (e.g. a new optional MCP tool field with a default). +- It does not depend on `dev`-only refactors that haven't shipped to `main`. If + it does, the prerequisites must be triage-eligible too, and they all + cherry-pick as a coherent batch. + +**Not triage-eligible** by default: schema-migrating changes, breaking +public-API changes, multi-PR feature epics, "v1 patches" (the catch-all +`triage-from-dev` PR title uses for work explicitly held for the next major). + +When in doubt, the change waits for the next `dev → main` release. 
+ +#### 10.5.2 Author trailer — `Triage-Cc:` + +If you (the author) believe a commit belongs on the triage lane, add a trailer: + +``` +Triage-Cc: triage-from-dev +``` + +For commits that fix an earlier commit (kernel-style), also add: + +``` +Fixes: <sha> ("<subject>") +``` + +The release manager finds candidates with: + +```bash +git log --grep='^Triage-Cc:' origin/dev ^origin/triage-from-dev +``` + +Trailers are advisory — the release manager makes the final call — but they +make the candidate set legible without re-reading every commit message. + +#### 10.5.3 Cherry-pick mechanics + +Always use `cherry-pick -x` so the resulting commit message records its +provenance (`(cherry picked from commit <dev-sha>)`): + +```bash +git checkout triage-from-dev +git fetch origin +git cherry-pick -x <dev-sha> +# resolve conflicts narrowly — do NOT pull in unrelated dev refactors +git push origin triage-from-dev +``` + +When a cherry-pick conflicts, classify the conflict before resolving: + +- **Missing-prerequisite conflict** — the dev commit calls a function / + references a schema field / depends on a contract that does not exist on + `triage-from-dev` and is not introduced by this same commit. **Stop.** Either + pick the prerequisite first (if it is itself triage-eligible per §10.5.1) or + hold the change for the next full `dev → main` release. +- **Diverged-surface conflict** — the change's *target file* has been + refactored on dev's path between triage's branch point and the cherry-pick + source, but every symbol / schema field / contract the cherry-picked commit + *actually depends on* either already exists on triage or is additively + introduced in this same commit. **Adaptable** — see below. + +##### Adaptation clause + +A diverged-surface conflict may be resolved by manually adapting the conflict +hunks to triage's surrounding code, provided **all** of the following hold: + +1. The cherry-pick's *intent* (the conceptual change — e.g.
"route through + new adapter method", "add replay case for new event type") is preserved. + The semantic effect on triage matches the semantic effect on dev from any + external caller's POV. +2. No new logic is *invented* — every line in the resolution either comes + from the cherry-picked commit, exists on triage already, or is the + minimal mechanical glue to bridge the two (e.g. renaming a local variable + to match triage's existing identifier). +3. Each adapted hunk is annotated: + - In the **commit message** under an `Adaptation:` trailer: + `Adaptation: handlers/ratify.py — rewrote against pre-#65 inline impl` + - In the **code itself**, where the adapted block isn't trivially obvious, + with `# triage-adapt: ` immediately above the block. + +If you find yourself writing a hunk that doesn't satisfy (2) — i.e. you're +inventing logic to bridge the gap — the conflict is in fact a missing- +prerequisite conflict in disguise. Stop and reclassify. + +The release manager reviews adapted commits with extra scrutiny at the +§10.5.4 release PR; adapted commits should be a small fraction of any +triage release, and a triage cycle that's mostly adaptations is a signal +that the lane has drifted too far from `dev`. + +Resolving conflicts by inventing replacement code that does not satisfy the +adaptation clause above is forbidden — the cherry-pick must remain a faithful +subset of `dev`, modulo legitimate adaptation to a diverged surface. + +The fact that `triage-from-dev` already carries some commits with **different +SHAs than dev** (e.g. v0.14.0 telemetry, RFC #98) is sunk cost from the lane's +pre-§10.5 era. Going forward every cherry-pick uses `-x` and the audit trail +re-converges. Do **not** rewrite history on `triage-from-dev` to fix the +divergence — it is a published branch. 
+ +#### 10.5.4 Release PR (`triage-from-dev` → `main`) + +The triage release PR follows §6 with two adjustments: + +- **Title**: `release: v0.X.Y (triage)` — the patch version bumps; minor stays + pinned to whatever `main` last tagged from a full `dev → main` release. +- **Flow label**: `flow:release` (same as a full release). +- **Body** lists each cherry-picked commit with its source `dev-sha` and the + issue/PR it traces back to. + +After the triage release tags on `main`, sync `main` back to `dev` per §10 +(merge or cherry-pick — the next-release CHANGELOG flip absorbs the patch). + +--- + +## 11. Roles + +| Role | Owner | Responsibilities | +|---|---|---| +| **Contributor** | anyone | Open issues, branch off `dev`, open PRs to `dev`, address review feedback, keep skill files in sync. | +| **Reviewer** | code-owners | Block on red CI, Razor violations, missing skill updates, missing Plan/Audit/Seal references on non-trivial PRs. | +| **Release manager** | Jin | Decide release cadence, open release PR, run pre-release checklist, merge to `main`, tag, publish GitHub Release, manage milestones. | +| **Doc steward** | rotating | Verify the §8 matrix is satisfied before each release. | +| **Governance steward** | QOR-chain owner | Verify META_LEDGER chain integrity at each release seal. | + +Single-maintainer fallback: if Jin is offline, the release waits. We do not +unilaterally promote `dev` → `main`. + +--- + +## 12. Demo scripts + +Every shipped feature should have at least one runnable demo that takes a +viewer from "I don't know what this does" to "I see the value" in under 5 +minutes. Demos live in `docs/demos/<NN>-<slug>.md` and follow the same template: + +```markdown +# Demo NN: <Title> + +**Audience**: <e.g. "first-time evaluator"> +**Time**: <≤ 5 min> +**Prereqs**: <repo cloned, deps installed, MCP server running> + +## What you'll see +1-paragraph spoiler. + +## Setup +Copy-pasteable shell block.
+ +## Walkthrough +Numbered steps, each with the exact tool call / command and the expected +output (truncated where it makes sense). + +## What just happened +Plain-English read of the result. Tie it back to the user-value claim. + +## Next +Pointer to the user guide and related demos. +``` + +Below: four demo scripts that cover the project's headline functionality. Each +one should be authored as a standalone file and kept in sync with the matching +skill / tool. + +### Demo 01 — First decision bind, search, drift detect + +**Path**: `docs/demos/01-first-bind.md` +**Audience**: "I just installed bicameral-mcp; what's the loop?" + +**Storyline**: + +1. `bicameral.bind` a decision: *"all monetary calculations use `Decimal`, + never `float`"*. Show that the tool returns a region-id and a content hash. +2. `bicameral.search_decisions` for the keyword `"monetary"`. Show the just-bound + decision returns at the top. +3. Edit the bound region: change `Decimal` to `float` in the linked file. +4. `bicameral.detect_drift`. Show that the region surfaces with status + `drifted`. +5. Restore the file. Re-run. Status flips back to `reflected`. + +**Value claim**: "Your decisions are now first-class artifacts — searchable, +hash-anchored, and drift-detected without you running anything by hand." + +### Demo 02 — Commit-sync loop (post-commit hook → resolve_compliance) + +**Path**: `docs/demos/02-commit-sync.md` +**Audience**: "How does this play with my actual git workflow?" + +**Storyline**: + +1. Show the post-commit hook installed (`.git/hooks/post-commit`) calling + `bicameral-mcp link_commit HEAD`. +2. Edit a bound region. `git commit`. +3. Show the hook output: `bicameral: new commit detected`. +4. Show `_pending_compliance_checks` injected into the next tool response. +5. Walk through the `bicameral-sync` skill: read region → reason → batched + `resolve_compliance(verdicts=[...])`. +6. Show the final ledger state: N reflected, N drifted, 0 pending. 
+ +**Value claim**: "Compliance is computed automatically on every commit, not +quarterly by a human auditor." + +### Demo 03 — Continuity matcher: function rename auto-redirect (Phase 3) + +**Path**: `docs/demos/03-continuity-rename.md` +**Audience**: "What happens when I refactor?" + +**Storyline**: + +1. Bind a decision to a function `calculate_tax_v1`. +2. Rename the function to `compute_tax`. Move it to a different file. Commit. +3. Naïvely: the binding would orphan and the decision would go `ungrounded`. +4. With `BICAMERAL_CODEGENOME_ENHANCE_DRIFT=1`: `link_commit` runs the + continuity matcher pre-pass. +5. Show the response's `continuity_resolutions` list: + `semantic_status: identity_renamed`, the binding redirected, no manual + action needed. + +**Value claim**: "Refactoring no longer breaks your decision graph. The matcher +recognises moved or renamed code and updates bindings automatically." + +### Demo 04 — Cosmetic-vs-semantic drift classifier (Phase 4) + +**Path**: `docs/demos/04-drift-classifier.md` +**Audience**: "Why does this not flag every whitespace change as drift?" + +**Storyline**: + +1. Bind a decision to a function. Capture the baseline ledger state. +2. **Cosmetic change**: re-format the docstring; re-order imports. Commit. + Run `link_commit`. Show `auto_resolved_count: 1`, status flips to + `compliant` with `semantic_status: semantically_preserved`. Zero LLM calls. +3. **Semantic change**: change the threshold inside the function from 100 + to 50. Commit. Run `link_commit`. Show the region appears in + `pending_compliance_checks` with a `pre_classification` hint + (`verdict: uncertain`, signals breakdown). +4. Walk through the LLM-side reasoning the `bicameral-sync` skill applies to + issue the `drifted` verdict. +5. Show the M3 benchmark: 30 cases × 7 languages, 0% false-positive rate on + the cosmetic-only set. 
+ +**Value claim**: "The classifier handles the easy 80% deterministically, leaves +only genuinely ambiguous cases for the LLM, and never costs you a token on a +docstring tweak." + +### Authoring rules for new demos + +- Run the demo end-to-end on a fresh clone before committing it. Demos that + drift become anti-marketing. +- If the demo depends on a feature flag (`BICAMERAL_CODEGENOME_ENHANCE_DRIFT`, + etc.), say so in **Prereqs**. +- If the demo records output, store the recording in `docs/demos/recordings/` + next to the script. Keep recordings under 30 MB. +- Update the demo whenever the underlying tool's response shape changes — + this is enforced under §9 (skill rule). + +--- + +## 13. When in doubt + +- **"Does this need a release PR?"** — If `main`'s SHA would change, yes. +- **"Should I close this issue?"** — `Closes #X` in the PR body, then yes + (auto on dev-merge). +- **"Should I bump the version?"** — Only Jin bumps the version, only at + release time. +- **"Can I commit a skill change separately from the tool change?"** — No. + Same commit, same PR. +- **"Should I write a guide for this?"** — Use the §8 matrix. If the row says + "yes", yes. +- **"Is this a hotfix or a feature?"** — Hotfix is for a regression on `main` + that broke a user. Everything else is a feature. + +--- + +**Owner**: Jin (release manager) + repo maintainers. +**Last reviewed**: 2026-04-29. +**Change protocol**: amendments require a META_LEDGER entry + a PR labeled +`docs:dev-cycle`. diff --git a/docs/demos/README.md b/docs/demos/README.md new file mode 100644 index 00000000..eab196b0 --- /dev/null +++ b/docs/demos/README.md @@ -0,0 +1,53 @@ +# Demos + +Runnable, ≤ 5-minute walkthroughs of headline functionality. Each demo takes a +viewer from "I don't know what this does" to "I see the value" without leaving +the file. + +See [`docs/DEV_CYCLE.md` §12](../DEV_CYCLE.md#12-demo-scripts) for the +authoring rules and the demo template. 
+ +## Index + +| # | Title | Audience | Status | +|---|---|---|---| +| 01 | First decision bind, search, drift detect | "what's the loop?" | planned | +| 02 | Commit-sync hook → resolve_compliance | "how does it play with git?" | planned | +| 03 | Continuity matcher: function rename auto-redirect (Phase 3) | "what about refactors?" | planned | +| 04 | Cosmetic-vs-semantic drift classifier (Phase 4) | "why no whitespace false-flags?" | planned | +| — | [v0 user flow e2e (split-screen)](./v0-userflow-e2e.md) | "what does the loop look like end-to-end?" | live (manual workflow) | + +## Authoring rules (summary) + +- Run the demo end-to-end on a fresh clone before committing it. +- If the demo depends on a feature flag (e.g. + `BICAMERAL_CODEGENOME_ENHANCE_DRIFT`), say so in **Prereqs**. +- Recordings (≤ 30 MB) live in `recordings/` next to the script. +- Update the demo whenever the underlying tool's response shape changes — + enforced by the skill rule in `DEV_CYCLE.md` §9. + +## Template + +```markdown +# Demo NN: <Title> + +**Audience**: <e.g. "first-time evaluator"> +**Time**: <≤ 5 min> +**Prereqs**: <repo cloned, deps installed, MCP server running> + +## What you'll see +1-paragraph spoiler. + +## Setup +Copy-pasteable shell block. + +## Walkthrough +Numbered steps, each with the exact tool call / command and the expected +output (truncated where it makes sense). + +## What just happened +Plain-English read of the result. Tie it back to the user-value claim. + +## Next +Pointer to the user guide and related demos. +``` diff --git a/docs/demos/v0-userflow-e2e.md b/docs/demos/v0-userflow-e2e.md new file mode 100644 index 00000000..cf951470 --- /dev/null +++ b/docs/demos/v0-userflow-e2e.md @@ -0,0 +1,101 @@ +# Demo: v0 user flow e2e (split-screen, two views) + +**Audience**: first-time evaluators who want to see the loop without running it. +**Time**: ~6 min PM view, ~10 min Dev view. +**Prereqs**: none — videos play in any browser. 
+ +## What you'll see + +A continuous Claude Code CLI session — recorded once, then split in post +into two persona-shaped videos: + +- **Left pane** of the recording — `xterm` running `claude -p <composite-prompt>` + with `bicameral-mcp` registered as the only MCP server. The LLM's reasoning, + tool calls, and outputs render in real time via a small stream-json formatter. +- **Right pane** — `chromium` pointed at the bicameral dashboard sidecar + (`http://localhost:<port>`). Live SSE updates as the session emits ledger + writes. **Because both PM scenes and the Dev scene share one MCP process, + the dashboard state in the post-implementation chapter literally reflects + the commits the dev made on screen** — not a re-hydration from a separate + ledger. + +### `pm.mp4` (PM view) + +| Chapter | Tools used | What's on screen | +|---|---|---| +| 1. Post-meeting | `bicameral.dashboard`, `bicameral.ingest`, `bicameral.ratify` | PM ingests three GitHub Desktop roadmap decisions; the dashboard fills with proposed-then-ratified entries. | +| _Transition slide_ | _(ffmpeg-generated)_ | "Dev now implements the change → Returning to PM after the implementation has landed." | +| 2. Post-implementation | `bicameral.history`, `bicameral.ratify` | PM calls `history`; the cherry-pick decision now shows `status=reflected` (was pending). PM ratifies the post-implementation state. | + +### `dev.mp4` (Dev view) + +| Step | Tool | What's on screen | +|---|---|---| +| 1 | `bicameral.preflight` | Surfaces the cherry-pick decision before any edit. | +| 2 | `Edit` | Single-line annotation added to `app/src/lib/git/cherry-pick.ts`. | +| 3 | `Bash` (`git add` + `git commit`) | Real commit on the desktop/desktop fixture. | +| 4 | `bicameral.link_commit` | Detects drift candidates against decisions bound to that file. | +| 5 | `bicameral.resolve_compliance` | Verdict per pending compliance check (compliant / drifted / not_relevant). 
| +| 6 | `bicameral.ingest` (source=agent_session) | Captures any session-end corrections. | + +A third file, `full.mp4`, contains the full unbroken arc — useful if you +want to see the Dev's commits land in the dashboard without the +transition cut. + +## How to access the latest demos + +The MP4s are generated on demand and **not committed to git** — they live in +the `v0-user-flow-e2e-demos` artifact attached to the manual workflow run. + +1. Open the [v0 user flow e2e workflow runs](../../../../actions/workflows/v0-user-flow-e2e.yml). +2. Filter to runs triggered via "Run workflow" with `record_demo = true`. +3. Scroll to the run's **Artifacts** section, download `v0-user-flow-e2e-demos`. +4. Unzip → `pm.mp4`, `dev.mp4`, `full.mp4`. + +Artifact retention is 90 days. On a release cut (per +[`docs/DEV_CYCLE.md` §6.7](../DEV_CYCLE.md#67-github-release)), the maintainer +attaches the latest demos to the GitHub release for permanent URLs. + +## How to record a fresh set + +Demos are intentionally manual — not gated on every PR — because they cost +~25–35 minutes wall + Claude API spend per run. + +1. Trigger via the workflow's **Run workflow** dropdown (UI), or: + ```bash + gh workflow run v0-user-flow-e2e.yml -f record_demo=true + ``` +2. Wait for the run to finish. The assertion step still runs first and is + the authority on pass/fail; the recording step is `continue-on-error`, + so a flake never blocks merge. +3. Download the `v0-user-flow-e2e-demos` artifact as above. + +## How the split works + +`tests/e2e/record_demo.sh` runs one continuous claude session driven by +`tests/e2e/prompts/composite-demo.md` (three scenes: PM-pre, Dev, PM-post). +The session's stream-json output is piped through +`tests/e2e/demo_renderer.py`, which: + +1. Pretty-prints to stdout so the xterm shows readable text. +2. 
Watches the tool-call timeline and writes wall-clock timestamps to + `composite-demo-scenes.txt` at two boundaries: + - **Scene 1 → 2** = first `bicameral.preflight` call (Dev starts). + - **Scene 2 → 3** = first `bicameral.history` call after any + `bicameral.link_commit` (PM resumes). +3. Persists the raw stream-json transcript for forensic review. + +After ffmpeg stops, the script trims `full.mp4` at those two timestamps +into `pm-pre`, `dev`, `pm-post`, generates a 4-second transition slide via +`drawtext`, and concats `pm-pre + transition + pm-post → pm.mp4`. + +If scene markers are missing (e.g., the LLM declined a step), the script +falls back to keeping `full.mp4` only — the recording is preserved but +the split is skipped. + +## Next + +- [End-to-end suite README](../../tests/e2e/README.md) — the assertion-only + path that runs on every qualifying PR. +- [`#108` spec](https://github.com/BicameralAI/bicameral/issues/108) — the + six canonical flows the composite prompt orchestrates. diff --git a/docs/guides/README.md b/docs/guides/README.md new file mode 100644 index 00000000..1b16b769 --- /dev/null +++ b/docs/guides/README.md @@ -0,0 +1,45 @@ +# User Guides + +Reference-style documentation for individual features. Pairs with the demos in +`docs/demos/` (which show *how it feels*) by answering *what it does, when to +use it, and what every field means*. + +See [`docs/DEV_CYCLE.md` §8](../DEV_CYCLE.md#8-documentation-requirements-per-release) +for when a guide is required by the release process. + +## Index + +| Topic | Surface | Status | +|---|---|---| +| (none yet) | — | — | + +## Template + +```markdown +# <Feature> — User Guide + +## What it does +One paragraph. + +## When you'd use it +Bulleted scenarios. + +## Quickstart +Smallest end-to-end example. + +## Reference +Tool name, request shape, response shape, error modes. + +## See also +Links to related guides + demo script. 
+``` + +## Authoring rules + +- One guide per feature, named `<feature-slug>.md`. +- Guides are reference, not tutorial — show field shapes and error modes + exhaustively. Tutorial-style content belongs in `docs/training/`. +- A guide referenced by a release PR's documentation checklist must exist by + the time the release PR opens, not later. +- When a tool's response shape changes, update the matching guide in the same + commit (per `DEV_CYCLE.md` §9 skill rule). diff --git a/docs/preflight-failure-scenarios.md b/docs/preflight-failure-scenarios.md index 7a30bb3e..1543e9cc 100644 --- a/docs/preflight-failure-scenarios.md +++ b/docs/preflight-failure-scenarios.md @@ -54,7 +54,7 @@ Status legend: | **M3** | skill | Internal acronym / jargon | Decision: *"Audit log captures every admin action..."* / Topic: `SOC2 compliance trail` | ⚪ | | **M4** | skill | Ungrounded decision (no `binds_to`) — only surfaces if skill judges its feature group relevant from history | Decision (status=ungrounded): *"Permission checks always run server-side"* / Topic: `permission middleware client check` | ⚪ | | **M5** | handler | Region-anchored miss — caller didn't pass `file_paths` | Topic: `update auth config` / `file_paths=[]` — handler returns no region matches; only HITL/guided can fire | ⚪ acknowledged caller responsibility; HITL still global | -| **M6** | handler | Transitive — decision pinned to a dependency of `file_paths` | Decision pinned to `auth/jwt.py` / `file_paths=["auth/login_handler.py"]` (imports `jwt`) | ❌ region lookup only sees the direct file | +| **M6** | handler | Transitive — decision pinned to a dependency of `file_paths` | Decision pinned to `auth/jwt.py` / `file_paths=["auth/login_handler.py"]` (imports `jwt`) | ✅ closed by #173/#174 — `_region_anchored_preflight` expands `file_paths` by 1 hop along import edges before the `binds_to` lookup; expansion-only matches surface with `confidence=0.7` and `sources_chained` adds `"graph"` | | **M7** | handler | 
Dedup-key coarseness — current key is `(topic)`; same topic with changed `file_paths`, new HITL state, or a fresh ledger revision is silenced | (a) Topic re-asked after a relevant decision lands; (b) topic kept stable while `file_paths` shifts to a different region; (c) HITL condition resolves mid-window | ❌ open — broaden cache key to `(topic, normalized_file_paths, ledger_revision)` and invalidate on HITL change | | **M8** | meta | Skill skips `bicameral.history()` despite non-empty ledger (skill-step adherence drift) | Caller LLM jumps straight to `bicameral.preflight` and never reads history | ⛔ skill-conformance, not handler-eval scope | | **M9** | meta | `BICAMERAL_PREFLIGHT_MUTE` set, developer forgot it's on | Env var carried over from prior debug session | ⛔ intentional kill switch | diff --git a/docs/training/README.md b/docs/training/README.md new file mode 100644 index 00000000..a0896f97 --- /dev/null +++ b/docs/training/README.md @@ -0,0 +1,61 @@ +# Training + +Long-form, multi-step walkthroughs that teach a *concept*, not a tool. Use +training docs when a feature introduces an idea the user must internalise +before the reference docs make sense. + +Examples of concepts that warrant training: + +- *"What does `pending` vs `reflected` vs `drifted` vs `ungrounded` actually + mean, and how does the ledger derive each?"* +- *"What's a content-hash CAS guard, why does the server reject your verdict + when it doesn't match, and how do you recover?"* +- *"How does the continuity matcher decide a renamed function is the same + identity?"* + +If the answer fits in a guide's intro paragraph, it's a guide, not a training +doc. + +See [`docs/DEV_CYCLE.md` §8](../DEV_CYCLE.md#8-documentation-requirements-per-release) +for when training is required by the release process (rule of thumb: only when +the feature introduces a concept, not just a tool). 
+ +## Index + +| Topic | Status | +|---|---| +| [Cosmetic vs semantic drift](./cosmetic-vs-semantic.md) | Active | + +## Template + +```markdown +# <Concept> — Training + +## Why this exists +Two sentences. The mental-model gap this doc closes. + +## Prerequisites +What the reader should already understand or have read. + +## The concept +The actual teaching content. Use diagrams, worked examples, anti-examples. +Be willing to spend 1000+ words if the concept is load-bearing. + +## Worked example +End-to-end scenario tying the concept to a real tool call. + +## Common pitfalls +Numbered list of mistakes people make and the corrected behaviour. + +## See also +Links to relevant guides, demos, and source files. +``` + +## Authoring rules + +- Training docs are not release-blocking unless `DEV_CYCLE.md` §8 says so for + the specific feature class. +- One concept per file. If you find yourself splitting into Part 1 / Part 2, + the concept is probably two concepts. +- Reviewers may push back on training that overlaps with an existing guide — + guides are the canonical reference; training is supplementary. 
diff --git a/events/materializer.py b/events/materializer.py index cd0bbf24..6ebe90f9 100644 --- a/events/materializer.py +++ b/events/materializer.py @@ -91,7 +91,8 @@ async def replay_new_events(self, inner_adapter) -> int: replayed += 1 elif etype == "link_commit.completed": await inner_adapter.ingest_commit( - payload.get("commit_hash", ""), payload.get("repo_path", ""), + payload.get("commit_hash", ""), + payload.get("repo_path", ""), ) replayed += 1 elif etype == "decision_ratified.completed": diff --git a/events/team_adapter.py b/events/team_adapter.py index a4ecfae0..3a433e57 100644 --- a/events/team_adapter.py +++ b/events/team_adapter.py @@ -8,9 +8,8 @@ from __future__ import annotations import logging -from pathlib import Path -from ledger.queries import find_decision_by_canonical_id, get_canonical_id +from ledger.queries import get_canonical_id from .materializer import EventMaterializer from .writer import EventFileWriter @@ -122,13 +121,16 @@ async def bind_decision( ) -> dict: """Emit bind event, then delegate to inner adapter.""" await self._ensure_ready() - self._writer.write("bind_decision.completed", { - "decision_id": decision_id, - "file_path": file_path, - "symbol_name": symbol_name, - "start_line": start_line, - "end_line": end_line, - }) + self._writer.write( + "bind_decision.completed", + { + "decision_id": decision_id, + "file_path": file_path, + "symbol_name": symbol_name, + "start_line": start_line, + "end_line": end_line, + }, + ) return await self._inner.bind_decision( decision_id=decision_id, file_path=file_path, diff --git a/events/writer.py b/events/writer.py index fc78965d..6abd159d 100644 --- a/events/writer.py +++ b/events/writer.py @@ -17,9 +17,9 @@ import logging import subprocess import sys -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path -from typing import Any, IO +from typing import IO, Any from pydantic import BaseModel, Field @@ -71,10 +71,11 @@ def _unlock(f: 
IO[bytes]) -> None: class EventEnvelope(BaseModel): """One event line in ``{email}.jsonl``.""" + schema_version: int = 2 event_type: str author: str - timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC)) payload: dict[str, Any] = Field(default_factory=dict) @@ -83,7 +84,10 @@ def _get_git_email(repo_path: str | Path) -> str: try: result = subprocess.run( ["git", "config", "user.email"], - capture_output=True, text=True, timeout=5, cwd=str(repo_path), + capture_output=True, + text=True, + timeout=5, + cwd=str(repo_path), ) email = result.stdout.strip() if email: @@ -117,7 +121,9 @@ def path(self) -> Path: def write(self, event_type: str, payload: dict[str, Any]) -> Path: """Append one event line. Returns the JSONL file path.""" envelope = EventEnvelope( - event_type=event_type, author=self._author, payload=payload, + event_type=event_type, + author=self._author, + payload=payload, ) line = json.dumps(envelope.model_dump(), separators=(",", ":"), default=str) + "\n" with open(self._path, "ab") as f: diff --git a/handlers/action_hints.py b/handlers/action_hints.py index 8fc8f2a5..ad0d2bb8 100644 --- a/handlers/action_hints.py +++ b/handlers/action_hints.py @@ -41,7 +41,6 @@ SearchDecisionsResponse, ) - # ── Message variants ─────────────────────────────────────────────── @@ -127,27 +126,26 @@ def generate_hints_for_search( drifted = [m for m in response.matches if m.status == "drifted"] if drifted: - files = sorted({ - r.file_path - for m in drifted - for r in m.code_regions - if r.file_path - }) - hints.append(ActionHint( - kind="review_drift", - message=_drift_message(len(drifted), guided_mode), - blocking=guided_mode, - refs=[m.decision_id for m in drifted] + files, - )) + files = sorted({r.file_path for m in drifted for r in m.code_regions if r.file_path}) + hints.append( + ActionHint( + kind="review_drift", + message=_drift_message(len(drifted), guided_mode), + 
blocking=guided_mode, + refs=[m.decision_id for m in drifted] + files, + ) + ) ungrounded = [m for m in response.matches if m.status == "ungrounded"] if ungrounded: - hints.append(ActionHint( - kind="ground_decision", - message=_ground_message(len(ungrounded), guided_mode), - blocking=guided_mode, - refs=[m.decision_id for m in ungrounded], - )) + hints.append( + ActionHint( + kind="ground_decision", + message=_ground_message(len(ungrounded), guided_mode), + blocking=guided_mode, + refs=[m.decision_id for m in ungrounded], + ) + ) return hints @@ -173,21 +171,25 @@ def generate_hints_for_scan_branch( # a symbol but not a file_path directly — fall back to the # response-level files_changed list when per-entry file refs # aren't available. - hints.append(ActionHint( - kind="review_drift", - message=_drift_message(len(drifted), guided_mode), - blocking=guided_mode, - refs=[d.decision_id for d in drifted] + response.files_changed, - )) + hints.append( + ActionHint( + kind="review_drift", + message=_drift_message(len(drifted), guided_mode), + blocking=guided_mode, + refs=[d.decision_id for d in drifted] + response.files_changed, + ) + ) ungrounded = [d for d in response.decisions if d.status == "ungrounded"] if ungrounded: - hints.append(ActionHint( - kind="ground_decision", - message=_ground_message(len(ungrounded), guided_mode), - blocking=guided_mode, - refs=[d.decision_id for d in ungrounded], - )) + hints.append( + ActionHint( + kind="ground_decision", + message=_ground_message(len(ungrounded), guided_mode), + blocking=guided_mode, + refs=[d.decision_id for d in ungrounded], + ) + ) return hints @@ -211,31 +213,34 @@ def generate_hints_from_findings( hints: list[ActionHint] = [] if divergences: - hints.append(ActionHint( - kind="resolve_divergence", - message=_divergence_message(len(divergences), guided_mode), - blocking=guided_mode, - refs=[f"{d.symbol} ({d.file_path})" for d in divergences], - )) + hints.append( + ActionHint( + kind="resolve_divergence", + 
message=_divergence_message(len(divergences), guided_mode), + blocking=guided_mode, + refs=[f"{d.symbol} ({d.file_path})" for d in divergences], + ) + ) if drift_candidates: - hints.append(ActionHint( - kind="review_drift", - message=_drift_message(len(drift_candidates), guided_mode), - blocking=guided_mode, - refs=[d.decision_id for d in drift_candidates], - )) - - open_q_gaps = [ - g for g in gaps - if "open-question" in g.hint or "open question" in g.hint - ] + hints.append( + ActionHint( + kind="review_drift", + message=_drift_message(len(drift_candidates), guided_mode), + blocking=guided_mode, + refs=[d.decision_id for d in drift_candidates], + ) + ) + + open_q_gaps = [g for g in gaps if "open-question" in g.hint or "open question" in g.hint] if open_q_gaps: - hints.append(ActionHint( - kind="answer_open_questions", - message=_open_questions_message(len(open_q_gaps), guided_mode), - blocking=guided_mode, - refs=[g.description[:140] for g in open_q_gaps], - )) + hints.append( + ActionHint( + kind="answer_open_questions", + message=_open_questions_message(len(open_q_gaps), guided_mode), + blocking=guided_mode, + refs=[g.description[:140] for g in open_q_gaps], + ) + ) return hints diff --git a/handlers/analysis.py b/handlers/analysis.py index dba8970d..24ce7d22 100644 --- a/handlers/analysis.py +++ b/handlers/analysis.py @@ -17,7 +17,6 @@ DecisionMatch, ) - # ── Divergence detection heuristics ───────────────────────────────── _NEGATION_PAIRS: list[tuple[str, str]] = [ @@ -39,14 +38,18 @@ ] _DIVERGENCE_TOKENS = { - " vs ", " vs. ", " or ", "instead of", "rather than", + " vs ", + " vs. 
", + " or ", + "instead of", + "rather than", } def _descriptions_conflict(descriptions: list[str]) -> bool: lower = [d.lower() for d in descriptions] for i, a in enumerate(lower): - for b in lower[i + 1:]: + for b in lower[i + 1 :]: for left, right in _NEGATION_PAIRS: if (left in a and right in b) or (left in b and right in a): return True @@ -87,8 +90,14 @@ def _detect_divergences(matches: list[DecisionMatch]) -> list[BriefDivergence]: # ── Gap extraction heuristic ───────────────────────────────────────── _OPEN_QUESTION_MARKERS = ( - "?", " tbd", " tbh", " vs ", " vs. ", - "open question", "should we", "which one", + "?", + " tbd", + " tbh", + " vs ", + " vs. ", + "open question", + "should we", + "which one", ) @@ -102,23 +111,28 @@ def _extract_gaps(matches: list[DecisionMatch]) -> list[BriefGap]: gaps: list[BriefGap] = [] for m in matches: if _looks_like_open_question(m.description): - gaps.append(BriefGap( - description=m.description, - hint="open-question phrasing (vs/or/tbd/?)", - relevant_source_refs=[m.source_ref] if m.source_ref else [], - )) + gaps.append( + BriefGap( + description=m.description, + hint="open-question phrasing (vs/or/tbd/?)", + relevant_source_refs=[m.source_ref] if m.source_ref else [], + ) + ) continue if m.status == "ungrounded": - gaps.append(BriefGap( - description=m.description, - hint="decision recorded but no code grounding — needs implementation or clarification", - relevant_source_refs=[m.source_ref] if m.source_ref else [], - )) + gaps.append( + BriefGap( + description=m.description, + hint="decision recorded but no code grounding — needs implementation or clarification", + relevant_source_refs=[m.source_ref] if m.source_ref else [], + ) + ) return gaps # ── Shape conversion ───────────────────────────────────────────────── + def _to_brief_decision(m: DecisionMatch) -> BriefDecision: return BriefDecision( decision_id=m.decision_id, diff --git a/handlers/bind.py b/handlers/bind.py index c5f91ac1..236d4aae 100644 --- 
a/handlers/bind.py +++ b/handlers/bind.py @@ -1,7 +1,9 @@ """Handler for bicameral.bind — caller-LLM-driven code region binding.""" from __future__ import annotations + import logging + from contracts import BindResponse, BindResult, PendingComplianceCheck, SyncMetrics from handlers.sync_middleware import repo_write_barrier @@ -48,6 +50,7 @@ async def _do_bind(ctx, bindings: list[dict]) -> BindResponse: effective_ref = authoritative_sha if head_sha and head_sha not in ("HEAD", ""): from handlers.link_commit import _is_ephemeral_commit + if _is_ephemeral_commit(head_sha, repo, authoritative_ref): effective_ref = head_sha @@ -62,46 +65,68 @@ async def _do_bind(ctx, bindings: list[dict]) -> BindResponse: purpose = str(b.get("purpose") or "") if not decision_id or not file_path or not symbol_name: - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error="decision_id, file_path, and symbol_name are required", - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error="decision_id, file_path, and symbol_name are required", + ) + ) continue try: exists = await ledger.decision_exists(decision_id) except Exception as exc: - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error=f"decision lookup failed: {exc}", - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error=f"decision lookup failed: {exc}", + ) + ) continue if not exists: - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error=f"unknown_decision_id: {decision_id}", - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error=f"unknown_decision_id: {decision_id}", + ) + ) continue if start_line is None or end_line is None: from ledger.status import resolve_symbol_lines + resolved = resolve_symbol_lines(file_path, symbol_name, repo, ref=effective_ref) if 
resolved is None: - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error=f"symbol '{symbol_name}' not found in {file_path} at {effective_ref}", - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error=f"symbol '{symbol_name}' not found in {file_path} at {effective_ref}", + ) + ) continue start_line, end_line = resolved else: start_line, end_line = int(start_line), int(end_line) from ledger.status import get_git_content + if get_git_content(file_path, 1, 1, repo, ref=effective_ref) is None: - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error=f"file '{file_path}' does not exist at {effective_ref} — only bind to existing code, never hypothetical files", - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error=f"file '{file_path}' does not exist at {effective_ref} — only bind to existing code, never hypothetical files", + ) + ) continue try: @@ -117,10 +142,14 @@ async def _do_bind(ctx, bindings: list[dict]) -> BindResponse: ) except Exception as exc: logger.warning("[bind] bind_decision failed: %s", exc) - results.append(BindResult( - decision_id=decision_id, region_id="", content_hash="", - error=str(exc), - )) + results.append( + BindResult( + decision_id=decision_id, + region_id="", + content_hash="", + error=str(exc), + ) + ) continue region_id = bind_result["region_id"] @@ -151,11 +180,13 @@ async def _do_bind(ctx, bindings: list[dict]) -> BindResponse: except Exception as exc: logger.warning( "[bind] decision_level lookup failed for %s: %s — skipping codegenome write", - decision_id, exc, + decision_id, + exc, ) level = None # treat lookup failure as "skip" — safer than over-writing if level == "L2": from codegenome.bind_service import write_codegenome_identity + try: await write_codegenome_identity( ledger=ledger, @@ -172,12 +203,14 @@ async def _do_bind(ctx, bindings: list[dict]) 
-> BindResponse: except Exception as exc: logger.warning( "[bind] codegenome identity write failed for %s: %s", - decision_id, exc, + decision_id, + exc, ) else: logger.debug( "[bind] L1 exemption — skipping codegenome write for %s (decision_level=%r)", - decision_id, level, + decision_id, + level, ) pending_check = None @@ -196,15 +229,18 @@ async def _do_bind(ctx, bindings: list[dict]) -> BindResponse: content_hash=content_hash, ) - results.append(BindResult( - decision_id=decision_id, - region_id=region_id, - content_hash=content_hash, - pending_compliance_check=pending_check, - )) + results.append( + BindResult( + decision_id=decision_id, + region_id=region_id, + content_hash=content_hash, + pending_compliance_check=pending_check, + ) + ) try: from dashboard.server import notify_dashboard + await notify_dashboard(ctx) except Exception: pass diff --git a/handlers/decision_status.py b/handlers/decision_status.py index 68a06179..23f701a0 100644 --- a/handlers/decision_status.py +++ b/handlers/decision_status.py @@ -7,7 +7,7 @@ from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from contracts import CodeRegionSummary, DecisionStatusEntry, DecisionStatusResponse @@ -23,6 +23,7 @@ async def handle_decision_status( # Auto-sync to HEAD so status reflects current code state try: from handlers.link_commit import handle_link_commit + await handle_link_commit(ctx, ref) except Exception as exc: logger.warning("[status] auto-sync failed: %s", exc) @@ -50,26 +51,28 @@ async def handle_decision_status( ] _signoff = d.get("signoff") or {} - entries.append(DecisionStatusEntry( - decision_id=d["decision_id"], - description=d["description"], - status=status, - signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), - source_type=d.get("source_type", ""), - source_ref=d.get("source_ref", ""), - ingested_at=d.get("ingested_at", ""), - code_regions=regions, - 
drift_evidence=d.get("drift_evidence", ""), - blast_radius=d.get("blast_radius", []), - source_excerpt=d.get("source_excerpt", ""), - meeting_date=d.get("meeting_date", ""), - speakers=d.get("speakers", []), - signoff=d.get("signoff"), - )) + entries.append( + DecisionStatusEntry( + decision_id=d["decision_id"], + description=d["description"], + status=status, + signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), + source_type=d.get("source_type", ""), + source_ref=d.get("source_ref", ""), + ingested_at=d.get("ingested_at", ""), + code_regions=regions, + drift_evidence=d.get("drift_evidence", ""), + blast_radius=d.get("blast_radius", []), + source_excerpt=d.get("source_excerpt", ""), + meeting_date=d.get("meeting_date", ""), + speakers=d.get("speakers", []), + signoff=d.get("signoff"), + ) + ) return DecisionStatusResponse( ref=ref, - as_of=datetime.now(timezone.utc).isoformat(), + as_of=datetime.now(UTC).isoformat(), summary=summary, decisions=entries, ) diff --git a/handlers/detect_drift.py b/handlers/detect_drift.py index 05341811..5045aa1f 100644 --- a/handlers/detect_drift.py +++ b/handlers/detect_drift.py @@ -42,7 +42,7 @@ def _resolve_subjects_eligible(decision: dict) -> bool: """ level = decision.get("decision_level") if level is None: - return True # pre-v0.9.3 decisions: eligible by default for backward compat + return True # pre-v0.9.3 decisions: eligible by default for backward compat return level == "L2" @@ -73,18 +73,20 @@ def raw_decisions_to_drift_entries( counts["ungrounded"] += 1 _signoff = d.get("signoff") or {} - entries.append(DriftEntry( - decision_id=d["decision_id"], - description=d["description"], - status=status, - signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), - symbol=region.get("symbol", ""), - lines=tuple(region.get("lines", (0, 0))), - drift_evidence=drift_evidence, - source_ref=d.get("source_ref", ""), - source_excerpt=d.get("source_excerpt", ""), - 
meeting_date=d.get("meeting_date", ""), - )) + entries.append( + DriftEntry( + decision_id=d["decision_id"], + description=d["description"], + status=status, + signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), + symbol=region.get("symbol", ""), + lines=tuple(region.get("lines", (0, 0))), + drift_evidence=drift_evidence, + source_ref=d.get("source_ref", ""), + source_excerpt=d.get("source_excerpt", ""), + meeting_date=d.get("meeting_date", ""), + ) + ) return entries, counts @@ -101,12 +103,8 @@ async def handle_detect_drift( if os.getenv("USE_REAL_CODE_LOCATOR", "0") == "1": abs_path = str((Path(ctx.repo_path) / file_path).resolve()) all_symbols = await ctx.code_graph.extract_symbols(abs_path) - decision_symbols = { - d.get("code_region", {}).get("symbol", "") for d in raw_decisions - } - undocumented = [ - s["name"] for s in all_symbols if s["name"] not in decision_symbols - ] + decision_symbols = {d.get("code_region", {}).get("symbol", "") for d in raw_decisions} + undocumented = [s["name"] for s in all_symbols if s["name"] not in decision_symbols] else: undocumented = await ctx.ledger.get_undocumented_symbols(file_path) @@ -188,7 +186,12 @@ def _enrich_with_cosmetic_hints( head_range = resolve_symbol_lines(file_path, entry.symbol, repo_path, ref="HEAD") wt_range = resolve_symbol_lines(file_path, entry.symbol, repo_path, ref="working_tree") except Exception as exc: - logger.debug("[detect_drift] resolve_symbol_lines failed for %s/%s: %s", file_path, entry.symbol, exc) + logger.debug( + "[detect_drift] resolve_symbol_lines failed for %s/%s: %s", + file_path, + entry.symbol, + exc, + ) continue if head_range is None or wt_range is None: continue # symbol absent at one side — not a cosmetic case @@ -200,8 +203,8 @@ def _enrich_with_cosmetic_hints( if wt_start <= 0 or wt_end < wt_start: continue - head_slice = "\n".join(head_lines[head_start - 1:head_end]) - wt_slice = "\n".join(wt_lines[wt_start - 1:wt_end]) + head_slice = 
"\n".join(head_lines[head_start - 1 : head_end]) + wt_slice = "\n".join(wt_lines[wt_start - 1 : wt_end]) if not head_slice or not wt_slice: continue if head_slice == wt_slice: diff --git a/handlers/gap_judge.py b/handlers/gap_judge.py index ba32a52c..15026ca5 100644 --- a/handlers/gap_judge.py +++ b/handlers/gap_judge.py @@ -28,7 +28,7 @@ from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from contracts import ( DecisionMatch, @@ -291,7 +291,7 @@ async def handle_judge_gaps( return GapJudgmentPayload( topic=topic, - as_of=datetime.now(timezone.utc).isoformat(), + as_of=datetime.now(UTC).isoformat(), decisions=context_decisions, phrasing_gaps=phrasing_gaps, rubric=_build_rubric(), diff --git a/handlers/history.py b/handlers/history.py index 67c96071..852af509 100644 --- a/handlers/history.py +++ b/handlers/history.py @@ -50,7 +50,6 @@ def _slugify(name: str) -> str: return slug.strip("-") or "uncategorized" - def _decision_status_for_history( decision_status: str, has_code_regions: bool, @@ -86,14 +85,16 @@ def _row_to_history_decision( if not r: continue symbol = r.get("symbol") or r.get("symbol_name") or None - fulfillments.append(HistoryFulfillment( - file_path=str(r.get("file_path") or ""), - symbol=symbol, - start_line=int(r.get("start_line") or 0), - end_line=int(r.get("end_line") or 0), - baseline_hash=r.get("content_hash") or None, - current_hash=r.get("content_hash") or None, - )) + fulfillments.append( + HistoryFulfillment( + file_path=str(r.get("file_path") or ""), + symbol=symbol, + start_line=int(r.get("start_line") or 0), + end_line=int(r.get("end_line") or 0), + baseline_hash=r.get("content_hash") or None, + current_hash=r.get("content_hash") or None, + ) + ) # Source spans → HistorySource list # get_all_decisions returns source_excerpt + meeting_date extracted from first span. 
@@ -111,13 +112,15 @@ def _row_to_history_decision( raw_type = str(span.get("source_type") or row.get("source_type") or "manual") speakers = span.get("speakers") or [] speaker = speakers[0] if speakers else None - sources.append(HistorySource( - source_ref=str(span.get("source_ref") or row.get("source_ref") or ""), - source_type=_normalize_source_type(raw_type), # type: ignore[arg-type] - date=str(span.get("meeting_date") or row.get("meeting_date") or ""), - speaker=speaker, - quote=text, - )) + sources.append( + HistorySource( + source_ref=str(span.get("source_ref") or row.get("source_ref") or ""), + source_type=_normalize_source_type(raw_type), # type: ignore[arg-type] + date=str(span.get("meeting_date") or row.get("meeting_date") or ""), + speaker=speaker, + quote=text, + ) + ) else: # Fallback: build a single source from denormalized columns source_excerpt = str(row.get("source_excerpt") or "") @@ -125,13 +128,15 @@ def _row_to_history_decision( source_type = str(row.get("source_type") or "manual") meeting_date = str(row.get("meeting_date") or "") if source_excerpt or source_ref: - sources.append(HistorySource( - source_ref=source_ref, - source_type=_normalize_source_type(source_type), # type: ignore[arg-type] - date=meeting_date, - speaker=None, - quote=source_excerpt or description, - )) + sources.append( + HistorySource( + source_ref=source_ref, + source_type=_normalize_source_type(source_type), # type: ignore[arg-type] + date=meeting_date, + speaker=None, + quote=source_excerpt or description, + ) + ) drift_evidence: str | None = row.get("drift_evidence") or None signoff: dict | None = row.get("signoff") or None @@ -212,7 +217,7 @@ async def _fetch_all_decisions_enriched(ledger) -> list[dict]: for row in rows: ca = row.pop("created_at", None) row.setdefault("ingested_at", str(ca)[:24] if ca else "") - for region in (row.get("code_regions") or []): + for region in row.get("code_regions") or []: if region and "symbol_name" in region: region["symbol"] = 
region.pop("symbol_name") @@ -289,13 +294,13 @@ async def handle_history( """ # V1 A3: time the catch-up locally so history can report it. import time as _time - from handlers.sync_middleware import ensure_ledger_synced + from contracts import SyncMetrics + from handlers.sync_middleware import ensure_ledger_synced + _t0 = _time.perf_counter() await ensure_ledger_synced(ctx) - sync_metrics = SyncMetrics( - sync_catchup_ms=round((_time.perf_counter() - _t0) * 1000, 3) - ) + sync_metrics = SyncMetrics(sync_catchup_ms=round((_time.perf_counter() - _t0) * 1000, 3)) ledger = ctx.ledger if hasattr(ledger, "connect"): @@ -326,11 +331,13 @@ async def handle_history( if not decisions: continue - features.append(HistoryFeature( - id=feature_id, - name=feature_name, - decisions=decisions, - )) + features.append( + HistoryFeature( + id=feature_id, + name=feature_name, + decisions=decisions, + ) + ) # Apply feature_filter if feature_filter: @@ -351,8 +358,7 @@ async def handle_history( # Mark decisions whose current compliance verdict came from a feature-branch commit. # Only meaningful for decisions that have a status verdict (reflected/drifted). verifiable_ids = [ - d.id for f in features for d in f.decisions - if d.status in ("reflected", "drifted") + d.id for f in features for d in f.decisions if d.status in ("reflected", "drifted") ] ephemeral_ids = await _fetch_ephemeral_decision_ids(ledger, verifiable_ids) if ephemeral_ids: diff --git a/handlers/ingest.py b/handlers/ingest.py index a6bfb781..449038b9 100644 --- a/handlers/ingest.py +++ b/handlers/ingest.py @@ -7,6 +7,7 @@ from __future__ import annotations import logging +from datetime import UTC from contracts import ( ContextForCandidate, @@ -72,13 +73,15 @@ def _normalize_payload(payload: dict) -> dict: # committed to them, no code implements them. signoff.discovered=true # marks them as AI-discovered so consumers can distinguish them from # explicitly ingested decisions without a description prefix hack. 
- mappings.append({ - "intent": q, - "span": {**source_meta, "text": ""}, - "symbols": [], - "code_regions": [], - "signoff": {"state": "proposed", "discovered": True}, - }) + mappings.append( + { + "intent": q, + "span": {**source_meta, "text": ""}, + "symbols": [], + "code_regions": [], + "signoff": {"state": "proposed", "discovered": True}, + } + ) if not mappings: logger.warning( @@ -191,12 +194,14 @@ async def _find_context_for_candidates( if pair in seen_pairs: continue seen_pairs.add(pair) - candidates.append(ContextForCandidate( - span_id=span_id, - decision_id=decision_id, - decision_description=m.get("description", ""), - overlap_score=float(m.get("overlap_score", 0.0)), - )) + candidates.append( + ContextForCandidate( + span_id=span_id, + decision_id=decision_id, + decision_description=m.get("description", ""), + overlap_score=float(m.get("overlap_score", 0.0)), + ) + ) if len(candidates) >= top_k: return candidates except Exception as exc: @@ -229,14 +234,16 @@ async def handle_ingest( if span.get("source_type") in _SESSION_SOURCE_TYPES and not span.get("speakers"): if _git_email_cache is None: from events.writer import _get_git_email + _git_email_cache = _get_git_email(ctx.repo_path) if _git_email_cache and _git_email_cache != "unknown": span["speakers"] = [_git_email_cache] payload = ctx.code_graph.resolve_symbols(payload) - from datetime import datetime, timezone - _now_iso = datetime.now(timezone.utc).isoformat() + from datetime import datetime + + _now_iso = datetime.now(UTC).isoformat() _session_id = getattr(ctx, "session_id", None) or "" # v0.7.0: every new ingest enters as 'proposed' by default. @@ -262,7 +269,10 @@ async def handle_ingest( "(HEAD=%s); baseline hashes will be stamped against %s so the " "ledger stays branch-independent. 
Switch to %s if you want " "baselines pinned to the current working tree.", - authoritative_ref, head_sha[:8], authoritative_ref, authoritative_ref, + authoritative_ref, + head_sha[:8], + authoritative_ref, + authoritative_ref, ) # v0.4.8: writes always invalidate the within-call sync cache. In the @@ -272,6 +282,7 @@ async def handle_ingest( # then writes would leave a stale cache covering post-write reads. try: from handlers.link_commit import handle_link_commit, invalidate_sync_cache + invalidate_sync_cache(ctx) except Exception: pass @@ -305,6 +316,7 @@ async def handle_ingest( topics = _derive_topics(payload) if topics: from handlers.gap_judge import handle_judge_gaps + for topic in topics: jp = await handle_judge_gaps(ctx, topic=topic) if jp is not None: @@ -314,7 +326,9 @@ async def handle_ingest( judgment_payload = judgment_payloads[0] if judgment_payloads else None cursor_summary = None - source_type = str(((payload.get("mappings") or [{}])[0].get("span") or {}).get("source_type", "manual")) + source_type = str( + ((payload.get("mappings") or [{}])[0].get("span") or {}).get("source_type", "manual") + ) last_source_ref = _derive_last_source_ref(payload) if hasattr(ledger, "upsert_source_cursor"): cursor_row = await ledger.upsert_source_cursor( @@ -370,8 +384,7 @@ async def handle_ingest( for d in result.get("created_decisions", []) ], pending_grounding_decisions=[ - d for d in result.get("ungrounded_decisions", []) - if d.get("decision_level") != "L1" + d for d in result.get("ungrounded_decisions", []) if d.get("decision_level") != "L1" ], context_for_candidates=context_for_candidates, source_cursor=cursor_summary, @@ -382,6 +395,7 @@ async def handle_ingest( try: from dashboard.server import notify_dashboard + await notify_dashboard(ctx) except Exception: pass diff --git a/handlers/link_commit.py b/handlers/link_commit.py index 9c37b06f..56c8509b 100644 --- a/handlers/link_commit.py +++ b/handlers/link_commit.py @@ -109,6 +109,7 @@ def 
_build_verification_instruction( parts.append(_GROUNDING_INSTRUCTION_RELOCATION) return "".join(parts) + logger = logging.getLogger(__name__) @@ -125,6 +126,7 @@ def _read_current_head_sha(repo_path: str) -> str: """ try: import subprocess + result = subprocess.run( ["git", "rev-parse", "HEAD"], cwd=repo_path, @@ -230,6 +232,7 @@ def invalidate_sync_cache(ctx) -> None: sync_state.pop("last_sync_response", None) sync_state.pop("pending_flow_id", None) from handlers.sync_middleware import invalidate_process_cache + invalidate_process_cache() @@ -252,7 +255,8 @@ async def handle_link_commit(ctx, commit_hash: str = "HEAD") -> LinkCommitRespon try: if hasattr(ctx.ledger, "backfill_empty_hashes"): await ctx.ledger.backfill_empty_hashes( - ctx.repo_path, drift_analyzer=ctx.drift_analyzer, + ctx.repo_path, + drift_analyzer=ctx.drift_analyzer, ) except Exception as exc: logger.warning("[link_commit] backfill failed: %s", exc) @@ -281,9 +285,7 @@ async def handle_link_commit(ctx, commit_hash: str = "HEAD") -> LinkCommitRespon has_action_items = bool(pending) or bool(pending_grounding_raw) verification_text = ( - _build_verification_instruction(pending, pending_grounding_raw) - if has_action_items - else "" + _build_verification_instruction(pending, pending_grounding_raw) if has_action_items else "" ) is_ephemeral = _is_ephemeral_commit( @@ -318,6 +320,7 @@ async def handle_link_commit(ctx, commit_hash: str = "HEAD") -> LinkCommitRespon try: from dashboard.server import notify_dashboard + await notify_dashboard(ctx) except Exception: pass diff --git a/handlers/preflight.py b/handlers/preflight.py index ec2ac5ec..147b65b7 100644 --- a/handlers/preflight.py +++ b/handlers/preflight.py @@ -31,20 +31,16 @@ import logging import os import time -from datetime import datetime, timezone from pathlib import Path from contracts import ( - ActionHint, BriefDecision, - BriefDivergence, - BriefGap, CodeRegionSummary, DecisionMatch, PreflightResponse, ) -from handlers.analysis import 
_to_brief_decision from handlers.action_hints import generate_hints_from_findings +from handlers.analysis import _to_brief_decision logger = logging.getLogger(__name__) @@ -76,22 +72,86 @@ def _should_show_product_stage() -> bool: except Exception: return False -_GENERIC_TOPICS = frozenset({ - "code", "project", "everything", "anything", "stuff", - "thing", "things", "feature", "features", "system", - "module", "function", "method", -}) -_STOPWORDS = frozenset({ - "the", "and", "for", "that", "this", "with", "are", "from", "have", - "will", "when", "then", "been", "also", "into", "about", "should", - "must", "need", "each", "they", "their", "there", "which", "where", - "what", "than", "some", "more", "such", "only", "very", "just", - "like", "make", "made", "use", "used", "using", "after", "before", - "over", "under", "between", "through", "against", "implement", - "build", "create", "modify", "refactor", "update", "change", "fix", - "edit", "remove", "delete", -}) +_GENERIC_TOPICS = frozenset( + { + "code", + "project", + "everything", + "anything", + "stuff", + "thing", + "things", + "feature", + "features", + "system", + "module", + "function", + "method", + } +) + +_STOPWORDS = frozenset( + { + "the", + "and", + "for", + "that", + "this", + "with", + "are", + "from", + "have", + "will", + "when", + "then", + "been", + "also", + "into", + "about", + "should", + "must", + "need", + "each", + "they", + "their", + "there", + "which", + "where", + "what", + "than", + "some", + "more", + "such", + "only", + "very", + "just", + "like", + "make", + "made", + "use", + "used", + "using", + "after", + "before", + "over", + "under", + "between", + "through", + "against", + "implement", + "build", + "create", + "modify", + "refactor", + "update", + "change", + "fix", + "edit", + "remove", + "delete", + } +) def _content_tokens(text: str) -> set[str]: @@ -99,6 +159,7 @@ def _content_tokens(text: str) -> set[str]: shape but with implementation verbs added to the stopword set 
so 'implement Stripe webhook' yields ['stripe', 'webhook'].""" import re + raw = re.findall(r"[A-Za-z]{4,}", text or "") return {t.lower() for t in raw if t.lower() not in _STOPWORDS} @@ -153,16 +214,26 @@ def _check_dedup(ctx, topic: str) -> bool: async def _region_anchored_preflight( ctx, file_paths: list[str], -) -> list[DecisionMatch]: +) -> tuple[list[DecisionMatch], bool]: """file_paths (caller-supplied) → decisions pinned to those regions. The caller LLM is responsible for resolving which files a proposed change will touch — preflight then looks up decisions pinned to those files in - the ledger. Returns DecisionMatch objects with confidence=0.9 (direct - pin, not keyword match). + the ledger. Before the lookup, run a 1-hop code-graph expansion via the + code-locator adapter (#173): caller-LLM discovery is imprecise, and a + decision bound to ``app/src/lib/git/reorder.ts`` should still surface + when the caller passes the structurally-near ``app/src/ui/multi-commit- + operation/reorder.tsx``. Expansion is deterministic, no LLM in the path, + bounded by ``code_locator/config.py::max_neighbors_per_result``. + + Returns ``(matches, expanded)`` where ``expanded`` is True iff the graph + expansion produced extra paths beyond the caller-supplied set, so the + caller can record ``"graph"`` in ``sources_chained``. Direct-pin matches + carry ``confidence=0.9``; matches surfaced only via expanded paths carry + ``confidence=0.7``. """ if not file_paths: - return [] + return [], False # Dedup + normalize while preserving caller-supplied order. seen_paths: set[str] = set() @@ -173,16 +244,34 @@ async def _region_anchored_preflight( seen_paths.add(fp) ordered.append(fp) if not ordered: - return [] + return [], False + + # Graph expansion. Defensive: code_graph may be absent (mock contexts) or + # the adapter may not implement the method (older deployments). Either + # case falls back to direct file_paths only. 
+ direct_paths: set[str] = set(ordered) + expanded_paths = list(ordered) + expanded_only_paths: set[str] = set() + code_graph = getattr(ctx, "code_graph", None) + expander = getattr(code_graph, "expand_file_paths_via_graph", None) if code_graph else None + if expander is not None: + try: + expanded_paths, added_paths = expander(ordered, hops=1) + expanded_only_paths = set(added_paths) + except Exception as exc: + logger.debug("[preflight:region] graph expansion failed: %s", exc) + expanded_paths = list(ordered) + expanded_only_paths = set() try: - raw = await ctx.ledger.get_decisions_for_files(ordered) + raw = await ctx.ledger.get_decisions_for_files(expanded_paths) except Exception as exc: logger.debug("[preflight:region] ledger region lookup failed: %s", exc) - return [] + return [], False matches: list[DecisionMatch] = [] seen_ids: set[str] = set() + surfaced_via_expansion = False for d in raw: did = d.get("decision_id", "") if did in seen_ids: @@ -191,35 +280,55 @@ async def _region_anchored_preflight( region_dict = d.get("code_region") regions = [] if region_dict: - regions = [CodeRegionSummary( - file_path=region_dict.get("file_path", ""), - symbol=region_dict.get("symbol", ""), - lines=tuple(region_dict.get("lines", (0, 0))), - purpose=region_dict.get("purpose", ""), - )] + regions = [ + CodeRegionSummary( + file_path=region_dict.get("file_path", ""), + symbol=region_dict.get("symbol", ""), + lines=tuple(region_dict.get("lines", (0, 0))), + purpose=region_dict.get("purpose", ""), + ) + ] status = str(d.get("status") or "ungrounded") if status not in ("reflected", "drifted", "pending", "ungrounded"): status = "ungrounded" if not regions else "pending" - _sf = d.get("signoff") or {} - matches.append(DecisionMatch( - decision_id=d.get("decision_id", ""), - description=d.get("description", ""), - status=status, - signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), - confidence=0.9, - source_ref=d.get("source_ref", ""), - code_regions=regions, - 
drift_evidence="", - related_constraints=[], - source_excerpt=d.get("source_excerpt", ""), - meeting_date=d.get("meeting_date", ""), - signoff=d.get("signoff"), - )) + # Provenance: a decision is "directly pinned" if any of its bound + # code_regions live in a caller-supplied path; otherwise it was only + # reached via 1-hop graph expansion. Caller can de-prioritize the + # latter (lower confidence) without losing recall. + bound_paths = { + (r.get("file_path") or "").strip() + for r in (d.get("code_regions") or []) + if r and (r.get("file_path") or "").strip() + } + # Single-region decisions also have a top-level ``code_region`` (used + # above); include it in the provenance check. + if region_dict and (region_dict.get("file_path") or "").strip(): + bound_paths.add(region_dict["file_path"].strip()) + is_direct = bool(bound_paths & direct_paths) if bound_paths else not expanded_only_paths + if not is_direct: + surfaced_via_expansion = True - return matches + _sf = d.get("signoff") or {} + matches.append( + DecisionMatch( + decision_id=d.get("decision_id", ""), + description=d.get("description", ""), + status=status, + signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), + confidence=0.9 if is_direct else 0.7, + source_ref=d.get("source_ref", ""), + code_regions=regions, + drift_evidence="", + related_constraints=[], + source_excerpt=d.get("source_excerpt", ""), + meeting_date=d.get("meeting_date", ""), + signoff=d.get("signoff"), + ) + ) + return matches, surfaced_via_expansion async def handle_preflight( @@ -233,7 +342,10 @@ async def handle_preflight( # Explicit mute via env var — one-line off-switch for the session. if os.getenv("BICAMERAL_PREFLIGHT_MUTE", "").strip().lower() in ( - "1", "true", "yes", "on", + "1", + "true", + "yes", + "on", ): return PreflightResponse( topic=topic, @@ -254,13 +366,13 @@ async def handle_preflight( # V1 A3: time the call locally so the metric reflects THIS handler's catch-up. 
import time as _time - from handlers.sync_middleware import ensure_ledger_synced + from contracts import SyncMetrics + from handlers.sync_middleware import ensure_ledger_synced + _t0 = _time.perf_counter() await ensure_ledger_synced(ctx) - sync_metrics = SyncMetrics( - sync_catchup_ms=round((_time.perf_counter() - _t0) * 1000, 3) - ) + sync_metrics = SyncMetrics(sync_catchup_ms=round((_time.perf_counter() - _t0) * 1000, 3)) sources_chained: list[str] = [] @@ -271,9 +383,11 @@ async def handle_preflight( region_matches: list[DecisionMatch] = [] if file_paths: try: - region_matches = await _region_anchored_preflight(ctx, file_paths) + region_matches, used_graph_expansion = await _region_anchored_preflight(ctx, file_paths) if region_matches: sources_chained.append("region") + if used_graph_expansion: + sources_chained.append("graph") except Exception as exc: logger.debug("[preflight] region lookup failed: %s", exc) @@ -285,28 +399,33 @@ async def handle_preflight( context_pending_ready: list[BriefDecision] = [] try: from ledger.queries import get_collision_pending_decisions, get_context_for_ready_decisions + inner = getattr(ctx.ledger, "_inner", ctx.ledger) client = inner._client coll_rows = await get_collision_pending_decisions(client) for r in coll_rows: _sf = r.get("signoff") or {} - unresolved_collisions.append(BriefDecision( - decision_id=r["decision_id"], - description=r["description"], - status=r.get("status") or "ungrounded", - signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), - signoff=r.get("signoff"), - )) + unresolved_collisions.append( + BriefDecision( + decision_id=r["decision_id"], + description=r["description"], + status=r.get("status") or "ungrounded", + signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), + signoff=r.get("signoff"), + ) + ) ctx_rows = await get_context_for_ready_decisions(client) for r in ctx_rows: _sf = r.get("signoff") or {} - context_pending_ready.append(BriefDecision( - 
decision_id=r["decision_id"], - description=r["description"], - status=r.get("status") or "ungrounded", - signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), - signoff=r.get("signoff"), - )) + context_pending_ready.append( + BriefDecision( + decision_id=r["decision_id"], + description=r["description"], + status=r.get("status") or "ungrounded", + signoff_state=(_sf.get("state") if isinstance(_sf, dict) else None), + signoff=r.get("signoff"), + ) + ) except Exception as exc: logger.debug("[preflight] HITL annotation queries failed: %s", exc) diff --git a/handlers/ratify.py b/handlers/ratify.py index cf8a7c4a..32594690 100644 --- a/handlers/ratify.py +++ b/handlers/ratify.py @@ -10,13 +10,15 @@ No unratify. Rescinding ratification or rejection requires writing a new decision that supersedes the previous one — clean audit trail, no rollback. """ + from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from contracts import RatifyResponse from ledger.queries import decision_exists, project_decision_status + # triage-adapt: dropped preflight_telemetry import from auto-merge — module # is on dev (#65 preflight telemetry) but not on triage; the cherry-picked # body doesn't actually reference it (intent of e6d4b8f for this file is @@ -62,7 +64,11 @@ async def handle_ratify( ) existing_signoff = (rows[0].get("signoff") if rows else None) or None - if existing_signoff and isinstance(existing_signoff, dict) and existing_signoff.get("state") == target_state: + if ( + existing_signoff + and isinstance(existing_signoff, dict) + and existing_signoff.get("state") == target_state + ): projected = await project_decision_status(client, decision_id) return RatifyResponse( decision_id=decision_id, @@ -73,7 +79,7 @@ async def handle_ratify( head_ref = getattr(ctx, "authoritative_sha", "") or "" session_id = getattr(ctx, "session_id", None) or "" - now_iso = datetime.now(timezone.utc).isoformat() + 
now_iso = datetime.now(UTC).isoformat() if action == "ratify": signoff = { @@ -100,7 +106,10 @@ async def handle_ratify( logger.info( "[ratify] decision=%s action=%s signer=%s projected_status=%s", - decision_id, action, signer, projected, + decision_id, + action, + signer, + projected, ) return RatifyResponse( diff --git a/handlers/reset.py b/handlers/reset.py index 1b3de739..2814ddb1 100644 --- a/handlers/reset.py +++ b/handlers/reset.py @@ -48,7 +48,11 @@ async def handle_reset( ledger = ctx.ledger if hasattr(ledger, "connect"): await ledger.connect() - if confirm and hasattr(ledger, "force_migrate") and getattr(ledger, "_pending_destructive", None): + if ( + confirm + and hasattr(ledger, "force_migrate") + and getattr(ledger, "_pending_destructive", None) + ): await ledger.force_migrate() cursors = await _get_cursors(ledger, ctx.repo_path) @@ -68,7 +72,11 @@ async def handle_reset( if not confirm: if wipe_mode == "full": - dir_desc = f" and the entire .bicameral/ directory at {bicameral_dir!r}" if bicameral_dir else "" + dir_desc = ( + f" and the entire .bicameral/ directory at {bicameral_dir!r}" + if bicameral_dir + else "" + ) next_action = ( f"DRY RUN — FULL WIPE. Would delete {cursors_before} source_cursor row(s), " f"every bicameral node/edge scoped to {ctx.repo_path!r}{dir_desc}. " @@ -95,6 +103,7 @@ async def handle_reset( # Invalidate within-call sync cache before any destructive operation. 
try: from handlers.link_commit import invalidate_sync_cache + invalidate_sync_cache(ctx) except Exception: pass @@ -123,7 +132,10 @@ async def handle_reset( logger.info( "[reset] wipe_mode=%s, wiped %d source_cursor(s) for repo=%s bicameral_dir=%r", - wipe_mode, cursors_before, ctx.repo_path, bicameral_dir, + wipe_mode, + cursors_before, + ctx.repo_path, + bicameral_dir, ) if wipe_mode == "full": @@ -165,15 +177,14 @@ async def _wipe_ledger(ledger, repo_path: str) -> None: inner = getattr(ledger, "_inner", ledger) client = getattr(inner, "_client", None) if client is None: - raise RuntimeError( - "reset: ledger adapter does not expose wipe_all_rows or an inner client" - ) + raise RuntimeError("reset: ledger adapter does not expose wipe_all_rows or an inner client") import shutil + url = getattr(inner, "_url", "") await client.close() inner._connected = False if url.startswith("surrealkv://"): - db_path = url[len("surrealkv://"):] + db_path = url[len("surrealkv://") :] if db_path: shutil.rmtree(db_path, ignore_errors=True) await inner._ensure_connected() @@ -219,7 +230,7 @@ def _resolve_bicameral_dir(ledger) -> str: continue url = getattr(obj, "_url", "") if url.startswith("surrealkv://"): - db_path = url[len("surrealkv://"):] + db_path = url[len("surrealkv://") :] if db_path: return str(Path(db_path).expanduser().parent) return "" @@ -251,4 +262,5 @@ def _resolve_ledger_url(ctx, ledger) -> str: if v: return str(v) import os + return os.environ.get("SURREAL_URL", "") diff --git a/handlers/resolve_collision.py b/handlers/resolve_collision.py index eb739b3f..57730514 100644 --- a/handlers/resolve_collision.py +++ b/handlers/resolve_collision.py @@ -21,7 +21,7 @@ from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from contracts import ResolveCollisionResponse from ledger.queries import ( @@ -39,7 +39,7 @@ async def handle_resolve_collision( # Collision mode params new_id: str | None = None, 
old_id: str | None = None, - action: str | None = None, # 'supersede' | 'keep_both' + action: str | None = None, # 'supersede' | 'keep_both' # Context-for mode params span_id: str | None = None, decision_id: str | None = None, @@ -54,14 +54,16 @@ async def handle_resolve_collision( client = inner._client _session_id = getattr(ctx, "session_id", None) or "" - _now_iso = datetime.now(timezone.utc).isoformat() + _now_iso = datetime.now(UTC).isoformat() # ── Collision mode ──────────────────────────────────────────────────── if action is not None: if not new_id or not old_id: raise ValueError("collision mode requires new_id and old_id") if action not in ("supersede", "keep_both", "link_parent"): - raise ValueError(f"action must be 'supersede', 'keep_both', or 'link_parent', got {action!r}") + raise ValueError( + f"action must be 'supersede', 'keep_both', or 'link_parent', got {action!r}" + ) if not await decision_exists(client, new_id): raise ValueError(f"No decision row for new_id={new_id}") @@ -85,9 +87,7 @@ async def handle_resolve_collision( ) old_status = result.get("old_status", "superseded") - logger.info( - "[resolve_collision] supersede: %s supersedes %s", new_id, old_id - ) + logger.info("[resolve_collision] supersede: %s supersedes %s", new_id, old_id) elif action == "link_parent": # Cross-level parent-child link: write parent_decision_id on the child (new_id). 
@@ -116,9 +116,7 @@ async def handle_resolve_collision( else: # keep_both old_status = "" - logger.info( - "[resolve_collision] keep_both: %s and %s both remain", new_id, old_id - ) + logger.info("[resolve_collision] keep_both: %s and %s both remain", new_id, old_id) # Clear collision_pending on new decision so it enters normal flow _proposed_signoff = { @@ -150,7 +148,9 @@ async def handle_resolve_collision( state = "confirmed" if confirmed else "rejected" await relate_context_for( - client, span_id, decision_id, + client, + span_id, + decision_id, state=state, relevance_score=0.0, reason=f"human-{state} via resolve_collision session={_session_id}", @@ -158,7 +158,9 @@ async def handle_resolve_collision( logger.info( "[resolve_collision] context_for: span=%s decision=%s state=%s", - span_id, decision_id, state, + span_id, + decision_id, + state, ) return ResolveCollisionResponse( diff --git a/handlers/resolve_compliance.py b/handlers/resolve_compliance.py index 7e16beae..cfac4439 100644 --- a/handlers/resolve_compliance.py +++ b/handlers/resolve_compliance.py @@ -21,10 +21,11 @@ A missing or mismatched flow_id logs a warning (stale/orphaned call). This will become a hard error once the codebase fully migrates to flow_id usage. """ + from __future__ import annotations import logging -from typing import Iterable +from collections.abc import Iterable from contracts import ( ComplianceVerdict, @@ -80,9 +81,7 @@ async def handle_resolve_compliance( last-verdict-wins caveat from v0.4.x). 
""" if phase not in _VALID_PHASES: - raise ValueError( - f"Unknown phase {phase!r} — must be one of {sorted(_VALID_PHASES)}" - ) + raise ValueError(f"Unknown phase {phase!r} — must be one of {sorted(_VALID_PHASES)}") sync_state = getattr(ctx, "_sync_state", None) is_ephemeral = False @@ -92,7 +91,8 @@ async def handle_resolve_compliance( logger.warning( "[resolve_compliance] flow_id mismatch: expected %s, got %s — " "verdicts may be stale or from a different link_commit call", - expected_flow_id[:8], (flow_id or "missing")[:8], + expected_flow_id[:8], + (flow_id or "missing")[:8], ) elif expected_flow_id and not flow_id: logger.warning( @@ -117,21 +117,25 @@ async def handle_resolve_compliance( for v in parsed: if not await decision_exists(client, v.decision_id): - rejected.append(ResolveComplianceRejection( - decision_id=v.decision_id, - region_id=v.region_id, - reason="unknown_decision_id", - detail=f"no decision row for {v.decision_id}", - )) + rejected.append( + ResolveComplianceRejection( + decision_id=v.decision_id, + region_id=v.region_id, + reason="unknown_decision_id", + detail=f"no decision row for {v.decision_id}", + ) + ) continue if not await region_exists(client, v.region_id): - rejected.append(ResolveComplianceRejection( - decision_id=v.decision_id, - region_id=v.region_id, - reason="unknown_region_id", - detail=f"no code_region row for {v.region_id}", - )) + rejected.append( + ResolveComplianceRejection( + decision_id=v.decision_id, + region_id=v.region_id, + reason="unknown_region_id", + detail=f"no code_region row for {v.region_id}", + ) + ) continue is_pruned = v.verdict == "not_relevant" @@ -145,7 +149,8 @@ async def handle_resolve_compliance( except Exception as exc: logger.warning( "[resolve_compliance] promote_ephemeral_verdict failed for %s: %s", - v.decision_id, exc, + v.decision_id, + exc, ) await upsert_compliance_check( @@ -169,12 +174,14 @@ async def handle_resolve_compliance( affected_decision_ids.add(v.decision_id) - 
accepted.append(ResolveComplianceAccepted( - decision_id=v.decision_id, - region_id=v.region_id, - phase=phase, - verdict=v.verdict, - )) + accepted.append( + ResolveComplianceAccepted( + decision_id=v.decision_id, + region_id=v.region_id, + phase=phase, + verdict=v.verdict, + ) + ) # Sync code_region.content_hash to the verdict hash for every accepted verdict. # project_decision_status looks up verdicts by (decision_id, region_id, @@ -187,7 +194,9 @@ async def handle_resolve_compliance( try: await update_region_hash(client, v.region_id, v.content_hash) except Exception as exc: - logger.warning("[resolve_compliance] update_region_hash failed for %s: %s", v.region_id, exc) + logger.warning( + "[resolve_compliance] update_region_hash failed for %s: %s", v.region_id, exc + ) # v0.5.0: holistic status projection after the full batch is written. # Replaces the per-verdict last-verdict-wins update from v0.4.x. @@ -197,11 +206,15 @@ async def handle_resolve_compliance( logger.info( "[resolve_compliance] phase=%s accepted=%d rejected=%d commit=%s", - phase, len(accepted), len(rejected), (commit_hash or "")[:8] or "n/a", + phase, + len(accepted), + len(rejected), + (commit_hash or "")[:8] or "n/a", ) try: from dashboard.server import notify_dashboard + await notify_dashboard(ctx) except Exception: pass diff --git a/handlers/search_decisions.py b/handlers/search_decisions.py index c85d3e13..8913e5b6 100644 --- a/handlers/search_decisions.py +++ b/handlers/search_decisions.py @@ -8,7 +8,13 @@ import time -from contracts import CodeRegionSummary, DecisionMatch, LinkCommitResponse, SearchDecisionsResponse, SyncMetrics +from contracts import ( + CodeRegionSummary, + DecisionMatch, + LinkCommitResponse, + SearchDecisionsResponse, + SyncMetrics, +) from handlers.action_hints import generate_hints_for_search from handlers.link_commit import handle_link_commit @@ -29,7 +35,9 @@ async def handle_search_decisions( sync_status: LinkCommitResponse = await handle_link_commit(ctx, "HEAD") 
catchup_ms = round((time.perf_counter() - t0) * 1000, 3) - raw_matches = await ctx.ledger.search_by_query(query, max_results=max_results, min_confidence=min_confidence) + raw_matches = await ctx.ledger.search_by_query( + query, max_results=max_results, min_confidence=min_confidence + ) matches: list[DecisionMatch] = [] suggested_review: list[str] = [] @@ -58,20 +66,22 @@ async def handle_search_decisions( suggested_review.append(m["decision_id"]) _signoff = m.get("signoff") or {} - matches.append(DecisionMatch( - decision_id=m["decision_id"], - description=m["description"], - status=status, - signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), - confidence=m.get("confidence", 0.5), - source_ref=m.get("source_ref", ""), - code_regions=regions, - drift_evidence=m.get("drift_evidence", ""), - related_constraints=m.get("related_constraints", []), - source_excerpt=m.get("source_excerpt", ""), - meeting_date=m.get("meeting_date", ""), - signoff=m.get("signoff"), - )) + matches.append( + DecisionMatch( + decision_id=m["decision_id"], + description=m["description"], + status=status, + signoff_state=(_signoff.get("state") if isinstance(_signoff, dict) else None), + confidence=m.get("confidence", 0.5), + source_ref=m.get("source_ref", ""), + code_regions=regions, + drift_evidence=m.get("drift_evidence", ""), + related_constraints=m.get("related_constraints", []), + source_excerpt=m.get("source_excerpt", ""), + meeting_date=m.get("meeting_date", ""), + signoff=m.get("signoff"), + ) + ) ungrounded_count = sum(1 for m in matches if m.status == "ungrounded") @@ -83,7 +93,8 @@ async def handle_search_decisions( suggested_review=suggested_review, ) response.action_hints = generate_hints_for_search( - response, guided_mode=getattr(ctx, "guided_mode", False), + response, + guided_mode=getattr(ctx, "guided_mode", False), ) response.sync_metrics = SyncMetrics(sync_catchup_ms=catchup_ms) return response diff --git a/handlers/sync_middleware.py 
b/handlers/sync_middleware.py index 9d582b41..52e376d5 100644 --- a/handlers/sync_middleware.py +++ b/handlers/sync_middleware.py @@ -17,7 +17,7 @@ import logging import time from contextlib import asynccontextmanager -from datetime import datetime, timezone +from datetime import UTC, datetime from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -30,7 +30,6 @@ _LAST_SYNCED_SHA: str | None = None - # ── V1 A2-light: per-repo write barrier ───────────────────────────────── # Module-level registry of per-repo asyncio.Locks. Serializes mutating # handlers against the same repo inside a single MCP server process. @@ -95,6 +94,7 @@ class BarrierTiming: Handlers read it after the ``async with`` block to attach the number to their ``SyncMetrics`` response field. """ + __slots__ = ("held_ms",) def __init__(self) -> None: @@ -129,7 +129,7 @@ def _reset_repo_locks_for_tests() -> None: _BANNER_MAX_ITEMS = 10 -async def get_session_start_banner(ctx) -> "SessionStartBanner | None": +async def get_session_start_banner(ctx) -> SessionStartBanner | None: """Return open-decision summary for session start, or None if nothing actionable. Fires exactly once per session (keyed on ctx._sync_state["session_started"]). 
@@ -150,17 +150,14 @@ async def get_session_start_banner(ctx) -> "SessionStartBanner | None": except Exception: return None - now = datetime.now(timezone.utc) + now = datetime.now(UTC) drifted_rows = [r for r in rows if r.get("status") == "drifted"] - proposal_rows = [ - r for r in rows - if (r.get("signoff") or {}).get("state") == "proposed" - ] + proposal_rows = [r for r in rows if (r.get("signoff") or {}).get("state") == "proposed"] real_ungrounded_rows = [ - r for r in rows - if r.get("status") == "ungrounded" - and (r.get("signoff") or {}).get("state") != "proposed" + r + for r in rows + if r.get("status") == "ungrounded" and (r.get("signoff") or {}).get("state") != "proposed" ] stale_proposals = [] @@ -191,13 +188,15 @@ async def get_session_start_banner(ctx) -> "SessionStartBanner | None": items = [] for r in visible: signoff = r.get("signoff") or {} - items.append({ - "decision_id": r.get("decision_id", r.get("id", "")), - "description": r.get("description", ""), - "status": r.get("status", ""), - "signoff_state": signoff.get("state"), - "source_ref": r.get("source_ref", ""), - }) + items.append( + { + "decision_id": r.get("decision_id", r.get("id", "")), + "description": r.get("description", ""), + "status": r.get("status", ""), + "signoff_state": signoff.get("state"), + "source_ref": r.get("source_ref", ""), + } + ) parts = [] if drifted_count: @@ -222,7 +221,7 @@ async def get_session_start_banner(ctx) -> "SessionStartBanner | None": ) -async def ensure_ledger_synced(ctx) -> "LinkCommitResponse | None": +async def ensure_ledger_synced(ctx) -> LinkCommitResponse | None: """Sync ledger to HEAD if it has moved since the last sync in this process. 
Returns the LinkCommitResponse when a new commit was processed — callers @@ -232,7 +231,8 @@ async def ensure_ledger_synced(ctx) -> "LinkCommitResponse | None": global _LAST_SYNCED_SHA try: - from handlers.link_commit import handle_link_commit, _read_current_head_sha + from handlers.link_commit import _read_current_head_sha, handle_link_commit + live_head = _read_current_head_sha(getattr(ctx, "repo_path", "") or ".") if live_head and live_head != _LAST_SYNCED_SHA: result = await handle_link_commit(ctx, "HEAD") diff --git a/handlers/update.py b/handlers/update.py index 229c755f..a743b7e2 100644 --- a/handlers/update.py +++ b/handlers/update.py @@ -17,7 +17,7 @@ import sys import time import urllib.request -from typing import Optional +from pathlib import Path logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ def _save_cache(data: dict) -> None: pass -def _fetch_recommended_version() -> Optional[str]: +def _fetch_recommended_version() -> str | None: """Fetch RECOMMENDED_VERSION from GitHub with a 1-hour cache.""" cache = _load_cache() now = time.time() @@ -84,7 +84,7 @@ def get_update_notice(current_version: str) -> dict | None: "action_required": ( f"Ask the user: 'bicameral-mcp v{recommended} is available " f"(you are on v{current_version}) — upgrade now? (yes/no)'. " - "If yes, call bicameral.update {\"action\": \"apply\"}." + 'If yes, call bicameral.update {"action": "apply"}.' 
), } @@ -134,12 +134,12 @@ def _apply_pending_migration(repo_path: str) -> dict: replay_plan: list[dict] (only when migrated=True) error: str (only on failure) """ - import tempfile, os + import os + import tempfile + tmp = None try: - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False - ) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: f.write(_MIGRATION_SCRIPT) tmp = f.name result = subprocess.run( @@ -167,6 +167,7 @@ def _read_guided_from_config(repo_path: str) -> bool: """Return the guided: flag from .bicameral/config.yaml, defaulting to False.""" try: import re + config_path = Path(repo_path) / ".bicameral" / "config.yaml" if not config_path.exists(): return False @@ -191,7 +192,7 @@ def _reinstall_skills(repo_path: str) -> int: f"rp = Path(r'{repo_path}'); " f"n = _install_skills(rp); " f"_install_claude_hooks(rp); " - + (f"_install_git_post_commit_hook(rp); " if guided else "") + + ("_install_git_post_commit_hook(rp); " if guided else "") + "print(n)" ) result = subprocess.run( @@ -249,6 +250,7 @@ async def handle_update(action: str, current_version: str, repo_path: str = "") # and handles externally-managed-environment restrictions on macOS. # Fall back to pip for venv/dev installs. import shutil + if shutil.which("pipx"): cmd = ["pipx", "install", target, "--force"] else: @@ -270,7 +272,9 @@ async def handle_update(action: str, current_version: str, repo_path: str = "") ) # Auto-apply any pending destructive migration using the new binary. 
- migration_result = _apply_pending_migration(repo_path) if repo_path else {"migrated": False} + migration_result = ( + _apply_pending_migration(repo_path) if repo_path else {"migrated": False} + ) if migration_result.get("migrated"): cursors_wiped = migration_result.get("cursors_wiped", 0) replay_plan = migration_result.get("replay_plan", []) diff --git a/handlers/usage_summary.py b/handlers/usage_summary.py index c3ddd69a..8d0bbfb9 100644 --- a/handlers/usage_summary.py +++ b/handlers/usage_summary.py @@ -11,7 +11,7 @@ from __future__ import annotations import logging -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta from local_counters import read_counters @@ -54,7 +54,7 @@ async def handle_usage_summary(ctx, days: int = 7) -> dict: try: ledger = ctx.ledger - cutoff = (datetime.now(timezone.utc) - timedelta(days=period_days)).isoformat() + cutoff = (datetime.now(UTC) - timedelta(days=period_days)).isoformat() client = getattr(getattr(ledger, "_inner", ledger), "_client", None) if client is None: return base @@ -89,9 +89,7 @@ async def handle_usage_summary(ctx, days: int = 7) -> dict: f"WHERE checked_at > <datetime>'{cutoff}' " "AND verdict IN ['drifted', 'cosmetic_autopass'] GROUP BY verdict" ) - cc_counts = { - r.get("verdict"): int(r.get("n", 0)) for r in (cc_rows or []) - } + cc_counts = {r.get("verdict"): int(r.get("n", 0)) for r in (cc_rows or [])} cosmetic = cc_counts.get("cosmetic_autopass", 0) drift_total = cosmetic + cc_counts.get("drifted", 0) if drift_total > 0: diff --git a/ledger/__init__.py b/ledger/__init__.py index de51d781..0217c078 100644 --- a/ledger/__init__.py +++ b/ledger/__init__.py @@ -1,4 +1,5 @@ """Decision Ledger — SurrealDB-backed implementation for Phase 2.""" + from .adapter import SurrealDBLedgerAdapter from .client import LedgerClient diff --git a/ledger/adapter.py b/ledger/adapter.py index bee2b755..ab26eb5e 100644 --- a/ledger/adapter.py +++ b/ledger/adapter.py @@ -16,11 +16,10 @@ 
from .client import LedgerClient from .queries import ( decision_exists, - delete_binds_to_edge, find_subject_identities_for_decision, get_all_decisions, - get_decision_level, get_compliance_verdict, + get_decision_level, get_decisions_for_file, get_decisions_for_files, get_pending_decisions_with_regions, @@ -35,7 +34,6 @@ lookup_vocab_cache, project_decision_status, promote_ephemeral_verdict, - region_exists, relate_binds_to, relate_has_identity, relate_locates, @@ -65,7 +63,6 @@ resolve_ref, ) - _CODE_BODY_LINE_CAP = 200 @@ -100,6 +97,7 @@ def _get_branch_delta_files(authoritative_ref: str, commit_hash: str, repo_path: Returns [] if the command fails or authoritative_ref is unreachable. """ import subprocess as _sp + try: result = _sp.run( ["git", "diff", f"{authoritative_ref}...{commit_hash}", "--name-only"], @@ -206,7 +204,6 @@ async def search_by_query( async def decision_exists(self, decision_id: str) -> bool: await self._ensure_connected() - from .queries import decision_exists return await decision_exists(self._client, decision_id) async def get_decision_description(self, decision_id: str) -> str: @@ -254,7 +251,9 @@ async def bind_decision( raise ValueError(f"upsert_code_region returned empty id for {file_path}:{symbol_name}") await relate_binds_to( - self._client, decision_id, region_id, + self._client, + decision_id, + region_id, confidence=0.95, provenance={"method": "caller_llm"}, ) @@ -307,7 +306,10 @@ async def relate_has_identity( ) -> None: await self._ensure_connected() await relate_has_identity( - self._client, code_subject_id, subject_identity_id, confidence=confidence, + self._client, + code_subject_id, + subject_identity_id, + confidence=confidence, ) async def link_decision_to_subject( @@ -318,7 +320,10 @@ async def link_decision_to_subject( ) -> None: await self._ensure_connected() await link_decision_to_subject( - self._client, decision_id, code_subject_id, confidence=confidence, + self._client, + decision_id, + code_subject_id, + 
confidence=confidence, ) async def find_subject_identities_for_decision( @@ -392,6 +397,7 @@ async def ingest_commit( if drift_analyzer is None: from .drift import HashDriftAnalyzer + drift_analyzer = HashDriftAnalyzer() if commit_hash == "HEAD": @@ -402,6 +408,7 @@ async def ingest_commit( is_authoritative = True if authoritative_ref: import subprocess + try: result = subprocess.run( ["git", "rev-parse", "--abbrev-ref", "HEAD"], @@ -418,7 +425,8 @@ async def ingest_commit( logger.info( "[link_commit] current branch %s != authoritative %s — " "running in read-only mode (no baseline writes)", - current_branch, authoritative_ref, + current_branch, + authoritative_ref, ) state = await get_sync_state(self._client, repo_path) @@ -440,18 +448,22 @@ async def ingest_commit( if not current_hash: continue code_body = _extract_code_body(fp, sl, el, repo_path, ref=commit_hash) - pending_checks.append({ - "phase": "ingest", - "decision_id": str(row.get("decision_id", "")), - "region_id": region_id, - "decision_description": str(row.get("description", "")), - "file_path": fp, - "symbol": row.get("symbol_name", ""), - "content_hash": current_hash, - "code_body": code_body, - }) + pending_checks.append( + { + "phase": "ingest", + "decision_id": str(row.get("decision_id", "")), + "region_id": region_id, + "decision_description": str(row.get("description", "")), + "file_path": fp, + "symbol": row.get("symbol_name", ""), + "content_hash": current_hash, + "code_body": code_body, + } + ) except Exception as exc: - logger.warning("[link_commit] could not surface pending decisions on already_synced: %s", exc) + logger.warning( + "[link_commit] could not surface pending decisions on already_synced: %s", exc + ) # Repair stale ephemeral hashes on authoritative branches. # A feature-branch bind sets code_region.content_hash = H_branch. 
@@ -480,7 +492,7 @@ async def ingest_commit( continue await update_region_hash(self._client, region_id, actual_hash, commit_hash) regions_repaired += 1 - for decision in (region.get("decisions") or []): + for decision in region.get("decisions") or []: if decision is None: continue decision_id = str(decision.get("id", "")) @@ -520,7 +532,8 @@ async def ingest_commit( if range_files is None: logger.warning( "[link_commit] range %s..%s unreachable, falling back to head-only sweep", - last_synced[:8], commit_hash[:8], + last_synced[:8], + commit_hash[:8], ) changed_files = get_changed_files(commit_hash, repo_path) sweep_scope = "head_only" @@ -530,7 +543,8 @@ async def ingest_commit( if len(changed_files) > _MAX_SWEEP_FILES: logger.warning( "[link_commit] range sweep capped at %d files (would have swept %d).", - _MAX_SWEEP_FILES, len(changed_files), + _MAX_SWEEP_FILES, + len(changed_files), ) changed_files = changed_files[:_MAX_SWEEP_FILES] sweep_scope = "range_truncated" @@ -613,10 +627,13 @@ async def ingest_commit( if is_authoritative: await update_region_hash(self._client, region_id, actual_hash, commit_hash) from .status import resolve_symbol_lines + resolved = resolve_symbol_lines(file_path, symbol_name, repo_path, ref=commit_hash) if resolved is None: symbol_disappeared = True - elif resolved[0] != region.get("start_line") or resolved[1] != region.get("end_line"): + elif resolved[0] != region.get("start_line") or resolved[1] != region.get( + "end_line" + ): await self._client.query( f"UPDATE {region_id} SET start_line = $sl, end_line = $el", {"sl": resolved[0], "el": resolved[1]}, @@ -627,7 +644,7 @@ async def ingest_commit( phase = "ingest" if not stored_hash else "drift" # v0.5.0: decisions are accessed via binds_to (renamed from intents via maps_to) - for decision in (region.get("decisions") or []): + for decision in region.get("decisions") or []: if decision is None: continue decision_id = str(decision.get("id", "")) @@ -654,20 +671,25 @@ async def 
ingest_commit( if symbol_disappeared: # L1 decisions are intentionally ungrounded — skip grounding alarm. if decision.get("decision_level") != "L1": - pending_grounding_checks.append({ - "decision_id": decision_id, - "description": str(decision.get("description", "")), - "reason": "symbol_disappeared", - "file_path": file_path, - "symbol": symbol_name, - "original_lines": [start_line, end_line], - }) + pending_grounding_checks.append( + { + "decision_id": decision_id, + "description": str(decision.get("description", "")), + "reason": "symbol_disappeared", + "file_path": file_path, + "symbol": symbol_name, + "original_lines": [start_line, end_line], + } + ) continue verdict: dict | None = None if actual_hash: verdict = await get_compliance_verdict( - self._client, decision_id, region_id, actual_hash, + self._client, + decision_id, + region_id, + actual_hash, ) new_status = derive_status(stored_hash, actual_hash, cached_verdict=verdict) @@ -675,7 +697,9 @@ async def ingest_commit( if is_authoritative: # V2: promote ephemeral verdict when same hash lands on authoritative branch if actual_hash: - await promote_ephemeral_verdict(self._client, decision_id, region_id, actual_hash) + await promote_ephemeral_verdict( + self._client, decision_id, region_id, actual_hash + ) # v0.5.0: holistic status projection from DB projected = await project_decision_status(self._client, decision_id) await update_decision_status(self._client, decision_id, projected) @@ -689,8 +713,12 @@ async def ingest_commit( fb_status = "pending" elif actual_hash == stored_hash: if verdict is not None and not verdict.get("pruned"): - fb_status = "reflected" if verdict.get("verdict") == "compliant" else "drifted" - elif await has_prior_compliant_verdict(self._client, decision_id, region_id): + fb_status = ( + "reflected" if verdict.get("verdict") == "compliant" else "drifted" + ) + elif await has_prior_compliant_verdict( + self._client, decision_id, region_id + ): fb_status = "drifted" else: fb_status = 
"pending" @@ -710,18 +738,24 @@ async def ingest_commit( if actual_hash and verdict is None: if region_code_body is None: region_code_body = _extract_code_body( - file_path, start_line, end_line, repo_path, ref=commit_hash, + file_path, + start_line, + end_line, + repo_path, + ref=commit_hash, ) - pending_checks.append({ - "phase": phase, - "decision_id": decision_id, - "region_id": region_id, - "decision_description": str(decision.get("description", "")), - "file_path": file_path, - "symbol": symbol_name, - "content_hash": actual_hash, - "code_body": region_code_body, - }) + pending_checks.append( + { + "phase": phase, + "decision_id": decision_id, + "region_id": region_id, + "decision_description": str(decision.get("description", "")), + "file_path": file_path, + "symbol": symbol_name, + "content_hash": actual_hash, + "code_body": region_code_body, + } + ) decisions = [i for i in (region.get("decisions") or []) if i is not None] if not decisions and symbol_name: @@ -740,11 +774,13 @@ async def ingest_commit( # `d["id"]` returns "" and produces unusable grounding # checks the caller cannot bind against. Surfaced by V1 F1 # regression coverage. 
- pending_grounding_checks.append({ - "decision_id": str(d.get("decision_id") or d.get("id", "")), - "description": str(d.get("description", "")), - "reason": "ungrounded", - }) + pending_grounding_checks.append( + { + "decision_id": str(d.get("decision_id") or d.get("id", "")), + "description": str(d.get("description", "")), + "reason": "ungrounded", + } + ) except Exception as exc: logger.warning("[link_commit] could not query ungrounded decisions: %s", exc) @@ -765,16 +801,18 @@ async def ingest_commit( if not current_hash: continue code_body = _extract_code_body(fp, sl, el, repo_path, ref=commit_hash) - pending_checks.append({ - "phase": "drift", - "decision_id": str(row.get("decision_id", "")), - "region_id": region_id, - "decision_description": str(row.get("description", "")), - "file_path": fp, - "symbol": row.get("symbol_name", ""), - "content_hash": current_hash, - "code_body": code_body, - }) + pending_checks.append( + { + "phase": "drift", + "decision_id": str(row.get("decision_id", "")), + "region_id": region_id, + "decision_description": str(row.get("description", "")), + "file_path": fp, + "symbol": row.get("symbol_name", ""), + "content_hash": current_hash, + "code_body": code_body, + } + ) except Exception as exc: logger.warning("[link_commit] could not surface stale pending decisions: %s", exc) @@ -805,6 +843,7 @@ async def backfill_empty_hashes( if drift_analyzer is None: from .drift import HashDriftAnalyzer + drift_analyzer = HashDriftAnalyzer() legacy = await get_regions_without_hash(self._client, repo=repo_path) @@ -842,7 +881,7 @@ async def backfill_empty_hashes( await update_region_hash(self._client, region_id, drift_result.content_hash, ref) new_status = drift_result.status - for decision in (region.get("decisions") or []): + for decision in region.get("decisions") or []: if decision is None: continue decision_id = str(decision.get("id", "")) @@ -971,10 +1010,9 @@ async def ingest_payload(self, payload: dict, ctx=None) -> dict: # contexts — 
fall through with empty hash so the decision # is created as ungrounded (matches pre-v0.10.7 behavior). repo_on_disk = Path(repo).resolve().is_dir() - ref_resolves = ( - repo_on_disk - and (effective_ref == "working_tree" - or resolve_ref(effective_ref, repo) is not None) + ref_resolves = repo_on_disk and ( + effective_ref == "working_tree" + or resolve_ref(effective_ref, repo) is not None ) if repo_on_disk and ref_resolves: _computed = compute_content_hash( @@ -984,7 +1022,9 @@ async def ingest_payload(self, payload: dict, ctx=None) -> dict: logger.warning( "[ingest] skipping region: file '%s' not found at %s in %s" " — only bind to existing code, never hypothetical files", - file_path, effective_ref, repo, + file_path, + effective_ref, + repo, ) continue content_hash = _computed @@ -1025,7 +1065,9 @@ async def ingest_payload(self, payload: dict, ctx=None) -> dict: provenance["grounding_tier"] = grounding_tier provenance["method"] = "auto_ground" await relate_binds_to( - self._client, decision_id, region_id, + self._client, + decision_id, + region_id, confidence=region_data.get("confidence", 0.8), provenance=provenance, ) @@ -1107,12 +1149,13 @@ async def wipe_all_rows(self, repo: str) -> None: immediately ready for use after this call returns. 
""" import shutil + await self._ensure_connected() await self._client.close() self._connected = False url = self._url if url.startswith("surrealkv://"): - db_path = url[len("surrealkv://"):] + db_path = url[len("surrealkv://") :] if db_path: shutil.rmtree(db_path, ignore_errors=True) await self._ensure_connected() @@ -1156,9 +1199,7 @@ async def apply_supersede( new_id, old_id, confidence=1.0, - reason=( - f"human-confirmed supersession via resolve_collision session={session_id}" - ), + reason=(f"human-confirmed supersession via resolve_collision session={session_id}"), ) rows = await self._client.query(f"SELECT signoff FROM {old_id} LIMIT 1") old_signoff: dict = {} diff --git a/ledger/ast_diff.py b/ledger/ast_diff.py index e452fad8..ac4e90e7 100644 --- a/ledger/ast_diff.py +++ b/ledger/ast_diff.py @@ -41,18 +41,20 @@ # Languages B1 actually classifies. Anything else returns False (fail-safe). # Matches the set wired into code_locator/indexing/symbol_extractor.py so # the cosmetic detector never silently diverges from the indexer. -SUPPORTED_LANGUAGES: frozenset[str] = frozenset({ - "python", - "javascript", - "typescript", - "java", - "go", - "rust", - "c_sharp", - # via LANGUAGE_FALLBACK - "jsx", - "tsx", -}) +SUPPORTED_LANGUAGES: frozenset[str] = frozenset( + { + "python", + "javascript", + "typescript", + "java", + "go", + "rust", + "c_sharp", + # via LANGUAGE_FALLBACK + "jsx", + "tsx", + } +) def is_cosmetic_change(before: str, after: str, lang: str) -> bool: @@ -93,8 +95,9 @@ def is_cosmetic_change(before: str, after: str, lang: str) -> bool: # If either input doesn't parse cleanly, refuse to call it cosmetic. 
if tree_before.root_node.has_error or tree_after.root_node.has_error: return False - return _signature(tree_before.root_node, before_bytes) == \ - _signature(tree_after.root_node, after_bytes) + return _signature(tree_before.root_node, before_bytes) == _signature( + tree_after.root_node, after_bytes + ) except (Exception, RecursionError) as exc: logger.debug("[ast_diff] classifier failed for %s: %s", normalized, exc) return False @@ -114,7 +117,7 @@ def _signature(node: Any, source: bytes) -> tuple: produces a signature mismatch. """ if node.child_count == 0: - return (node.type, source[node.start_byte:node.end_byte]) + return (node.type, source[node.start_byte : node.end_byte]) return ( node.type, tuple(_signature(child, source) for child in node.children), diff --git a/ledger/canonical.py b/ledger/canonical.py index 67d9e8b5..e05bad85 100644 --- a/ledger/canonical.py +++ b/ledger/canonical.py @@ -42,8 +42,7 @@ import json import re import unicodedata -from uuid import NAMESPACE_URL, UUID, uuid5 - +from uuid import NAMESPACE_URL, uuid5 # Stable namespace UUID for bicameral canonical IDs. Derived from a # bicameral-specific URL via UUIDv5(NAMESPACE_URL, "https://bicameral.dev/v0.4.13/canonical"). 
diff --git a/ledger/client.py b/ledger/client.py index 54b7c26d..7852d751 100644 --- a/ledger/client.py +++ b/ledger/client.py @@ -11,6 +11,7 @@ from typing import Any from surrealdb import AsyncSurreal, RecordID + try: from surrealdb import SurrealError except ImportError: diff --git a/ledger/drift.py b/ledger/drift.py index 6adfae07..7d2e32eb 100644 --- a/ledger/drift.py +++ b/ledger/drift.py @@ -11,6 +11,7 @@ from __future__ import annotations from ports import DriftResult + from .status import compute_content_hash, derive_status, resolve_symbol_lines @@ -38,9 +39,7 @@ async def analyze_region( start_line, end_line = resolved # Compute actual hash at this ref - actual_hash = compute_content_hash( - file_path, start_line, end_line, repo_path, ref=ref - ) + actual_hash = compute_content_hash(file_path, start_line, end_line, repo_path, ref=ref) # Self-heal legacy regions that were persisted before v0.4.5's # baseline-stamping fix. If we have no stored hash but the code diff --git a/ledger/queries.py b/ledger/queries.py index 42d02276..5c1bc72d 100644 --- a/ledger/queries.py +++ b/ledger/queries.py @@ -11,7 +11,7 @@ from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from .client import LedgerClient, LedgerError @@ -25,6 +25,7 @@ # Team-mode event replay re-issues every RELATE; duplicates are rejected by the # DB and treated as a no-op success here. 
+ async def _execute_idempotent_edge( client: LedgerClient, sql: str, vars: dict | None = None ) -> None: @@ -132,7 +133,7 @@ async def upsert_source_cursor( "source_scope": source_scope, "cursor": cursor, "last_source_ref": last_source_ref, - "synced_at": str(datetime.now(timezone.utc).isoformat()), + "synced_at": str(datetime.now(UTC).isoformat()), "status": status, "error": error, } @@ -193,16 +194,13 @@ async def get_all_decisions( ca = row.pop("created_at", None) row.setdefault("ingested_at", str(ca)[:24] if ca else "") for row in rows: - for region in (row.get("code_regions") or []): + for region in row.get("code_regions") or []: if region and "symbol_name" in region: region["symbol"] = region.pop("symbol_name") for row in rows: spans = row.pop("source_spans", None) or [] description = row.get("description", "") - real_spans = [ - s for s in spans - if s and s.get("text") and s.get("text") != description - ] + real_spans = [s for s in spans if s and s.get("text") and s.get("text") != description] first_span = real_spans[0] if real_spans else None row["source_excerpt"] = (first_span.get("text") if first_span else "") or "" if not row.get("meeting_date"): @@ -253,15 +251,12 @@ async def search_by_bm25( ca = row.pop("created_at", None) row.setdefault("ingested_at", str(ca)[:24] if ca else "") row["confidence"] = round(1.0 - (i / max(total, 1)) * 0.4, 2) - for region in (row.get("code_regions") or []): + for region in row.get("code_regions") or []: if region and "symbol_name" in region: region["symbol"] = region.pop("symbol_name") spans = row.pop("source_spans", None) or [] description = row.get("description", "") - real_spans = [ - s for s in spans - if s and s.get("text") and s.get("text") != description - ] + real_spans = [s for s in spans if s and s.get("text") and s.get("text") != description] first_span = real_spans[0] if real_spans else None row["source_excerpt"] = (first_span.get("text") if first_span else "") or "" row["meeting_date"] = 
(first_span.get("meeting_date") if first_span else "") or "" @@ -376,7 +371,7 @@ async def get_decisions_for_file( "purpose": region_row.get("purpose", ""), "content_hash": region_row.get("content_hash", ""), } - for decision in (region_row.get("decisions") or []): + for decision in region_row.get("decisions") or []: if decision is None: continue did = str(decision.get("id", "")) @@ -384,19 +379,21 @@ async def get_decisions_for_file( continue seen_decision_ids.add(did) decision_id_set.add(did) - results.append({ - "decision_id": did, - "description": decision.get("description", ""), - "source_type": decision.get("source_type", ""), - "source_ref": decision.get("source_ref", ""), - "source_excerpt": "", - "meeting_date": "", - "speaker": "", - "ingested_at": str(decision.get("created_at", "")), - "status": decision.get("status", "ungrounded"), - "signoff": decision.get("signoff"), - "code_region": region, - }) + results.append( + { + "decision_id": did, + "description": decision.get("description", ""), + "source_type": decision.get("source_type", ""), + "source_ref": decision.get("source_ref", ""), + "source_excerpt": "", + "meeting_date": "", + "speaker": "", + "ingested_at": str(decision.get("created_at", "")), + "status": decision.get("status", "ungrounded"), + "signoff": decision.get("signoff"), + "code_region": region, + } + ) # Backfill source_excerpt + meeting_date via yields reverse edge if decision_id_set: @@ -412,14 +409,11 @@ async def get_decisions_for_file( ) excerpt_by_decision: dict[str, tuple[str, str]] = {} desc_by_decision = {e["decision_id"]: e.get("description", "") for e in results} - for r in (excerpt_rows or []): + for r in excerpt_rows or []: did = str(r.get("decision_id", "")) desc = desc_by_decision.get(did, "") spans = r.get("source_spans") or [] - real_spans = [ - s for s in spans - if s and s.get("text") and s.get("text") != desc - ] + real_spans = [s for s in spans if s and s.get("text") and s.get("text") != desc] first = real_spans[0] 
if real_spans else None if first: excerpt_by_decision[did] = ( @@ -486,7 +480,7 @@ async def get_decisions_for_files( "purpose": region_row.get("purpose", ""), "content_hash": region_row.get("content_hash", ""), } - for decision in (region_row.get("decisions") or []): + for decision in region_row.get("decisions") or []: if decision is None: continue did = str(decision.get("id", "")) @@ -494,18 +488,20 @@ async def get_decisions_for_files( continue seen_decision_ids.add(did) decision_id_set.add(did) - results.append({ - "decision_id": did, - "description": decision.get("description", ""), - "source_type": decision.get("source_type", ""), - "source_ref": decision.get("source_ref", ""), - "source_excerpt": "", - "meeting_date": "", - "ingested_at": str(decision.get("created_at", "")), - "status": decision.get("status", "ungrounded"), - "signoff": decision.get("signoff"), - "code_region": region, - }) + results.append( + { + "decision_id": did, + "description": decision.get("description", ""), + "source_type": decision.get("source_type", ""), + "source_ref": decision.get("source_ref", ""), + "source_excerpt": "", + "meeting_date": "", + "ingested_at": str(decision.get("created_at", "")), + "status": decision.get("status", "ungrounded"), + "signoff": decision.get("signoff"), + "code_region": region, + } + ) # Backfill source_excerpt + meeting_date if decision_id_set: @@ -521,14 +517,11 @@ async def get_decisions_for_files( ) desc_by_decision = {e["decision_id"]: e.get("description", "") for e in results} excerpt_by_decision: dict[str, tuple[str, str]] = {} - for r in (excerpt_rows or []): + for r in excerpt_rows or []: did = str(r.get("decision_id", "")) desc = desc_by_decision.get(did, "") spans = r.get("source_spans") or [] - real_spans = [ - s for s in spans - if s and s.get("text") and s.get("text") != desc - ] + real_spans = [s for s in spans if s and s.get("text") and s.get("text") != desc] first = real_spans[0] if real_spans else None if first: 
excerpt_by_decision[did] = ( @@ -710,9 +703,13 @@ async def upsert_code_region( WHERE file_path = $file_path AND symbol_name = $symbol_name """, { - "file_path": file_path, "symbol_name": symbol_name, - "start_line": start_line, "end_line": end_line, - "purpose": purpose, "repo": repo, "content_hash": content_hash, + "file_path": file_path, + "symbol_name": symbol_name, + "start_line": start_line, + "end_line": end_line, + "purpose": purpose, + "repo": repo, + "content_hash": content_hash, }, ) if rows: @@ -949,6 +946,7 @@ async def update_decision_status( # stable across authors and machines, so it's the only id safe to ship # across the JSONL event log. + async def get_canonical_id( client: LedgerClient, decision_id: str, @@ -1389,13 +1387,17 @@ async def search_context_pending_by_text( total = len(rows) for i, row in enumerate(rows): signoff = row.get("signoff") - if not (signoff and isinstance(signoff, dict) and signoff.get("state") == "context_pending"): + if not ( + signoff and isinstance(signoff, dict) and signoff.get("state") == "context_pending" + ): continue - results.append({ - "decision_id": row.get("decision_id", ""), - "description": row.get("description", ""), - "overlap_score": round(1.0 - (i / max(total, 1)) * 0.4, 2), - }) + results.append( + { + "decision_id": row.get("decision_id", ""), + "description": row.get("description", ""), + "overlap_score": round(1.0 - (i / max(total, 1)) * 0.4, 2), + } + ) if len(results) >= top_k: break return results @@ -1471,7 +1473,7 @@ async def get_context_for_ready_decisions( # shape and raises ``LedgerError`` on mismatch — a single choke point # per call instead of trusting upstream callers. 
-import re as _re +import re as _re # noqa: E402 _RECORD_ID_RE = _re.compile(r"^[A-Za-z_][A-Za-z0-9_]*:[A-Za-z0-9_\-]+$") @@ -1514,8 +1516,10 @@ async def upsert_code_subject( WHERE kind = $kind AND canonical_name = $name """, { - "kind": kind, "name": canonical_name, - "repo_ref": repo_ref, "conf": current_confidence, + "kind": kind, + "name": canonical_name, + "repo_ref": repo_ref, + "conf": current_confidence, }, ) if rows: @@ -1524,8 +1528,10 @@ async def upsert_code_subject( "CREATE code_subject SET kind=$kind, canonical_name=$name, " "repo_ref=$repo_ref, current_confidence=$conf", { - "kind": kind, "name": canonical_name, - "repo_ref": repo_ref, "conf": current_confidence, + "kind": kind, + "name": canonical_name, + "repo_ref": repo_ref, + "conf": current_confidence, }, ) return str(rows[0].get("id", "")) if rows else "" @@ -1608,8 +1614,7 @@ async def relate_has_identity( siid = _validated_record_id(subject_identity_id, "subject_identity") await _execute_idempotent_edge( client, - f"RELATE {csid}->has_identity->{siid} " - "SET confidence=$c, created_at=time::now()", + f"RELATE {csid}->has_identity->{siid} SET confidence=$c, created_at=time::now()", {"c": confidence}, ) @@ -1625,8 +1630,7 @@ async def link_decision_to_subject( csid = _validated_record_id(code_subject_id, "code_subject") await _execute_idempotent_edge( client, - f"RELATE {did}->about->{csid} " - "SET confidence=$c, created_at=time::now()", + f"RELATE {did}->about->{csid} SET confidence=$c, created_at=time::now()", {"c": confidence}, ) diff --git a/ledger/schema.py b/ledger/schema.py index f7d475f6..0a417bd6 100644 --- a/ledger/schema.py +++ b/ledger/schema.py @@ -14,6 +14,7 @@ from __future__ import annotations import logging +from datetime import UTC from .client import LedgerClient, LedgerError @@ -38,7 +39,7 @@ 7: "0.8.0", 8: "0.9.0", 9: "0.9.3", - 11: "0.11.0", # placeholder; release-eng pins final value at PR merge + 11: "0.11.0", # placeholder; release-eng pins final value at PR merge } 
# Migrations that drop or recreate tables/data. These are never auto-applied; @@ -72,16 +73,14 @@ class SchemaVersionTooNew(LedgerError): # Core tables _TABLES = [ # ── Decision tier ──────────────────────────────────────────────────── - # input_span — raw verbatim text excerpt from a meeting, PRD, Slack, or # implementation-time rationale. "What was said / written." # text is required — no DEFAULT. A span without verbatim text is rejected # at the ingest contract boundary (IngestDecision.source_excerpt must be # non-empty). See v0.5.0 plan §Core Principle. "DEFINE TABLE input_span SCHEMAFULL", - "DEFINE FIELD text ON input_span TYPE string " - "ASSERT string::len($value) > 0", - "DEFINE FIELD source_type ON input_span TYPE string", # transcript | notion | slack | document | manual | implementation_choice + "DEFINE FIELD text ON input_span TYPE string ASSERT string::len($value) > 0", + "DEFINE FIELD source_type ON input_span TYPE string", # transcript | notion | slack | document | manual | implementation_choice "DEFINE FIELD source_ref ON input_span TYPE string DEFAULT ''", # meeting ID, page URL, etc. "DEFINE FIELD speakers ON input_span TYPE array<string> DEFAULT []", "DEFINE FIELD meeting_date ON input_span TYPE string DEFAULT ''", @@ -89,7 +88,6 @@ class SchemaVersionTooNew(LedgerError): "DEFINE INDEX idx_input_span_ref ON input_span FIELDS source_type, source_ref", # Dedup: same excerpt from same source is the same span "DEFINE INDEX idx_input_span_dedup ON input_span FIELDS source_type, source_ref, text UNIQUE", - # decision — extracted decision / requirement. "What was decided." 
# Denormalized source fields (source_type, source_ref, speakers, meeting_date) # are kept for query speed; they mirror the linked input_span but are never @@ -122,9 +120,7 @@ class SchemaVersionTooNew(LedgerError): "SEARCH ANALYZER biz_analyzer BM25(1.2, 0.75) HIGHLIGHTS", # Powers the "awaiting signoff" PM dashboard queue "DEFINE INDEX idx_decision_signoff ON decision FIELDS signoff", - # ── Shared / unchanged ────────────────────────────────────────────── - # symbol — a named code entity (function, class, file). Retrieval-tier only. "DEFINE TABLE symbol SCHEMAFULL", "DEFINE FIELD name ON symbol TYPE string", @@ -134,12 +130,11 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD hit_count ON symbol TYPE int DEFAULT 0", "DEFINE INDEX idx_sym_name ON symbol FIELDS name SEARCH ANALYZER code_analyzer BM25(1.2, 0.75)", "DEFINE INDEX idx_sym_file ON symbol FIELDS file_path", - # code_region — a specific span within a file. Shared between the two tiers: # decision tier addresses it via binds_to; retrieval tier via locates. 
"DEFINE TABLE code_region SCHEMAFULL CHANGEFEED 30d INCLUDE ORIGINAL", "DEFINE FIELD file_path ON code_region TYPE string", - "DEFINE FIELD symbol_name ON code_region TYPE string", # display-only metadata, not a graph edge target + "DEFINE FIELD symbol_name ON code_region TYPE string", # display-only metadata, not a graph edge target "DEFINE FIELD start_line ON code_region TYPE int", "DEFINE FIELD end_line ON code_region TYPE int", "DEFINE FIELD purpose ON code_region TYPE string DEFAULT ''", @@ -148,7 +143,6 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD content_hash ON code_region TYPE string DEFAULT ''", "DEFINE INDEX idx_region_sym ON code_region FIELDS symbol_name", "DEFINE INDEX idx_region_file ON code_region FIELDS repo, file_path", - # vocab_cache — grounding reuse cache for query→code_region lookups "DEFINE TABLE vocab_cache SCHEMAFULL", "DEFINE FIELD query_text ON vocab_cache TYPE string", @@ -158,14 +152,12 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD last_hit ON vocab_cache TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_vocab_query ON vocab_cache FIELDS query_text SEARCH ANALYZER biz_analyzer BM25(1.2, 0.75)", "DEFINE INDEX idx_vocab_repo ON vocab_cache FIELDS repo", - # ledger_sync — idempotency cursor (last synced commit per repo) "DEFINE TABLE ledger_sync SCHEMAFULL", "DEFINE FIELD repo ON ledger_sync TYPE string", "DEFINE FIELD last_synced_commit ON ledger_sync TYPE string", "DEFINE FIELD synced_at ON ledger_sync TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_sync_repo ON ledger_sync FIELDS repo UNIQUE", - # source_cursor — upstream ingestion checkpoint per source stream "DEFINE TABLE source_cursor SCHEMAFULL", "DEFINE FIELD repo ON source_cursor TYPE string", @@ -177,13 +169,12 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD status ON source_cursor TYPE string DEFAULT 'ok'", "DEFINE FIELD error ON source_cursor TYPE string DEFAULT ''", "DEFINE INDEX idx_source_cursor ON source_cursor FIELDS repo, 
source_type, source_scope UNIQUE", - # compliance_check — LLM verification cache. # Cache key: (decision_id, region_id, content_hash) — one verdict per code shape. # pruned=true means the caller said "not_relevant" — retrieval mistake, binds_to # edge has been deleted. Row kept for audit trail. "DEFINE TABLE compliance_check SCHEMAFULL", - "DEFINE FIELD decision_id ON compliance_check TYPE string", # renamed from intent_id + "DEFINE FIELD decision_id ON compliance_check TYPE string", # renamed from intent_id "DEFINE FIELD region_id ON compliance_check TYPE string", "DEFINE FIELD content_hash ON compliance_check TYPE string", "DEFINE FIELD commit_hash ON compliance_check TYPE string DEFAULT ''", @@ -203,7 +194,6 @@ class SchemaVersionTooNew(LedgerError): "DEFINE INDEX idx_cc_region ON compliance_check FIELDS region_id", "DEFINE INDEX idx_cc_commit ON compliance_check FIELDS commit_hash", "DEFINE INDEX idx_cc_ephemeral ON compliance_check FIELDS ephemeral", - # graph_proposal — AI-generated edge proposals for human review. # from_id / to_id are TYPE string (not TYPE record) because this table can # link across different node types. Traverse via type::thing($from_id). @@ -220,12 +210,10 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD session_id ON graph_proposal TYPE string DEFAULT ''", "DEFINE FIELD created_at ON graph_proposal TYPE datetime DEFAULT time::now()", "DEFINE FIELD reviewed_at ON graph_proposal TYPE option<datetime> DEFAULT NONE", - # ── CodeGenome tier (v11, additive — Phase 1+2 / #59) ─────────────── # All writes are gated by codegenome.write_identity_records=True at the # handler boundary. Tables exist unconditionally so toggling the flag # mid-deployment does not require a migration. - # code_subject — a conceptual code target (function, class, module…) # that can survive movement across files. Distinct from `symbol`, # which is keyed on name+kind at one point in time. 
@@ -233,13 +221,10 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD kind ON code_subject TYPE string", "DEFINE FIELD canonical_name ON code_subject TYPE string", "DEFINE FIELD repo_ref ON code_subject TYPE option<string>", - "DEFINE FIELD current_confidence ON code_subject TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD current_confidence ON code_subject TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON code_subject TYPE datetime DEFAULT time::now()", "DEFINE FIELD updated_at ON code_subject TYPE datetime DEFAULT time::now()", - "DEFINE INDEX idx_code_subject_canonical " - "ON code_subject FIELDS kind, canonical_name UNIQUE", - + "DEFINE INDEX idx_code_subject_canonical ON code_subject FIELDS kind, canonical_name UNIQUE", # subject_identity — durable fingerprint for one observation of a # code_subject. Phase 3 (#60) will add a supersedes edge between # identities; not defined yet. @@ -255,7 +240,6 @@ class SchemaVersionTooNew(LedgerError): "DEFINE FIELD model_version ON subject_identity TYPE string", "DEFINE FIELD created_at ON subject_identity TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_subject_identity_address ON subject_identity FIELDS address UNIQUE", - # subject_version — concrete location/symbol observation at one # repo_ref. 
Phase 3 (#60) will write versions when a continuity match # resolves a relocation; Phase 1+2 only defines the table (foundation @@ -291,27 +275,23 @@ class SchemaVersionTooNew(LedgerError): "DEFINE TABLE yields SCHEMAFULL TYPE RELATION IN input_span OUT decision", "DEFINE FIELD created_at ON yields TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_yields_unique ON yields FIELDS in, out UNIQUE", - # decision → code_region (direct binding — decision tier only) "DEFINE TABLE binds_to SCHEMAFULL TYPE RELATION IN decision OUT code_region", "DEFINE FIELD confidence ON binds_to TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD provenance ON binds_to TYPE object DEFAULT {}", "DEFINE FIELD created_at ON binds_to TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_binds_to_unique ON binds_to FIELDS in, out UNIQUE", - # symbol → code_region (retrieval tier — BM25 / graph / future embeddings) "DEFINE TABLE locates SCHEMAFULL TYPE RELATION IN symbol OUT code_region", "DEFINE FIELD confidence ON locates TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON locates TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_locates_unique ON locates FIELDS in, out UNIQUE", - # decision → decision (human-confirmed supersession — v0.8.0 HITL) "DEFINE TABLE supersedes SCHEMAFULL TYPE RELATION IN decision OUT decision", "DEFINE FIELD confidence ON supersedes TYPE float", "DEFINE FIELD reason ON supersedes TYPE string DEFAULT ''", "DEFINE FIELD created_at ON supersedes TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_supersedes_unique ON supersedes FIELDS in, out UNIQUE", - # input_span → decision (human-confirmed context provision — v0.8.0 HITL) "DEFINE TABLE context_for SCHEMAFULL TYPE RELATION IN input_span OUT decision", "DEFINE FIELD relevance_score ON context_for TYPE float", @@ -320,34 +300,26 @@ class SchemaVersionTooNew(LedgerError): "ASSERT $value IN ['proposed', 'confirmed', 'rejected'] DEFAULT 'proposed'", "DEFINE FIELD created_at ON 
context_for TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_ctx_unique ON context_for FIELDS in, out UNIQUE", - # code_region → code_region (structural dependency — unchanged) "DEFINE TABLE depends_on SCHEMAFULL TYPE RELATION IN code_region OUT code_region", "DEFINE FIELD edge_type ON depends_on TYPE string", "DEFINE FIELD created_at ON depends_on TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_depends_on_unique ON depends_on FIELDS in, out, edge_type UNIQUE", - # ── CodeGenome edges (v11, additive — Phase 1+2 / #59) ────────────── - # code_subject → has_identity → subject_identity "DEFINE TABLE has_identity SCHEMAFULL TYPE RELATION IN code_subject OUT subject_identity", - "DEFINE FIELD confidence ON has_identity TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON has_identity TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON has_identity TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_has_identity_unique ON has_identity FIELDS in, out UNIQUE", - # code_subject → has_version → subject_version "DEFINE TABLE has_version SCHEMAFULL TYPE RELATION IN code_subject OUT subject_version", - "DEFINE FIELD confidence ON has_version TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON has_version TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON has_version TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_has_version_unique ON has_version FIELDS in, out UNIQUE", - # decision → about → code_subject (used by find_subject_identities_for_decision # to walk decision → subject → identity in two hops) "DEFINE TABLE about SCHEMAFULL TYPE RELATION IN decision OUT code_subject", - "DEFINE FIELD confidence ON about TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON about TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON about TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_about_unique ON 
about FIELDS in, out UNIQUE", ] @@ -368,9 +340,15 @@ def _with_overwrite(sql: str) -> str: the current field constraints (ASSERT clauses, DEFAULT values, TYPE) even when the field already exists in the DB. """ - for keyword in ("DEFINE TABLE", "DEFINE FIELD", "DEFINE INDEX", "DEFINE ANALYZER", "DEFINE EVENT"): + for keyword in ( + "DEFINE TABLE", + "DEFINE FIELD", + "DEFINE INDEX", + "DEFINE ANALYZER", + "DEFINE EVENT", + ): if sql.upper().startswith(keyword) and "OVERWRITE" not in sql.upper(): - return keyword + " OVERWRITE" + sql[len(keyword):] + return keyword + " OVERWRITE" + sql[len(keyword) :] return sql @@ -402,7 +380,7 @@ async def init_schema(client: LedgerClient) -> None: clauses, DEFAULT values, TYPE) are always brought up to the current schema definition — even when running against a DB created by an older version. """ - for sql in (_ANALYZERS + _TABLES + _EDGES + _META): + for sql in _ANALYZERS + _TABLES + _EDGES + _META: sql = sql.strip() if sql: await _execute_define_idempotent(client, _with_overwrite(sql)) @@ -410,6 +388,7 @@ async def init_schema(client: LedgerClient) -> None: # ── Migrations ────────────────────────────────────────────────────────── + async def _migrate_v4_to_v5(client: LedgerClient) -> None: """v4 → v5: Remove stale v3-era yields edges and deduplicate. 
@@ -431,7 +410,7 @@ async def _migrate_v4_to_v5(client: LedgerClient) -> None: "WHERE string::starts_with(type::string(in), 'source_span:') " " OR string::starts_with(type::string(out), 'intent:')" ) - for row in (stale or []): + for row in stale or []: try: await client.execute(f"DELETE {row['id']}") except Exception: @@ -448,7 +427,7 @@ async def _migrate_v4_to_v5(client: LedgerClient) -> None: all_yields = await client.query("SELECT id, in, out FROM yields") seen: set[tuple[str, str]] = set() removed = 0 - for row in (all_yields or []): + for row in all_yields or []: key = (str(row.get("in", "")), str(row.get("out", ""))) if key in seen: try: @@ -482,16 +461,14 @@ async def _migrate_v5_to_v6(client: LedgerClient) -> None: New ingests after v0.7.0 write signoff = {state:'proposed', ...} by default. """ - from datetime import datetime, timezone + from datetime import datetime - now_iso = datetime.now(timezone.utc).isoformat() + now_iso = datetime.now(UTC).isoformat() try: - all_decisions = await client.query( - "SELECT id, product_signoff FROM decision" - ) + all_decisions = await client.query("SELECT id, product_signoff FROM decision") migrated = 0 - for row in (all_decisions or []): + for row in all_decisions or []: decision_id = str(row.get("id", "")) old_signoff = row.get("product_signoff") @@ -560,7 +537,6 @@ async def _migrate_v6_to_v7(client: LedgerClient) -> None: "DEFINE FIELD reason ON supersedes TYPE string DEFAULT ''", "DEFINE FIELD created_at ON supersedes TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_supersedes_unique ON supersedes FIELDS in, out UNIQUE", - "DEFINE TABLE context_for SCHEMAFULL TYPE RELATION IN input_span OUT decision", "DEFINE FIELD relevance_score ON context_for TYPE float", "DEFINE FIELD reason ON context_for TYPE string DEFAULT ''", @@ -568,7 +544,6 @@ async def _migrate_v6_to_v7(client: LedgerClient) -> None: "ASSERT $value IN ['proposed', 'confirmed', 'rejected'] DEFAULT 'proposed'", "DEFINE FIELD created_at ON 
context_for TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_ctx_unique ON context_for FIELDS in, out UNIQUE", - # Proposal infrastructure (AI does not write here yet) "DEFINE TABLE graph_proposal SCHEMAFULL", "DEFINE FIELD proposal_type ON graph_proposal TYPE string " @@ -582,7 +557,6 @@ async def _migrate_v6_to_v7(client: LedgerClient) -> None: "DEFINE FIELD session_id ON graph_proposal TYPE string DEFAULT ''", "DEFINE FIELD created_at ON graph_proposal TYPE datetime DEFAULT time::now()", "DEFINE FIELD reviewed_at ON graph_proposal TYPE option<datetime> DEFAULT NONE", - # Expanded status ASSERT (v6→v7 era; narrowed again in v10) "DEFINE FIELD status ON decision TYPE string DEFAULT 'ungrounded' " "ASSERT $value IN ['reflected', 'drifted', 'pending', 'ungrounded', " @@ -609,7 +583,9 @@ async def _migrate_v7_to_v8(client: LedgerClient) -> None: try: await client.execute("UPDATE compliance_check SET ephemeral = false WHERE ephemeral = NONE") - logger.info("[migration] v7 → v8: backfilled compliance_check.ephemeral = false on existing rows") + logger.info( + "[migration] v7 → v8: backfilled compliance_check.ephemeral = false on existing rows" + ) except Exception as exc: logger.warning("[migration] v7 → v8: backfill failed (non-fatal): %s", exc) @@ -651,15 +627,16 @@ async def _migrate_v9_to_v10(client: LedgerClient) -> None: code-compliance status will be re-derived on the next drift sweep. 3. Tighten the ASSERT constraint on the status field. 
""" - from datetime import datetime, timezone - _now = datetime.now(timezone.utc).isoformat() + from datetime import datetime + + _now = datetime.now(UTC).isoformat() # Step 1: superseded decisions — move superseded into signoff superseded_rows = await client.query( "SELECT type::string(id) AS id, signoff FROM decision WHERE status = 'superseded'" ) migrated_superseded = 0 - for row in (superseded_rows or []): + for row in superseded_rows or []: decision_id = row.get("id", "") existing_signoff = row.get("signoff") or {} if not decision_id: @@ -684,8 +661,7 @@ async def _migrate_v9_to_v10(client: LedgerClient) -> None: # (their signoff already carries the right state; the status field was a # projection artifact of the old project_decision_status short-circuits) await client.execute( - "UPDATE decision SET status = 'ungrounded' " - "WHERE status IN ['proposal', 'context_pending']" + "UPDATE decision SET status = 'ungrounded' WHERE status IN ['proposal', 'context_pending']" ) # Step 3: tighten ASSERT @@ -723,7 +699,6 @@ async def _migrate_v10_to_v11(client: LedgerClient) -> None: "DEFINE FIELD updated_at ON code_subject TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_code_subject_canonical " "ON code_subject FIELDS kind, canonical_name UNIQUE", - "DEFINE TABLE subject_identity SCHEMAFULL", "DEFINE FIELD address ON subject_identity TYPE string", "DEFINE FIELD identity_type ON subject_identity TYPE string", @@ -735,9 +710,7 @@ async def _migrate_v10_to_v11(client: LedgerClient) -> None: "ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD model_version ON subject_identity TYPE string", "DEFINE FIELD created_at ON subject_identity TYPE datetime DEFAULT time::now()", - "DEFINE INDEX idx_subject_identity_address " - "ON subject_identity FIELDS address UNIQUE", - + "DEFINE INDEX idx_subject_identity_address ON subject_identity FIELDS address UNIQUE", "DEFINE TABLE subject_version SCHEMAFULL", "DEFINE FIELD repo_ref ON subject_version TYPE string", "DEFINE FIELD 
file_path ON subject_version TYPE string", @@ -750,23 +723,17 @@ async def _migrate_v10_to_v11(client: LedgerClient) -> None: "DEFINE FIELD created_at ON subject_version TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_subject_version_loc " "ON subject_version FIELDS repo_ref, file_path, start_line, end_line", - # Edges "DEFINE TABLE has_identity SCHEMAFULL TYPE RELATION IN code_subject OUT subject_identity", - "DEFINE FIELD confidence ON has_identity TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON has_identity TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON has_identity TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_has_identity_unique ON has_identity FIELDS in, out UNIQUE", - "DEFINE TABLE has_version SCHEMAFULL TYPE RELATION IN code_subject OUT subject_version", - "DEFINE FIELD confidence ON has_version TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON has_version TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON has_version TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_has_version_unique ON has_version FIELDS in, out UNIQUE", - "DEFINE TABLE about SCHEMAFULL TYPE RELATION IN decision OUT code_subject", - "DEFINE FIELD confidence ON about TYPE float " - "ASSERT $value >= 0 AND $value <= 1", + "DEFINE FIELD confidence ON about TYPE float ASSERT $value >= 0 AND $value <= 1", "DEFINE FIELD created_at ON about TYPE datetime DEFAULT time::now()", "DEFINE INDEX idx_about_unique ON about FIELDS in, out UNIQUE", ] @@ -831,7 +798,9 @@ async def migrate(client: LedgerClient, allow_destructive: bool = False) -> None logger.info( "[migration] Schema version %d → %d (%d migration(s) to apply)", - current, SCHEMA_VERSION, SCHEMA_VERSION - current, + current, + SCHEMA_VERSION, + SCHEMA_VERSION - current, ) for target_version in range(current + 1, SCHEMA_VERSION + 1): diff --git a/ledger/status.py b/ledger/status.py index 
c25faf88..91c64546 100644 --- a/ledger/status.py +++ b/ledger/status.py @@ -44,7 +44,10 @@ def resolve_symbol_lines( try: result = subprocess.run( ["git", "show", f"{ref}:{file_path}"], - cwd=abs_repo, capture_output=True, text=True, timeout=10, + cwd=abs_repo, + capture_output=True, + text=True, + timeout=10, ) if result.returncode != 0: return None @@ -57,9 +60,15 @@ def resolve_symbol_lines( ext = Path(file_path).suffix lang_map = { - ".py": "python", ".js": "javascript", ".jsx": "javascript", - ".ts": "typescript", ".tsx": "typescript", ".java": "java", - ".go": "go", ".rs": "rust", ".cs": "csharp", + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".java": "java", + ".go": "go", + ".rs": "rust", + ".cs": "csharp", } lang = lang_map.get(ext) if lang is None: @@ -67,19 +76,33 @@ def resolve_symbol_lines( symbols = extract_symbols_from_content(content, lang, file_path) for sym in symbols: - name = getattr(sym, "name", None) or (sym.get("name") if isinstance(sym, dict) else None) - qname = getattr(sym, "qualified_name", None) or (sym.get("qualified_name") if isinstance(sym, dict) else None) - sl = getattr(sym, "start_line", None) or (sym.get("start_line") if isinstance(sym, dict) else None) - el = getattr(sym, "end_line", None) or (sym.get("end_line") if isinstance(sym, dict) else None) + name = getattr(sym, "name", None) or ( + sym.get("name") if isinstance(sym, dict) else None + ) + qname = getattr(sym, "qualified_name", None) or ( + sym.get("qualified_name") if isinstance(sym, dict) else None + ) + sl = getattr(sym, "start_line", None) or ( + sym.get("start_line") if isinstance(sym, dict) else None + ) + el = getattr(sym, "end_line", None) or ( + sym.get("end_line") if isinstance(sym, dict) else None + ) if name == symbol_name or qname == symbol_name: return (sl, el) # Try fuzzy: symbol_name might be unqualified bare = symbol_name.split(".")[-1] if "." 
in symbol_name else symbol_name for sym in symbols: - name = getattr(sym, "name", None) or (sym.get("name") if isinstance(sym, dict) else None) - sl = getattr(sym, "start_line", None) or (sym.get("start_line") if isinstance(sym, dict) else None) - el = getattr(sym, "end_line", None) or (sym.get("end_line") if isinstance(sym, dict) else None) + name = getattr(sym, "name", None) or ( + sym.get("name") if isinstance(sym, dict) else None + ) + sl = getattr(sym, "start_line", None) or ( + sym.get("start_line") if isinstance(sym, dict) else None + ) + el = getattr(sym, "end_line", None) or ( + sym.get("end_line") if isinstance(sym, dict) else None + ) if name == bare: return (sl, el) @@ -159,12 +182,12 @@ def compute_content_hash( content = get_git_content(file_path, start_line, end_line, repo_path, ref) if content is None: return None - # Validate line range (warn but still hash — shorter file = drift signal) - line_count = len(content.splitlines()) if start_line < 1 or end_line < start_line: logger.warning( "[status] Invalid range %d:%d for %s", - start_line, end_line, file_path, + start_line, + end_line, + file_path, ) return None return hash_lines(content, start_line, end_line) @@ -259,7 +282,9 @@ def get_changed_files_in_range( if result.returncode != 0: logger.warning( "[status] git diff %s..%s failed: %s", - base_sha[:8], head_sha[:8], result.stderr[:200], + base_sha[:8], + head_sha[:8], + result.stderr[:200], ) return None return [f.strip() for f in result.stdout.strip().splitlines() if f.strip()] diff --git a/local_counters.py b/local_counters.py index 7c8a1d8e..72b2e21a 100644 --- a/local_counters.py +++ b/local_counters.py @@ -23,11 +23,11 @@ import json import logging import os -import sys import threading from collections import Counter -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path +from typing import IO logger = logging.getLogger(__name__) @@ -41,7 +41,7 @@ def _enabled() -> bool: return val not in 
_OFF_VALUES -def _open_for_append_secure(path: Path) -> "os.PathLike": +def _open_for_append_secure(path: Path) -> IO[bytes]: """Open the counters file with 0o600 mode on POSIX (user-only).""" flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND fd = os.open(str(path), flags, 0o600) @@ -57,7 +57,7 @@ def increment(tool_name: str, *, delta: int = 1) -> None: record = { "tool": tool_name, "delta": int(delta), - "ts": datetime.now(timezone.utc).isoformat(), + "ts": datetime.now(UTC).isoformat(), } line = json.dumps(record, separators=(",", ":")) + "\n" with _LOCK: diff --git a/ports.py b/ports.py index a446d94c..9ba65809 100644 --- a/ports.py +++ b/ports.py @@ -10,10 +10,9 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Protocol, runtime_checkable - # ── Drift Analysis ────────────────────────────────────────────────────── diff --git a/pyproject.toml b/pyproject.toml index cfd6c93f..308194d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,12 +48,86 @@ test = [ "pytest>=8.0.0", "pytest-asyncio>=0.23.0", "tiktoken>=0.7.0,<1.0.0", + "ruff>=0.5.0", + "mypy>=1.10.0", ] [project.scripts] bicameral-mcp = "server:cli_main" +bicameral-mcp-preflight-reminder = "scripts.hooks.preflight_reminder:main" +bicameral-mcp-post-commit-sync-reminder = "scripts.hooks.post_commit_sync_reminder:main" +bicameral-mcp-collision-capture-reminder = "scripts.hooks.post_preflight_capture_reminder:main" [tool.hatch.build.targets.wheel] packages = ["."] -exclude = ["tests", "visual-plan", "mocks", "test-results"] +exclude = [ + "tests", + "visual-plan", + "mocks", + "test-results", + "docs/demos/**/*.mp4", +] artifacts = ["skills/**/*.md", "skills/**/*.yaml"] + +[tool.ruff] +line-length = 100 +target-version = "py311" +extend-exclude = [ + "test-results", + "visual-plan", + "mocks", + ".agent", + ".claude", +] + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "B", "UP"] # pyflakes + pycodestyle + isort + bugbear 
+ pyupgrade +ignore = ["E501"] # line-length handled by formatter + +[tool.ruff.lint.per-file-ignores] +# Test files often reference module-internal symbols imported via patching, +# use intentional unused-locals for clarity, and use assert-style equality. +# Day-one CI keeps tests/ lenient; tighten in follow-up cleanup PRs. +"tests/**" = ["F401", "F811", "F821", "F841", "E712", "B017", "B904", "E402", "E731"] +"scripts/**" = ["F401", "F841", "E402", "E731"] + +[tool.mypy] +python_version = "3.11" +ignore_missing_imports = true # project depends on pydantic, mcp, surrealdb — many unstubbed +warn_return_any = false +strict_optional = true +disable_error_code = ["import-untyped"] # missing third-party stubs (e.g. PyYAML) — chip away in follow-ups +exclude = [ + "test-results/", + "visual-plan/", + "mocks/", + ".agent/", + ".claude/", + "build/", + "dist/", + # Tests/fixtures aren't part of the production type surface; tighten in follow-ups. + "^tests/", + "^scripts/", +] + +# Day-one mypy: noisy modules suppressed wholesale to keep CI green. +# Each entry below is a follow-up cleanup target. Track in a separate type-cleanup +# project — do NOT remove entries here without first fixing the underlying errors. 
+[[tool.mypy.overrides]] +module = [ + "server", + "setup_wizard", + "code_locator.indexing.cocoindex_pipeline", + "code_locator.indexing.symbol_extractor", + "adapters.code_locator", + "ledger.adapter", + "ledger.status", + "ledger.queries", + "ledger.schema", + "handlers.ratify", + "handlers.search_decisions", + "handlers.resolve_compliance", + "handlers.preflight", + "handlers.detect_drift", +] +ignore_errors = true diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/hooks/__init__.py b/scripts/hooks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/hooks/post_commit_sync_reminder.py b/scripts/hooks/post_commit_sync_reminder.py new file mode 100644 index 00000000..ed610472 --- /dev/null +++ b/scripts/hooks/post_commit_sync_reminder.py @@ -0,0 +1,82 @@ +"""PostToolUse hook for the ``Bash`` tool — git write-op detector. + +When the agent runs ``git commit`` / ``git merge`` / ``git pull`` / +``git rebase --continue``, inject a system-reminder telling the agent to +call ``/bicameral:sync`` so the decision ledger picks up the new HEAD, +runs compliance checks, and produces authoritative reflected/drifted +verdicts before the next user turn. + +Replaces the plain-stdout one-liner ``_BICAMERAL_POST_COMMIT_COMMAND`` +that previously lived inline in ``setup_wizard.py``. Per Claude Code +2.x hook docs (https://code.claude.com/docs/en/hooks), plain stdout +from PostToolUse hooks is silently dropped to the debug log — only +UserPromptSubmit / UserPromptExpansion / SessionStart treat raw stdout +as agent-visible context. Symptom: the agent committed but never +followed through to call ``link_commit`` / ``/bicameral:sync`` because +the reminder never reached the model. Fix: emit the structured +envelope ``{"hookSpecificOutput": {"hookEventName": "PostToolUse", +"additionalContext": "..."}}``. 
import json
import sys

# Only PostToolUse payloads whose ``tool_name`` equals this value are inspected.
BASH_TOOL_NAME = "Bash"

# Substrings that mark a git write-op against HEAD that the agent should
# follow up with /bicameral:sync. Exact phrasing matches the legacy
# inline command's tuple so behavior is byte-identical except for the
# stdout envelope. Note the deliberate trailing space in "git merge ".
WRITE_OP_MARKERS: tuple[str, ...] = (
    "git commit",
    "git merge ",
    "git pull",
    "git rebase --continue",
)

# The reminder text preserves the canonical "bicameral: new commit
# detected" phrase — the ``bicameral-sync`` skill watches for that exact
# prefix as one of its trigger signals.
REMINDER_TEXT = (
    "bicameral: new commit detected — run /bicameral:sync to resolve "
    "compliance and get authoritative reflected/drifted status"
)


def _is_git_write_op(command: str) -> bool:
    """Return True when ``command`` contains any ``WRITE_OP_MARKERS`` substring."""
    return any(marker in command for marker in WRITE_OP_MARKERS)


def main() -> int:
    """Read a hook payload from stdin and emit the sync reminder if warranted.

    The payload is expected to be a JSON object carrying ``tool_name`` and
    ``tool_input`` (with a string ``command``). When the tool is ``Bash`` and
    the command contains a git write-op marker, the structured envelope
    ``{"hookSpecificOutput": {"hookEventName": "PostToolUse",
    "additionalContext": REMINDER_TEXT}}`` is written to stdout. In every
    other case nothing is written.

    Returns:
        Always ``0``. Errors are swallowed silently (exit 0, empty response)
        so a broken hook never blocks a user. That covers malformed input
        as well as I/O failures on stdin or stdout.
    """
    try:
        payload = json.load(sys.stdin)
    except (ValueError, OSError, RecursionError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers an unreadable stdin; RecursionError covers
        # pathologically nested JSON. None of these may surface to the user.
        return 0
    if not isinstance(payload, dict):
        return 0
    if payload.get("tool_name") != BASH_TOOL_NAME:
        return 0
    tool_input = payload.get("tool_input") or {}
    command = tool_input.get("command", "") if isinstance(tool_input, dict) else ""
    if not isinstance(command, str) or not _is_git_write_op(command):
        return 0
    envelope = {
        "hookSpecificOutput": {
            "hookEventName": "PostToolUse",
            "additionalContext": REMINDER_TEXT,
        }
    }
    try:
        json.dump(envelope, sys.stdout)
    except (OSError, ValueError):
        # A closed or broken stdout (BrokenPipeError is an OSError; writing to
        # a closed stream raises ValueError) must not become a traceback.
        return 0
    return 0


if __name__ == "__main__":
    sys.exit(main())
+ +When preflight surfaces ≥1 decision, inject a system-reminder templating +the correction-capture loop (Step 5.6 of ``skills/bicameral-preflight``). +Per #175, the agent does NOT judge contradiction itself — instead it +asks the user via ``AskUserQuestion`` (Step 5.6.1) and acts mechanically +on the answer (Step 5.6.2): + + 1. ``AskUserQuestion`` — disambiguate whether the current request is + a refinement of any surfaced decision. Three options: supersede, + keep_both, unrelated. + 2. If 'supersede' or 'keep_both': + - ``bicameral.ingest(source="agent_session", ...)`` + - ``bicameral.resolve_collision(new_id=..., old_id=..., action=...)`` + 3. If 'unrelated': skip capture, proceed to implementation. + +Why route the judgment to the user (path D in the #175 design discussion): +prior implementations tried (a) a conditional "IF you contradict ..." gate +which let the agent skip on borderline prompts, then (b) an unconditional +"you MUST capture" gate which the agent still ignored on structural- +mismatch prompts (e.g. "add programmatic API" vs "drag-and-drop UX" +decision — agent rationalized "these can coexist" and skipped). The +contradiction judgment is semantic, not lexical, and LLM-level inference +is unreliable on it. The user is the only party with the actual intent; +the skill puts the question to them. + +Trust contract preserved: the hook only fires when ``fired=True`` +AND ``len(decisions) > 0`` — silent on no signal. The question runs at +a moment the flow is already paused (rendering the surfaced block). + +Per Claude Code 2.x hook contract: read JSON ``{tool_name, tool_input, +tool_response}`` from stdin. ``tool_response`` is either a JSON string or +a dict — both are handled. Output is the structured envelope +``{"hookSpecificOutput": {"hookEventName": "PostToolUse", +"additionalContext": "..."}}`` written to stdout; the CLI surfaces +``additionalContext`` next to the tool result the model sees on the next +turn. 
Plain stdout is silently dropped to the debug log for PostToolUse +events (per https://code.claude.com/docs/en/hooks — only +UserPromptSubmit / UserPromptExpansion / SessionStart treat raw stdout +as agent-visible context). Errors swallowed silently (exit 0, empty +response) so a broken hook never blocks a user. +""" + +from __future__ import annotations + +import json +import sys + +PREFLIGHT_TOOL_NAME = "mcp__bicameral__bicameral_preflight" + + +def _coerce_response(raw: object) -> dict: + """Return a dict view of ``tool_response`` whether it arrived as a + JSON string or already-decoded dict. On any failure return ``{}`` — + the caller treats an empty dict as "no decisions to template." + """ + if isinstance(raw, dict): + return raw + if isinstance(raw, str): + try: + decoded = json.loads(raw) + except (json.JSONDecodeError, ValueError): + return {} + return decoded if isinstance(decoded, dict) else {} + return {} + + +def _format_reminder(decisions: list[dict]) -> str: + bullets = "\n".join( + f" - {d.get('decision_id', '<unknown>')}: {d.get('description', '<no description>')}" + for d in decisions + ) + first = decisions[0] + first_id = first.get("decision_id", "<decision_id>") + first_desc = first.get("description", "<description>") + return ( + "<system-reminder>\n" + f"bicameral.preflight surfaced {len(decisions)} prior decision(s):\n" + f"{bullets}\n" + "\n" + "BEFORE any code edits, do NOT judge contradiction yourself — ask " + "the user. Per skills/bicameral-preflight Step 5.6.1, call " + "AskUserQuestion to disambiguate whether the current request is a " + "refinement of any surfaced decision. The user (not the agent) " + "decides; the agent then acts mechanically on the answer.\n" + "\n" + "AskUserQuestion({\n" + ' "question": "Your request appears to operate on the same feature ' + f"surface as surfaced decision {first_id} " + f'(\\"{first_desc[:100]}\\"). 
Treat this work as a refinement of that prior plan?",' + "\n" + ' "multiSelect": False,\n' + ' "options": [\n' + ' {"label": "Yes — supersede prior plan",\n' + ' "description": "<paraphrase user\'s direction; replaces the prior wholesale>"},\n' + ' {"label": "Yes — keep both (addition or scoping)",\n' + ' "description": "<paraphrase; adds to or narrows; both remain>"},\n' + ' {"label": "No — unrelated to prior plan",\n' + ' "description": "Continue without capture"},\n' + " ],\n" + "})\n" + "\n" + "Branch on the answer:\n" + " - 'supersede' → bicameral.ingest(source='agent_session') +\n" + " bicameral.resolve_collision(action='supersede')\n" + " - 'keep both' → bicameral.ingest(source='agent_session') +\n" + " bicameral.resolve_collision(action='keep_both')\n" + " - 'unrelated' → skip capture; proceed to implementation; narrate one\n" + " line ('noted — surfaced context isn't applicable here').\n" + "\n" + "If multiple decisions were surfaced and the user's request plausibly\n" + "touches more than one, ask once per plausibly-touched decision; skip\n" + "the question for surfaced decisions clearly unrelated to the prompt.\n" + "\n" + "Asking is cheap; missed capture is irreversible context loss. 
Default\n" + "to asking; only skip when the surfaced decision is obviously off-topic.\n" + "</system-reminder>" + ) + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except (json.JSONDecodeError, ValueError): + return 0 + if not isinstance(payload, dict): + return 0 + if payload.get("tool_name") != PREFLIGHT_TOOL_NAME: + return 0 + response = _coerce_response(payload.get("tool_response")) + if not response.get("fired"): + return 0 + decisions = response.get("decisions") or [] + if not isinstance(decisions, list) or not decisions: + return 0 + json.dump( + { + "hookSpecificOutput": { + "hookEventName": "PostToolUse", + "additionalContext": _format_reminder(decisions), + } + }, + sys.stdout, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/hooks/preflight_intent.py b/scripts/hooks/preflight_intent.py new file mode 100644 index 00000000..5910dd0a --- /dev/null +++ b/scripts/hooks/preflight_intent.py @@ -0,0 +1,77 @@ +"""Preflight intent classifier. + +Single source of truth for the verb list used by the bicameral-preflight +SKILL.md description and the UserPromptSubmit hook. Deterministic: no +LLM, no network, no I/O beyond a string scan. +""" + +from __future__ import annotations + +import re + +IMPLEMENTATION_VERBS: frozenset[str] = frozenset( + { + "add", + "build", + "create", + "implement", + "modify", + "refactor", + "update", + "fix", + "change", + "write", + "edit", + "move", + "rename", + "remove", + "delete", + "extract", + "convert", + "integrate", + "deploy", + "ship", + "configure", + "connect", + "extend", + "migrate", + "wire", + "hook up", + "set up", + "complete", + "finish", + "continue", + } +) + +INDIRECT_INTENT_PHRASES: tuple[str, ...] = ( + "how should i implement", + "how do i build", + "how should i write", + "what's the best way to add", + "what's the cleanest way to refactor", +) + +SKIP_PATTERNS: tuple[re.Pattern[str], ...] 
= ( + re.compile(r"\bfix\b.*\btypo\b", re.IGNORECASE), + re.compile(r"\bbump\b.*\b(?:to|from)\b.*\d+\.\d+", re.IGNORECASE), + re.compile(r"\bhow does\b", re.IGNORECASE), +) + +_VERB_REGEX = re.compile( + r"\b(?:" + "|".join(re.escape(v) for v in IMPLEMENTATION_VERBS) + r")\b", + re.IGNORECASE, +) + + +def should_fire_preflight(prompt: str) -> bool: + """Return True iff prompt indicates code-implementation intent.""" + if not prompt or not prompt.strip(): + return False + for skip in SKIP_PATTERNS: + if skip.search(prompt): + return False + if _VERB_REGEX.search(prompt): + return True + lowered = prompt.lower() + return any(phrase in lowered for phrase in INDIRECT_INTENT_PHRASES) diff --git a/scripts/hooks/preflight_reminder.py b/scripts/hooks/preflight_reminder.py new file mode 100644 index 00000000..7ea081b2 --- /dev/null +++ b/scripts/hooks/preflight_reminder.py @@ -0,0 +1,93 @@ +"""UserPromptSubmit hook for Claude Code. + +When the user prompt indicates code-implementation intent, inject a +system-reminder elevating bicameral.preflight above the agent's default +tool-selection priority — but only as a write-op gate, not a discovery +gate. + +Reconciles with #146: that issue's failure mode was the agent doing +file inspection and then NEVER calling preflight at all. The original +fix (#155) over-corrected by telling the agent to call preflight +"before any file-inspection tool". That short-circuited the caller-LLM +discovery the rest of the contract depends on — preflight needs +``file_paths`` populated to do region-anchored retrieval, and the agent +can't extract file paths if we forbid it from looking at the codebase +first. + +Updated contract: + - Read / Grep / Glob FIRST — caller LLM resolves "the reorder feature" + to concrete file paths. + - bicameral.preflight(topic, file_paths) — fed by step 1's discovery, + so the server gets the deterministic binds_to lookup it needs. 
+ - Write ops (Edit / Write / NotebookEdit / mutating Bash) — preflight + must precede the first one. This is the line that #146 was + actually defending; the asserter (assert_flow_2) gates exactly this. + +Per Claude Code 2.x hook contract: read JSON from stdin, write JSON to +stdout shaped as ``{"hookSpecificOutput": {"hookEventName": +"UserPromptSubmit", "additionalContext": "..."}}``. The legacy top-level +``{"additionalContext": ...}`` shape is silently ignored by the CLI — +the hook still runs and exits 0, but the context never reaches the +model. Errors are swallowed silently (exit 0, empty response) so a +broken hook never blocks a user. +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from hooks.preflight_intent import should_fire_preflight # noqa: E402 + +REMINDER_TEXT = ( + "<system-reminder>\n" + "This prompt indicates code-implementation intent. Auto-fire is the " + "contract: `bicameral.preflight` MUST run BEFORE any write op (Edit, " + "Write, NotebookEdit, or Bash that mutates state — git commit, file " + "writes, mv/rm/etc.).\n" + "\n" + "Read-only discovery FIRST, then preflight, then writes.\n" + "\n" + "Recommended sequence:\n" + " 1. Use Read / Grep / Glob to map the user's request to concrete " + "file paths. The user often names a feature ('the reorder feature') " + "rather than a file (`reorder.ts`); resolve that mapping yourself " + "before calling preflight.\n" + " 2. Call `bicameral.preflight(topic, file_paths)` with BOTH a " + "natural-language topic AND the concrete file paths discovered in " + "step 1. `file_paths=[]` defeats region-anchored retrieval — the " + "server uses these to look up bound decisions deterministically; " + "topic alone falls back to fuzzy text similarity.\n" + " 3. 
Read the surfaced decisions / drifted regions / open questions, " + "then proceed with the implementation.\n" + "\n" + "The skill produces zero output when nothing relevant is found, so " + "the cost of firing is bounded. Skipping preflight is the contract " + "violation, not running discovery first.\n" + "</system-reminder>" +) + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except (json.JSONDecodeError, ValueError): + return 0 + prompt = payload.get("prompt", "") if isinstance(payload, dict) else "" + if should_fire_preflight(prompt): + json.dump( + { + "hookSpecificOutput": { + "hookEventName": "UserPromptSubmit", + "additionalContext": REMINDER_TEXT, + } + }, + sys.stdout, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/sim_accountable.py b/scripts/sim_accountable.py index 6ffd401e..d95fe8fd 100644 --- a/scripts/sim_accountable.py +++ b/scripts/sim_accountable.py @@ -11,45 +11,58 @@ Run 7 — Search in surrealkv:// persistent mode (fix 3 verification) Run 8 — pending_compliance_checks → resolve_compliance → reflected status (v0.9.3 skill gap fix) """ -import sys, asyncio, os, tempfile, shutil, pathlib -sys.path.insert(0, '/Users/jinhongkuan/github/bicameral/pilot/mcp') -REPO = '/Users/jinhongkuan/github/Accountable-App-3.0' -os.environ['SURREAL_URL'] = 'memory://' -os.environ['REPO_PATH'] = REPO +import asyncio +import os +import pathlib +import shutil +import sys +import tempfile + +sys.path.insert(0, "/Users/jinhongkuan/github/bicameral/pilot/mcp") + +REPO = "/Users/jinhongkuan/github/Accountable-App-3.0" +os.environ["SURREAL_URL"] = "memory://" +os.environ["REPO_PATH"] = REPO RESULTS = [] + def section(title, body): RESULTS.append(f"\n## {title}\n\n{body.rstrip()}\n") - preview = body[:120].replace('\n', ' ') + preview = body[:120].replace("\n", " ") print(f"[{title}]", preview) def make_fresh_ledger(): - import importlib, adapters.ledger as _al + import importlib + + import adapters.ledger as _al + 
importlib.reload(_al) return _al.get_ledger() async def make_ctx(repo_path=None, surreal_url=None): if surreal_url: - os.environ['SURREAL_URL'] = surreal_url + os.environ["SURREAL_URL"] = surreal_url if repo_path: - os.environ['REPO_PATH'] = repo_path + os.environ["REPO_PATH"] = repo_path from adapters.code_locator import get_code_locator + ledger = make_fresh_ledger() await ledger.connect() code_graph = get_code_locator() class Ctx: pass + ctx = Ctx() ctx.repo_path = repo_path or REPO - ctx.session_id = 'sim-accountable-v2' - ctx.authoritative_ref = 'main' - ctx.authoritative_sha = '' - ctx.head_sha = '' + ctx.session_id = "sim-accountable-v2" + ctx.authoritative_ref = "main" + ctx.authoritative_sha = "" + ctx.head_sha = "" ctx.drift_analyzer = None ctx._sync_state = {} ctx.ledger = ledger @@ -58,24 +71,70 @@ class Ctx: SLACK_DECISIONS = [ - {"description": "All code changes must go to staging first via PR targeting staging branch — Ian cannot merge direct to main", "feature_group": "Dev Process", "decision_level": "L1"}, - {"description": "Staging environment mirrors prod with real integrations (except SMS and Zoom) and must stay in sync with main", "feature_group": "Dev Process", "decision_level": "L2"}, - {"description": "Brian Borg acts as engineering quarterback and coordinator — all PRs assigned to Brian before going to prod", "feature_group": "Dev Process", "decision_level": "L1"}, - {"description": "All high-value secrets live in Supabase secrets — not in Vercel env vars", "feature_group": "Security", "decision_level": "L2"}, - {"description": "Sentry auth token must be rotated and marked Sensitive in Vercel after Vercel breach exposed unprotected env vars", "feature_group": "Security", "decision_level": "L1"}, - {"description": "Assess Sentry vs PostHog — PostHog now captures ~80% of Sentry value; evaluate eliminating redundant tool", "feature_group": "Observability", "decision_level": "L2"}, - {"description": "Individual coaching portal for 1:1 clients 
to manage engagements, see recording transcripts, insights and trends", "feature_group": "Coaching Portal", "decision_level": "L1"}, - {"description": "Weekly workshop module should be a repeatable component — AI agent populates it and creates a new record each week rather than generating new code", "feature_group": "Weekly Workshop", "decision_level": "L2"}, - {"description": "Users can view their daily check-in completion history and trend data in the Accountable platform", "feature_group": "Daily Check-in", "decision_level": "L1"}, - {"description": "Claude reasoning level should be task-appropriate — start at lower reasoning with escalation tiers rather than always using maximum reasoning", "feature_group": "AI Coach", "decision_level": "L2"}, - {"description": "Weekly community bulletin delivered as a dynamic page — email directs users there rather than embedding full content to protect deliverability", "feature_group": "Email / Comms", "decision_level": "L2"}, + { + "description": "All code changes must go to staging first via PR targeting staging branch — Ian cannot merge direct to main", + "feature_group": "Dev Process", + "decision_level": "L1", + }, + { + "description": "Staging environment mirrors prod with real integrations (except SMS and Zoom) and must stay in sync with main", + "feature_group": "Dev Process", + "decision_level": "L2", + }, + { + "description": "Brian Borg acts as engineering quarterback and coordinator — all PRs assigned to Brian before going to prod", + "feature_group": "Dev Process", + "decision_level": "L1", + }, + { + "description": "All high-value secrets live in Supabase secrets — not in Vercel env vars", + "feature_group": "Security", + "decision_level": "L2", + }, + { + "description": "Sentry auth token must be rotated and marked Sensitive in Vercel after Vercel breach exposed unprotected env vars", + "feature_group": "Security", + "decision_level": "L1", + }, + { + "description": "Assess Sentry vs PostHog — PostHog now 
captures ~80% of Sentry value; evaluate eliminating redundant tool", + "feature_group": "Observability", + "decision_level": "L2", + }, + { + "description": "Individual coaching portal for 1:1 clients to manage engagements, see recording transcripts, insights and trends", + "feature_group": "Coaching Portal", + "decision_level": "L1", + }, + { + "description": "Weekly workshop module should be a repeatable component — AI agent populates it and creates a new record each week rather than generating new code", + "feature_group": "Weekly Workshop", + "decision_level": "L2", + }, + { + "description": "Users can view their daily check-in completion history and trend data in the Accountable platform", + "feature_group": "Daily Check-in", + "decision_level": "L1", + }, + { + "description": "Claude reasoning level should be task-appropriate — start at lower reasoning with escalation tiers rather than always using maximum reasoning", + "feature_group": "AI Coach", + "decision_level": "L2", + }, + { + "description": "Weekly community bulletin delivered as a dynamic page — email directs users there rather than embedding full content to protect deliverability", + "feature_group": "Email / Comms", + "decision_level": "L2", + }, ] # ── Run 1: Ingest ──────────────────────────────────────────────────────────── + async def run_ingest(ctx): from handlers.ingest import handle_ingest + mappings = [ { "intent": d["description"], @@ -91,11 +150,14 @@ async def run_ingest(ctx): } for d in SLACK_DECISIONS ] - result = await handle_ingest(ctx, { - "repo": REPO, - "query": "Accountable platform decisions from #accountable-tech", - "mappings": mappings, - }) + result = await handle_ingest( + ctx, + { + "repo": REPO, + "query": "Accountable platform decisions from #accountable-tech", + "mappings": mappings, + }, + ) created = result.created_decisions body = ( @@ -106,9 +168,11 @@ async def run_ingest(ctx): "Entries:\n" ) for d in created: - body += f" [{d.decision_level or '?'}] 
{d.decision_id} \"{d.description[:58]}...\"\n" + body += f' [{d.decision_level or "?"}] {d.decision_id} "{d.description[:58]}..."\n' - l1_in_pending = [d for d in result.pending_grounding_decisions if d.get("decision_level") == "L1"] + l1_in_pending = [ + d for d in result.pending_grounding_decisions if d.get("decision_level") == "L1" + ] body += ( f"\nL1 filter: pending_grounding_decisions has " f"{len(result.pending_grounding_decisions)} entries, " @@ -120,20 +184,26 @@ async def run_ingest(ctx): # ── Run 2: Preflight regression ────────────────────────────────────────────── + async def run_preflight_quick(ctx): from handlers.preflight import handle_preflight + r = await handle_preflight(ctx, topic="weekly workshop module repeatable component") - fired = getattr(r, 'fired', False) - count = len(getattr(r, 'decisions', []) or []) + fired = getattr(r, "fired", False) + count = len(getattr(r, "decisions", []) or []) body = f"Topic: 'weekly workshop module repeatable component'\nFired: {fired}, decisions surfaced: {count}\n" - body += "Result: " + ("PASS — preflight regression clean\n" if fired and count >= 1 else "FAIL\n") + body += "Result: " + ( + "PASS — preflight regression clean\n" if fired and count >= 1 else "FAIL\n" + ) section("Run 2 — Preflight regression", body) # ── Run 3: History + fix-2 verification ───────────────────────────────────── + async def run_history_verify(ctx): from handlers.history import handle_history + result = await handle_history(ctx) features = result.features or [] @@ -141,18 +211,18 @@ async def run_history_verify(ctx): name_ok = True level_ok = False for fg in features: - name = fg.name # correct attr (was fg.feature_group in v1 sim → showed '?') + name = fg.name # correct attr (was fg.feature_group in v1 sim → showed '?') decisions = fg.decisions or [] body += f" [{name}] — {len(decisions)} decision(s)\n" - if not name or name == '?': + if not name or name == "?": name_ok = False for d in decisions[:2]: - lvl = d.decision_level # 
new field — was absent from HistoryDecision in v1 sim + lvl = d.decision_level # new field — was absent from HistoryDecision in v1 sim body += f" [{lvl or 'None'}|{d.status}] {d.summary[:65]}\n" if lvl is not None: level_ok = True - body += f"\nFix 2 verdict:\n" + body += "\nFix 2 verdict:\n" body += f" fg.name populated: {name_ok} (was '?' in v1 — fixed)\n" body += f" d.decision_level populated: {level_ok} (was absent in v1 — fixed)\n" section("Run 3 — History + fix-2 verification (HistoryDecision.decision_level)", body) @@ -160,6 +230,7 @@ async def run_history_verify(ctx): # ── Run 4: Bind L2 decisions to Accountable code ──────────────────────────── + async def run_bind_accountable(ctx, ingest_result): from handlers.bind import handle_bind @@ -168,7 +239,10 @@ async def run_bind_accountable(ctx, ingest_result): ai_coach_id = next((v for k, v in id_by_desc.items() if "reasoning level" in k.lower()), None) if not weekly_id or not ai_coach_id: - section("Run 4 — Bind L2 decisions to Accountable code", "ERROR: target IDs not found in created_decisions") + section( + "Run 4 — Bind L2 decisions to Accountable code", + "ERROR: target IDs not found in created_decisions", + ) return None bindings = [ @@ -212,12 +286,14 @@ async def run_bind_accountable(ctx, ingest_result): # ── Run 5: Drift check post-bind (should be clean) ────────────────────────── + async def run_drift_post_bind(ctx): from handlers.detect_drift import handle_detect_drift + target = "supabase/functions/generate-weekly-ai-insights/index.ts" result = await handle_detect_drift(ctx, file_path=target) - drifted = getattr(result, 'drifted', []) or [] - reflected = getattr(result, 'reflected', []) or [] + drifted = getattr(result, "drifted", []) or [] + reflected = getattr(result, "reflected", []) or [] body = ( f"File: {target}\n" f"Drifted: {len(drifted)}, Reflected: {len(reflected)}\n" @@ -260,21 +336,34 @@ def apply_tier_bonus(base: float, tier: str) -> float: async def run_full_drift_loop(): 
"""Follow-up 4: ingest → bind → modify file → detect drift.""" import subprocess - tmpdir = tempfile.mkdtemp(prefix='bicam_drift_test_') + + tmpdir = tempfile.mkdtemp(prefix="bicam_drift_test_") try: # Bootstrap a real git repo so compute_content_hash works - subprocess.run(['git', 'init', '-b', 'main'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.email', 'test@test.com'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "init", "-b", "main"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=tmpdir, + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], cwd=tmpdir, check=True, capture_output=True + ) # Write and commit initial version test_file = pathlib.Path(tmpdir) / "discount.py" test_file.write_text(TEMP_FILE_CONTENT_V1) - subprocess.run(['git', 'add', 'discount.py'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'commit', '-m', 'initial: 10% discount on $100+'], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "add", "discount.py"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "commit", "-m", "initial: 10% discount on $100+"], + cwd=tmpdir, + check=True, + capture_output=True, + ) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = tmpdir + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = tmpdir ledger = make_fresh_ledger() await ledger.connect() @@ -283,12 +372,13 @@ async def run_full_drift_loop(): class Ctx: pass + ctx = Ctx() ctx.repo_path = tmpdir - ctx.session_id = 'sim-drift-loop' - ctx.authoritative_ref = 'main' - ctx.authoritative_sha = '' - ctx.head_sha = '' + ctx.session_id = "sim-drift-loop" + ctx.authoritative_ref = "main" + ctx.authoritative_sha = "" + 
ctx.head_sha = "" ctx.drift_analyzer = None ctx._sync_state = {} ctx.ledger = ledger @@ -296,64 +386,78 @@ class Ctx: # Step 1: ingest a decision about the discount logic from handlers.ingest import handle_ingest - ingest_result = await handle_ingest(ctx, { - "repo": tmpdir, - "query": "discount policy decision", - "mappings": [{ - "intent": "Apply 10% discount on orders over $100", - "feature_group": "Pricing", - "decision_level": "L2", - "span": { - "text": "Apply 10% discount on orders over $100", - "source_type": "slack", - "source_ref": "eng-discussion", - "meeting_date": "2026-04-26", - "speakers": ["Jin"], - }, - }], - }) + + ingest_result = await handle_ingest( + ctx, + { + "repo": tmpdir, + "query": "discount policy decision", + "mappings": [ + { + "intent": "Apply 10% discount on orders over $100", + "feature_group": "Pricing", + "decision_level": "L2", + "span": { + "text": "Apply 10% discount on orders over $100", + "source_type": "slack", + "source_ref": "eng-discussion", + "meeting_date": "2026-04-26", + "speakers": ["Jin"], + }, + } + ], + }, + ) decision_id = ingest_result.created_decisions[0].decision_id # Step 2: bind to the file at its current state from handlers.bind import handle_bind - bind_result = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "discount.py", - "symbol_name": "calculate_discount", - "start_line": 1, - "end_line": 5, - "purpose": "Discount calculation — 10% on orders over $100", - }]) + + bind_result = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "discount.py", + "symbol_name": "calculate_discount", + "start_line": 1, + "end_line": 5, + "purpose": "Discount calculation — 10% on orders over $100", + } + ], + ) bind_ok = bind_result.bindings and not bind_result.bindings[0].error initial_hash = bind_result.bindings[0].content_hash if bind_ok else "?" 
region_id = bind_result.bindings[0].region_id # Step 3: snapshot the stored hash before modification - pre_hash_row = await ledger._client.query( - f"SELECT content_hash FROM {region_id} LIMIT 1" - ) + pre_hash_row = await ledger._client.query(f"SELECT content_hash FROM {region_id} LIMIT 1") pre_hash = (pre_hash_row[0].get("content_hash") or "") if pre_hash_row else "" # Step 3b: check drift status — should be pending (V1: no compliance verdict yet) from handlers.detect_drift import handle_detect_drift + pre_result = await handle_detect_drift(ctx, file_path="discount.py") - pre_pending = len(getattr(pre_result, 'pending', []) or []) + pre_pending = len(getattr(pre_result, "pending", []) or []) # Step 4: modify the file and commit (threshold and rate changed) test_file.write_text(TEMP_FILE_CONTENT_V2) - subprocess.run(['git', 'add', 'discount.py'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'commit', '-m', 'change: 15% discount on $50+'], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "add", "discount.py"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "commit", "-m", "change: 15% discount on $50+"], + cwd=tmpdir, + check=True, + capture_output=True, + ) # Step 5: run detect_drift — triggers link_commit which re-hashes the file post_result = await handle_detect_drift(ctx, file_path="discount.py") - post_drifted = getattr(post_result, 'drifted', []) or [] - post_pending = getattr(post_result, 'pending', []) or [] + post_drifted = getattr(post_result, "drifted", []) or [] + post_pending = getattr(post_result, "pending", []) or [] # Step 5b: confirm the stored hash updated to reflect the new content - post_hash_row = await ledger._client.query( - f"SELECT content_hash FROM {region_id} LIMIT 1" - ) + post_hash_row = await ledger._client.query(f"SELECT content_hash FROM {region_id} LIMIT 1") post_hash = (post_hash_row[0].get("content_hash") or "") if post_hash_row else "" hash_changed = pre_hash 
!= post_hash and bool(post_hash) @@ -385,66 +489,80 @@ class Ctx: finally: shutil.rmtree(tmpdir, ignore_errors=True) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = REPO + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = REPO section("Run 6 — Full ingest→bind→modify→drift loop (follow-up 4)", body) # ── Run 7: Search in surrealkv:// persistent mode ─────────────────────────── + async def run_search_persistent(): - tmpdir = tempfile.mkdtemp(prefix='bicam_search_test_') + tmpdir = tempfile.mkdtemp(prefix="bicam_search_test_") try: - db_url = f'surrealkv://{tmpdir}/test.db' - os.environ['SURREAL_URL'] = db_url - os.environ['REPO_PATH'] = REPO + db_url = f"surrealkv://{tmpdir}/test.db" + os.environ["SURREAL_URL"] = db_url + os.environ["REPO_PATH"] = REPO ledger = make_fresh_ledger() await ledger.connect() from ledger.queries import upsert_decision + client = ledger._client test_decisions = [ - ("Coaching portal enables 1:1 client engagement visibility with transcripts", "Coaching Portal"), - ("Weekly workshop creates a new repeatable record each week via AI agent", "Weekly Workshop"), + ( + "Coaching portal enables 1:1 client engagement visibility with transcripts", + "Coaching Portal", + ), + ( + "Weekly workshop creates a new repeatable record each week via AI agent", + "Weekly Workshop", + ), ("Sentry token must be rotated after Vercel breach exposed env vars", "Security"), ] for desc, fg in test_decisions: await upsert_decision( - client, description=desc, source_type="slack", - source_ref="accountable-tech", status="ungrounded", feature_group=fg, + client, + description=desc, + source_type="slack", + source_ref="accountable-tech", + status="ungrounded", + feature_group=fg, ) await asyncio.sleep(0.3) # let FTS index settle class Ctx2: pass + ctx2 = Ctx2() ctx2.repo_path = REPO - ctx2.session_id = 'sim-search' - ctx2.authoritative_ref = 'main' - ctx2.authoritative_sha = '' - ctx2.head_sha = '' + ctx2.session_id = "sim-search" 
+ ctx2.authoritative_ref = "main" + ctx2.authoritative_sha = "" + ctx2.head_sha = "" ctx2.drift_analyzer = None ctx2._sync_state = {} ctx2.ledger = ledger ctx2.code_graph = None from handlers.search_decisions import handle_search_decisions + queries = ["coaching portal", "weekly workshop", "Sentry breach"] results_map = {} for q in queries: r = await handle_search_decisions(ctx2, query=q) - results_map[q] = getattr(r, 'decisions', []) or [] + results_map[q] = getattr(r, "decisions", []) or [] total_matches = sum(len(v) for v in results_map.values()) - body = f"DB: surrealkv:// (persistent, temp path)\nIngested 3 decisions, ran 3 queries.\n\n" + body = "DB: surrealkv:// (persistent, temp path)\nIngested 3 decisions, ran 3 queries.\n\n" for q, matches in results_map.items(): body += f"Query: '{q}'\n Matches: {len(matches)}\n" for d in matches[:2]: - body += f" - {getattr(d,'description','')[:70]}\n" + body += f" - {getattr(d, 'description', '')[:70]}\n" if total_matches == 0: body += ( @@ -460,14 +578,15 @@ class Ctx2: finally: shutil.rmtree(tmpdir, ignore_errors=True) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = REPO + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = REPO section("Run 7 — Search in surrealkv:// persistent mode (fix 3 verification)", body) # ── Run 8: pending_compliance_checks → resolve_compliance → reflected ──────── + async def run_compliance_resolution_loop(): """ Verify the V1 path to 'reflected' status: @@ -477,24 +596,37 @@ async def run_compliance_resolution_loop(): This is the exact flow the updated scan-branch / drift skills now prescribe. 
""" import subprocess - tmpdir = tempfile.mkdtemp(prefix='bicam_compliance_test_') + + tmpdir = tempfile.mkdtemp(prefix="bicam_compliance_test_") try: - subprocess.run(['git', 'init', '-b', 'main'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.email', 'test@test.com'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "init", "-b", "main"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=tmpdir, + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], cwd=tmpdir, check=True, capture_output=True + ) test_file = pathlib.Path(tmpdir) / "auth.py" test_file.write_text( - 'def require_auth(request):\n' + "def require_auth(request):\n" ' """Reject unauthenticated requests with 401."""\n' ' if not request.get("token"):\n' ' raise PermissionError("401 Unauthorized")\n' ) - subprocess.run(['git', 'add', 'auth.py'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'commit', '-m', 'initial: auth gate'], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "add", "auth.py"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "commit", "-m", "initial: auth gate"], + cwd=tmpdir, + check=True, + capture_output=True, + ) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = tmpdir + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = tmpdir ledger = make_fresh_ledger() await ledger.connect() @@ -503,12 +635,13 @@ async def run_compliance_resolution_loop(): class Ctx: pass + ctx = Ctx() ctx.repo_path = tmpdir - ctx.session_id = 'sim-compliance' - ctx.authoritative_ref = 'main' - ctx.authoritative_sha = '' - ctx.head_sha = '' + ctx.session_id = "sim-compliance" + ctx.authoritative_ref = "main" + ctx.authoritative_sha = 
"" + ctx.head_sha = "" ctx.drift_analyzer = None ctx._sync_state = {} ctx.ledger = ledger @@ -516,22 +649,28 @@ class Ctx: # Step 1: ingest from handlers.ingest import handle_ingest - ingest_result = await handle_ingest(ctx, { - "repo": tmpdir, - "query": "auth gate decision", - "mappings": [{ - "intent": "All API endpoints must reject unauthenticated requests with HTTP 401", - "feature_group": "Auth", - "decision_level": "L2", - "span": { - "text": "All API endpoints must reject unauthenticated requests with HTTP 401", - "source_type": "slack", - "source_ref": "eng-discussion", - "meeting_date": "2026-04-26", - "speakers": ["Jin"], - }, - }], - }) + + ingest_result = await handle_ingest( + ctx, + { + "repo": tmpdir, + "query": "auth gate decision", + "mappings": [ + { + "intent": "All API endpoints must reject unauthenticated requests with HTTP 401", + "feature_group": "Auth", + "decision_level": "L2", + "span": { + "text": "All API endpoints must reject unauthenticated requests with HTTP 401", + "source_type": "slack", + "source_ref": "eng-discussion", + "meeting_date": "2026-04-26", + "speakers": ["Jin"], + }, + } + ], + }, + ) decision_id = ingest_result.created_decisions[0].decision_id # Step 2: ratify the decision — proposed decisions are drift-exempt and @@ -539,23 +678,33 @@ class Ctx: # In real sessions the user reviews proposed decisions and calls ratify; # in this simulation we ratify immediately for verification purposes. 
from handlers.ratify import handle_ratify + await handle_ratify(ctx, decision_id=decision_id, signer="sim-run8", action="ratify") # Step 3: bind from handlers.bind import handle_bind - bind_result = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "auth.py", - "symbol_name": "require_auth", - "start_line": 1, - "end_line": 4, - "purpose": "Auth gate — reject unauthenticated requests with 401", - }]) + + bind_result = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "auth.py", + "symbol_name": "require_auth", + "start_line": 1, + "end_line": 4, + "purpose": "Auth gate — reject unauthenticated requests with 401", + } + ], + ) bind_ok = bind_result.bindings and not bind_result.bindings[0].error region_id = bind_result.bindings[0].region_id if bind_ok else None if not bind_ok: - section("Run 8 — pending_compliance_checks → resolve_compliance → reflected", "FAIL — bind failed") + section( + "Run 8 — pending_compliance_checks → resolve_compliance → reflected", + "FAIL — bind failed", + ) return # Step 3: advance HEAD so the sync cache is stale and link_commit sweeps fresh. @@ -563,32 +712,40 @@ class Ctx: # last_synced_commit, so without a new commit the detect_drift call # would hit the stale pre-bind cache and find 0 regions. 
test_file.write_text( - 'def require_auth(request):\n' + "def require_auth(request):\n" ' """Reject unauthenticated requests with 401."""\n' ' if not request.get("token"):\n' ' raise PermissionError("401 Unauthorized")\n' - '# v2: docstring clarified\n' + "# v2: docstring clarified\n" + ) + subprocess.run(["git", "add", "auth.py"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "commit", "-m", "docs: clarify require_auth docstring"], + cwd=tmpdir, + check=True, + capture_output=True, ) - subprocess.run(['git', 'add', 'auth.py'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'commit', '-m', 'docs: clarify require_auth docstring'], cwd=tmpdir, check=True, capture_output=True) # Step 4: detect_drift — triggers a fresh link_commit that sweeps auth.py, # finds the grounded region, and generates pending_compliance_checks. from handlers.detect_drift import handle_detect_drift + drift_result = await handle_detect_drift(ctx, file_path="auth.py") - sync_status = getattr(drift_result, 'sync_status', None) - pending_checks = getattr(sync_status, 'pending_compliance_checks', []) or [] - flow_id = getattr(sync_status, 'flow_id', '') or '' + sync_status = getattr(drift_result, "sync_status", None) + pending_checks = getattr(sync_status, "pending_compliance_checks", []) or [] + flow_id = getattr(sync_status, "flow_id", "") or "" status_before = "unknown" if pending_checks: # Read the actual decision status before resolving from ledger.queries import project_decision_status - inner = getattr(ledger, '_inner', ledger) + + inner = getattr(ledger, "_inner", ledger) status_before = await project_decision_status(inner._client, decision_id) # Step 5: call resolve_compliance for each pending check from handlers.resolve_compliance import handle_resolve_compliance + verdicts_written = 0 if pending_checks: verdicts = [ @@ -612,10 +769,11 @@ class Ctx: # Step 6: verify status is now 'reflected' from ledger.queries import 
project_decision_status - inner = getattr(ledger, '_inner', ledger) + + inner = getattr(ledger, "_inner", ledger) status_after = await project_decision_status(inner._client, decision_id) - passed = (status_after == "reflected") + passed = status_after == "reflected" if pending_checks: body = ( @@ -642,14 +800,17 @@ class Ctx: finally: shutil.rmtree(tmpdir, ignore_errors=True) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = REPO + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = REPO - section("Run 8 — pending_compliance_checks → resolve_compliance → reflected (skill gap fix)", body) + section( + "Run 8 — pending_compliance_checks → resolve_compliance → reflected (skill gap fix)", body + ) # ── Run 9: signoff/status decoupling verification ─────────────────────────── + async def run_signoff_status_decoupling(): """ Verify the v0.9+ orthogonalization of status (code-compliance) and signoff (human-approval): @@ -660,30 +821,40 @@ async def run_signoff_status_decoupling(): C. resolve_collision supersede merges signoff dict — ratification record preserved D. 
History shows superseded decisions with last code-compliance status + signoff_state """ - import subprocess, datetime as dt - tmpdir = tempfile.mkdtemp(prefix='bicam_signoff_test_') + import datetime as dt + import subprocess + + tmpdir = tempfile.mkdtemp(prefix="bicam_signoff_test_") try: - subprocess.run(['git', 'init', '-b', 'main'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.email', 'test@test.com'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=tmpdir, check=True, capture_output=True) - (pathlib.Path(tmpdir) / 'app.py').write_text('def main(): pass\n') - subprocess.run(['git', 'add', 'app.py'], cwd=tmpdir, check=True, capture_output=True) - subprocess.run(['git', 'commit', '-m', 'init'], cwd=tmpdir, check=True, capture_output=True) - - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = tmpdir + subprocess.run(["git", "init", "-b", "main"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=tmpdir, + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], cwd=tmpdir, check=True, capture_output=True + ) + (pathlib.Path(tmpdir) / "app.py").write_text("def main(): pass\n") + subprocess.run(["git", "add", "app.py"], cwd=tmpdir, check=True, capture_output=True) + subprocess.run(["git", "commit", "-m", "init"], cwd=tmpdir, check=True, capture_output=True) + + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = tmpdir ledger = make_fresh_ledger() await ledger.connect() from adapters.code_locator import get_code_locator class Ctx: pass + ctx = Ctx() ctx.repo_path = tmpdir - ctx.session_id = 'sim-signoff' - ctx.authoritative_ref = 'main' - ctx.authoritative_sha = '' - ctx.head_sha = '' + ctx.session_id = "sim-signoff" + ctx.authoritative_ref = "main" + ctx.authoritative_sha = "" + ctx.head_sha = "" ctx.drift_analyzer = None 
ctx._sync_state = {} ctx.ledger = ledger @@ -698,36 +869,39 @@ class Ctx: from handlers.ingest import handle_ingest from ledger.queries import project_decision_status - ingest_r = await handle_ingest(ctx, { - "repo": tmpdir, - "query": "signoff decoupling test", - "mappings": [{ - "intent": "Feature flags must be documented before enabling in prod", - "feature_group": "Release", - "decision_level": "L2", - "span": { - "text": "Feature flags must be documented before enabling in prod", - "source_type": "slack", - "source_ref": "eng-channel", - "meeting_date": "2026-04-26", - "speakers": ["Jin"], - }, - # NOTE: no 'signoff' key — server stamps signoff.state='proposed' - }], - }) + ingest_r = await handle_ingest( + ctx, + { + "repo": tmpdir, + "query": "signoff decoupling test", + "mappings": [ + { + "intent": "Feature flags must be documented before enabling in prod", + "feature_group": "Release", + "decision_level": "L2", + "span": { + "text": "Feature flags must be documented before enabling in prod", + "source_type": "slack", + "source_ref": "eng-channel", + "meeting_date": "2026-04-26", + "speakers": ["Jin"], + }, + # NOTE: no 'signoff' key — server stamps signoff.state='proposed' + } + ], + }, + ) did = ingest_r.created_decisions[0].decision_id - inner = getattr(ledger, '_inner', ledger) + inner = getattr(ledger, "_inner", ledger) code_status = await project_decision_status(inner._client, did) - raw_rows = await inner._client.query( - f"SELECT signoff FROM {did} LIMIT 1" - ) - raw_signoff = (raw_rows[0].get('signoff') or {}) if raw_rows else {} - signoff_state = raw_signoff.get('state', '?') - discovered = raw_signoff.get('discovered', '?') + raw_rows = await inner._client.query(f"SELECT signoff FROM {did} LIMIT 1") + raw_signoff = (raw_rows[0].get("signoff") or {}) if raw_rows else {} + signoff_state = raw_signoff.get("state", "?") + discovered = raw_signoff.get("discovered", "?") - a_pass = (code_status == 'ungrounded' and signoff_state == 'proposed') + a_pass 
= code_status == "ungrounded" and signoff_state == "proposed" results_a = [ f" decision_id: {did}", f" status: {code_status} (expected: ungrounded)", @@ -738,9 +912,7 @@ class Ctx: # ── B: session-start banner detects stale proposal via signoff ──────── # Backdate the signoff to simulate 15-day-old proposal - stale_created = ( - dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=15) - ).isoformat() + stale_created = (dt.datetime.now(dt.UTC) - dt.timedelta(days=15)).isoformat() await inner._client.execute( f"UPDATE {did} SET signoff = $s", {"s": {**raw_signoff, "created_at": stale_created}}, @@ -748,6 +920,7 @@ class Ctx: # Mock the ledger's get_decisions_by_status to return our stale-proposal row from unittest.mock import AsyncMock, patch + stale_row = { "decision_id": did, "description": "Feature flags must be documented before enabling in prod", @@ -759,6 +932,7 @@ class Ctx: class BannerCtx: pass + bctx = BannerCtx() bctx._sync_state = {} mock_ledger = AsyncMock() @@ -766,14 +940,15 @@ class BannerCtx: bctx.ledger = mock_ledger from handlers.sync_middleware import get_session_start_banner + banner = await get_session_start_banner(bctx) b_pass = ( banner is not None and banner.stale_proposal_count == 1 and banner.proposal_count == 1 - and any(i.get('signoff_state') == 'proposed' for i in banner.items) - and 'stale proposal' in banner.message + and any(i.get("signoff_state") == "proposed" for i in banner.items) + and "stale proposal" in banner.message ) results_b = [ f" banner fired: {banner is not None}", @@ -788,37 +963,46 @@ class BannerCtx: # ── C: resolve_collision supersede merges signoff ───────────────────── # Ratify the old decision first from handlers.ratify import handle_ratify + rat = await handle_ratify(ctx, decision_id=did, signer="sim-run9") old_signoff_after_ratify = rat.signoff # Ingest a new superseding decision - ingest_new = await handle_ingest(ctx, { - "repo": tmpdir, - "query": "supersede test", - "mappings": [{ - "intent": "Feature flags 
must be documented AND reviewed by two engineers before prod", - "feature_group": "Release", - "decision_level": "L2", - "span": { - "text": "Feature flags must be documented AND reviewed by two engineers", - "source_type": "slack", - "source_ref": "eng-channel-v2", - "meeting_date": "2026-04-26", - "speakers": ["Jin"], - }, - }], - }) + ingest_new = await handle_ingest( + ctx, + { + "repo": tmpdir, + "query": "supersede test", + "mappings": [ + { + "intent": "Feature flags must be documented AND reviewed by two engineers before prod", + "feature_group": "Release", + "decision_level": "L2", + "span": { + "text": "Feature flags must be documented AND reviewed by two engineers", + "source_type": "slack", + "source_ref": "eng-channel-v2", + "meeting_date": "2026-04-26", + "speakers": ["Jin"], + }, + } + ], + }, + ) new_did = ingest_new.created_decisions[0].decision_id from handlers.resolve_collision import handle_resolve_collision + await handle_resolve_collision(ctx, new_id=new_did, old_id=did, action="supersede") # Read the old decision's signoff after supersession post_rows = await inner._client.query(f"SELECT signoff FROM {did} LIMIT 1") - post_signoff = (post_rows[0].get('signoff') or {}) if post_rows else {} + post_signoff = (post_rows[0].get("signoff") or {}) if post_rows else {} - c_ratified_preserved = post_signoff.get('ratified_at') == old_signoff_after_ratify.get('ratified_at') - c_state_superseded = post_signoff.get('state') == 'superseded' + c_ratified_preserved = post_signoff.get("ratified_at") == old_signoff_after_ratify.get( + "ratified_at" + ) + c_state_superseded = post_signoff.get("state") == "superseded" c_pass = c_state_superseded and c_ratified_preserved results_c = [ @@ -831,15 +1015,15 @@ class BannerCtx: # ── D: history shows superseded decisions with code-compliance status ─ from handlers.history import handle_history + hist = await handle_history(ctx) superseded_decisions = [ - d for fg in hist.features for d in fg.decisions - if 
d.signoff_state == 'superseded' + d for fg in hist.features for d in fg.decisions if d.signoff_state == "superseded" ] d_pass = ( len(superseded_decisions) == 1 - and superseded_decisions[0].status in ('ungrounded', 'pending', 'drifted', 'reflected') - and superseded_decisions[0].signoff_state == 'superseded' + and superseded_decisions[0].status in ("ungrounded", "pending", "drifted", "reflected") + and superseded_decisions[0].signoff_state == "superseded" ) results_d_dec = superseded_decisions[0] if superseded_decisions else None results_d = [ @@ -851,20 +1035,24 @@ class BannerCtx: finally: shutil.rmtree(tmpdir, ignore_errors=True) - os.environ['SURREAL_URL'] = 'memory://' - os.environ['REPO_PATH'] = REPO + os.environ["SURREAL_URL"] = "memory://" + os.environ["REPO_PATH"] = REPO all_pass = a_pass and b_pass and c_pass and d_pass body = ( "Testing v0.9+ status/signoff orthogonalization:\n\n" "A — Ingest without signoff → status='ungrounded', signoff.state='proposed'\n" - + '\n'.join(results_a) + '\n\n' + + "\n".join(results_a) + + "\n\n" "B — Session-start banner detects stale proposals via signoff.state (not status)\n" - + '\n'.join(results_b) + '\n\n' + + "\n".join(results_b) + + "\n\n" "C — resolve_collision supersede merges signoff (preserves ratification record)\n" - + '\n'.join(results_c) + '\n\n' + + "\n".join(results_c) + + "\n\n" "D — History surfaces superseded decisions with last code-compliance status\n" - + '\n'.join(results_d) + '\n\n' + + "\n".join(results_d) + + "\n\n" f"Overall: {'PASS — all four orthogonalization invariants hold' if all_pass else 'PARTIAL PASS — see sub-results'}\n" ) section("Run 9 — signoff/status decoupling verification (v0.9+)", body) @@ -872,10 +1060,11 @@ class BannerCtx: # ── main ───────────────────────────────────────────────────────────────────── + async def main(): print("=== Bicameral MCP v0.9.3 extended simulation ===\n") - ctx = await make_ctx(repo_path=REPO, surreal_url='memory://') + ctx = await 
make_ctx(repo_path=REPO, surreal_url="memory://") ingest_result = await run_ingest(ctx) await run_preflight_quick(ctx) await run_history_verify(ctx) diff --git a/scripts/sim_issue_108_flows.py b/scripts/sim_issue_108_flows.py index a524b05c..6590068a 100644 --- a/scripts/sim_issue_108_flows.py +++ b/scripts/sim_issue_108_flows.py @@ -75,9 +75,7 @@ class Ctx: def init_temp_git(prefix: str) -> str: tmpdir = tempfile.mkdtemp(prefix=prefix) - subprocess.run( - ["git", "init", "-b", "main"], cwd=tmpdir, check=True, capture_output=True - ) + subprocess.run(["git", "init", "-b", "main"], cwd=tmpdir, check=True, capture_output=True) subprocess.run( ["git", "config", "user.email", "sim@sim.com"], cwd=tmpdir, @@ -98,9 +96,7 @@ def commit_file(repo: str, relpath: str, content: str, message: str) -> None: p.parent.mkdir(parents=True, exist_ok=True) p.write_text(content) subprocess.run(["git", "add", relpath], cwd=repo, check=True, capture_output=True) - subprocess.run( - ["git", "commit", "-m", message], cwd=repo, check=True, capture_output=True - ) + subprocess.run(["git", "commit", "-m", message], cwd=repo, check=True, capture_output=True) # ── Flow 1: Record decisions from a meeting ──────────────────────────── @@ -156,9 +152,7 @@ async def flow_1_record_decisions() -> None: # Read raw signoff to verify state inner = getattr(ctx.ledger, "_inner", ctx.ledger) - raw_rows = await inner._client.query( - f"SELECT signoff FROM {decision_id} LIMIT 1" - ) + raw_rows = await inner._client.query(f"SELECT signoff FROM {decision_id} LIMIT 1") raw_signoff = (raw_rows[0].get("signoff") or {}) if raw_rows else {} signoff_state_post_ingest = raw_signoff.get("state", "?") status_post_ingest = await project_decision_status(inner._client, decision_id) @@ -174,8 +168,7 @@ async def flow_1_record_decisions() -> None: and signoff_state_post_ingest == "proposed" and status_post_ingest == "ungrounded" and signoff_state_post_ratify == "ratified" - and status_post_ratify - == "ungrounded" # still 
ungrounded — bind not yet called + and status_post_ratify == "ungrounded" # still ungrounded — bind not yet called ) body = ( @@ -396,9 +389,7 @@ async def flow_3_commit_to_reflected() -> None: # Out-of-session-committer invariant: status === 'pending' is the state that # drives the dashboard tooltip. Tooltip text in dashboard.html: # "Pending compliance — run /bicameral-sync in your Claude Code session to resolve." - out_of_session_state_correct = ( - status_pending == "pending" and len(pending_checks) >= 1 - ) + out_of_session_state_correct = status_pending == "pending" and len(pending_checks) >= 1 # Caller-LLM resolves the queue (this is what /bicameral-sync does) verdicts = [ @@ -413,17 +404,11 @@ async def flow_3_commit_to_reflected() -> None: for c in pending_checks ] if verdicts: - await handle_resolve_compliance( - ctx, phase="drift", verdicts=verdicts, flow_id=flow_id - ) + await handle_resolve_compliance(ctx, phase="drift", verdicts=verdicts, flow_id=flow_id) status_after = await project_decision_status(inner._client, decision_id) - passed = ( - out_of_session_state_correct - and bool(flow_id) - and status_after == "reflected" - ) + passed = out_of_session_state_correct and bool(flow_id) and status_after == "reflected" body = ( f"Pre-resolve (out-of-session committer state):\n" @@ -464,9 +449,7 @@ async def flow_3a_ephemeral_branch() -> None: check=True, capture_output=True, ) - commit_file( - tmpdir, "feat.py", "def feature():\n return 'branch'\n", "feat: branch impl" - ) + commit_file(tmpdir, "feat.py", "def feature():\n return 'branch'\n", "feat: branch impl") try: ctx = await make_temp_ctx(tmpdir, "sim-flow3a") @@ -546,18 +529,14 @@ async def flow_3a_ephemeral_branch() -> None: } for c in pending_checks ] - await handle_resolve_compliance( - ctx, phase="drift", verdicts=verdicts, flow_id=flow_id - ) + await handle_resolve_compliance(ctx, phase="drift", verdicts=verdicts, flow_id=flow_id) inner = getattr(ctx.ledger, "_inner", ctx.ledger) 
status_on_branch = await project_decision_status(inner._client, did) # Switch back to main — ensure_ledger_synced should fire on next tool call # and the stale repair should mark the decision drifted (since H_main != H_branch). - subprocess.run( - ["git", "checkout", "main"], cwd=tmpdir, check=True, capture_output=True - ) + subprocess.run(["git", "checkout", "main"], cwd=tmpdir, check=True, capture_output=True) # Force fresh sync by invalidating any caches try: from handlers.link_commit import invalidate_sync_cache @@ -650,12 +629,8 @@ async def flow_4_session_end_capture() -> None: decision_id = ingest_r.created_decisions[0].decision_id inner = getattr(ctx.ledger, "_inner", ctx.ledger) - raw_rows = await inner._client.query( - f"SELECT signoff FROM {decision_id} LIMIT 1" - ) - signoff_state = ( - (raw_rows[0].get("signoff") or {}).get("state", "?") if raw_rows else "?" - ) + raw_rows = await inner._client.query(f"SELECT signoff FROM {decision_id} LIMIT 1") + signoff_state = (raw_rows[0].get("signoff") or {}).get("state", "?") if raw_rows else "?" status = await project_decision_status(inner._client, decision_id) # Verify source_type round-trips (history readback is the user-facing surface) @@ -667,9 +642,7 @@ async def flow_4_session_end_capture() -> None: target = next((d for d in all_decisions if d.id == decision_id), None) sources = target.sources if target else [] # HistorySource is a Pydantic model — attribute access, not .get() - source_types = ( - [getattr(s, "source_type", "?") for s in sources] if sources else [] - ) + source_types = [getattr(s, "source_type", "?") for s in sources] if sources else [] source_type_round_trip = source_types[0] if source_types else "?" 
passed = ( @@ -771,9 +744,7 @@ async def flow_5_history_axes() -> None: } all_have_status = all(d.status in valid_status for d in all_decisions) - all_have_signoff = all( - (d.signoff_state in valid_signoff) for d in all_decisions - ) + all_have_signoff = all((d.signoff_state in valid_signoff) for d in all_decisions) feature_count = len(hist.features) # Verify the orthogonalization: the ratified decision should show @@ -786,17 +757,16 @@ async def flow_5_history_axes() -> None: ) passed = ( - feature_count >= 2 - and all_have_status - and all_have_signoff - and ratified_axes_correct + feature_count >= 2 and all_have_status and all_have_signoff and ratified_axes_correct ) body = f"Feature groups: {feature_count}\n\n" for fg in hist.features: body += f" [{fg.name}] — {len(fg.decisions)} decision(s)\n" for d in fg.decisions: - body += f" status={d.status} signoff_state={d.signoff_state} '{d.summary[:50]}'\n" + body += ( + f" status={d.status} signoff_state={d.signoff_state} '{d.summary[:50]}'\n" + ) body += ( f"\nSpec invariant — orthogonal axes:\n" diff --git a/server.py b/server.py index 340ebbe7..a0ceb507 100644 --- a/server.py +++ b/server.py @@ -37,19 +37,19 @@ from mcp.types import TextContent, Tool from context import BicameralContext -from ledger.schema import DestructiveMigrationRequired, SchemaVersionTooNew +from dashboard.server import get_dashboard_server from handlers.bind import handle_bind from handlers.gap_judge import handle_judge_gaps +from handlers.history import handle_history from handlers.ingest import handle_ingest from handlers.link_commit import handle_link_commit from handlers.preflight import handle_preflight -from handlers.reset import handle_reset from handlers.ratify import handle_ratify +from handlers.reset import handle_reset from handlers.resolve_collision import handle_resolve_collision from handlers.resolve_compliance import handle_resolve_compliance -from handlers.history import handle_history from handlers.update import 
get_update_notice, handle_update -from dashboard.server import get_dashboard_server +from ledger.schema import DestructiveMigrationRequired, SchemaVersionTooNew SERVER_NAME = "bicameral-mcp" @@ -72,14 +72,13 @@ def _resolve_server_version() -> str: for candidate in (here, here.parent): toml = candidate / "pyproject.toml" if toml.exists(): - m = re.search( - r'^version\s*=\s*"([^"]+)"', toml.read_text(), re.MULTILINE - ) + m = re.search(r'^version\s*=\s*"([^"]+)"', toml.read_text(), re.MULTILINE) if m: return m.group(1) try: from importlib.metadata import version as _pkg_version + return _pkg_version("bicameral-mcp") except Exception: return "0.1.0" @@ -194,12 +193,30 @@ async def list_tools() -> list[Tool]: "items": { "type": "object", "properties": { - "decision_id": {"type": "string", "description": "Decision ID from the ledger (e.g. from pending_grounding_decisions)"}, - "file_path": {"type": "string", "description": "Repo-relative path to the file"}, - "symbol_name": {"type": "string", "description": "Function/class/method name"}, - "start_line": {"type": "integer", "description": "1-indexed start line (optional — omit to auto-resolve automatically)"}, - "end_line": {"type": "integer", "description": "1-indexed end line (optional)"}, - "purpose": {"type": "string", "description": "Optional one-line description for display"}, + "decision_id": { + "type": "string", + "description": "Decision ID from the ledger (e.g. 
from pending_grounding_decisions)", + }, + "file_path": { + "type": "string", + "description": "Repo-relative path to the file", + }, + "symbol_name": { + "type": "string", + "description": "Function/class/method name", + }, + "start_line": { + "type": "integer", + "description": "1-indexed start line (optional — omit to auto-resolve automatically)", + }, + "end_line": { + "type": "integer", + "description": "1-indexed end line (optional)", + }, + "purpose": { + "type": "string", + "description": "Optional one-line description for display", + }, }, "required": ["decision_id", "file_path", "symbol_name"], }, @@ -794,16 +811,25 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: "t0": time.monotonic(), "rationale": arguments.get("rationale", ""), } - return [TextContent(type="text", text=json.dumps({ - "session_id": session_id, - "skill": arguments["skill_name"], - "status": "started", - }))] + return [ + TextContent( + type="text", + text=json.dumps( + { + "session_id": session_id, + "skill": arguments["skill_name"], + "status": "started", + } + ), + ) + ] if name == "bicameral.skill_end": from pydantic import ValidationError - from telemetry import record_skill_event + from contracts import SKILL_DIAGNOSTIC_MODELS + from telemetry import record_skill_event + session_id = arguments["session_id"] skill_name = arguments["skill_name"] errored = arguments.get("errored", False) @@ -825,8 +851,7 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: diagnostic = validated.model_dump() except ValidationError as exc: unknown_fields = [ - e["loc"][0] for e in exc.errors() - if e["type"] == "extra_forbidden" and e["loc"] + e["loc"][0] for e in exc.errors() if e["type"] == "extra_forbidden" and e["loc"] ] # Strip unknowns and validate the remaining known fields. 
known_raw = {k: v for k, v in raw_diagnostic.items() if k not in unknown_fields} @@ -839,8 +864,14 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: diagnostic = raw_diagnostic or None record_skill_event( - skill_name, session_id, duration_ms, errored, SERVER_VERSION, - diagnostic=diagnostic, error_class=error_class, rationale=rationale, + skill_name, + session_id, + duration_ms, + errored, + SERVER_VERSION, + diagnostic=diagnostic, + error_class=error_class, + rationale=rationale, ) response: dict = { "session_id": session_id, @@ -857,6 +888,7 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: if name == "bicameral.feedback": from telemetry import send_event + send_event( SERVER_VERSION, event_type="agent_feedback", @@ -869,6 +901,7 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: if name == "bicameral.usage_summary": from handlers.usage_summary import handle_usage_summary + data = await handle_usage_summary(ctx, days=int(arguments.get("days", 7))) return [TextContent(type="text", text=json.dumps(data, indent=2))] @@ -878,11 +911,11 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: _sync_result = None if name not in ("bicameral.link_commit", "link_commit", "bicameral.update", "update"): from handlers.sync_middleware import ensure_ledger_synced + _sync_result = await ensure_ledger_synced(ctx) try: if name in ("bicameral.link_commit", "link_commit"): - result = await handle_link_commit( ctx, commit_hash=arguments.get("commit_hash", "HEAD"), @@ -924,10 +957,12 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: # Honest empty path — handler returns None when no matches. # Emit an empty envelope the agent can detect and skip on. 
if result is None: - return [TextContent( - type="text", - text=json.dumps({"judgment_payload": None, "topic": arguments["topic"]}), - )] + return [ + TextContent( + type="text", + text=json.dumps({"judgment_payload": None, "topic": arguments["topic"]}), + ) + ] elif name in ("bicameral.resolve_compliance", "resolve_compliance"): result = await handle_resolve_compliance( ctx, @@ -983,6 +1018,7 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: return [TextContent(type="text", text=json.dumps(payload, indent=2))] elif name in ("bicameral.dashboard", "dashboard"): from contracts import DashboardResponse + srv = get_dashboard_server() if not srv.running: await srv.start(ctx_factory=BicameralContext.from_env) @@ -1059,10 +1095,12 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: if isinstance(exc, DestructiveMigrationRequired) else "upgrade your binary: pipx upgrade bicameral-mcp" ) - return [TextContent( - type="text", - text=json.dumps({"error": str(exc), "action": action}, indent=2), - )] + return [ + TextContent( + type="text", + text=json.dumps({"error": str(exc), "action": action}, indent=2), + ) + ] async def run_smoke_test() -> dict[str, object]: @@ -1110,6 +1148,7 @@ async def serve_stdio() -> None: # below once the session is live. try: from consent import notify_if_first_run + notify_if_first_run() except Exception: pass @@ -1202,18 +1241,22 @@ def _dispatch(args: Any) -> int: """Route parsed args to the appropriate handler. 
Returns exit code.""" if args.command == "config": from setup_wizard import run_config_wizard + return run_config_wizard() if args.command == "reset": from setup_wizard import run_reset_wizard + return run_reset_wizard() if args.command == "setup": from setup_wizard import run_setup + return run_setup(args.repo_path, args.history_path) # triage-adapt: link_commit dispatch — added per #124 backport without # the broader _register_subparsers/_dispatch refactor or the branch-scan # / --with-push-hook prerequisites if args.command == "link_commit": from cli.link_commit_cli import main as link_commit_main + return link_commit_main(args.commit_hash, quiet=args.quiet) if args.smoke_test: result = asyncio.run(run_smoke_test()) diff --git a/setup_wizard.py b/setup_wizard.py index d687efc9..fd1667e4 100644 --- a/setup_wizard.py +++ b/setup_wizard.py @@ -55,7 +55,7 @@ def _detect_history_path(repo_path: Path, hint: str | None = None) -> Path: return repo_path raw = input( - f"\n History storage path (default: same as repo — press Enter to skip):\n > " + "\n History storage path (default: same as repo — press Enter to skip):\n > " ).strip() if not raw: return repo_path @@ -127,6 +127,7 @@ def _detect_agents() -> list[str]: def _is_interactive() -> bool: """Check if stdin is a terminal (not piped).""" import sys + return sys.stdin.isatty() @@ -306,11 +307,17 @@ def _install_for_agent( config_json = json.dumps(config) subprocess.run( ["claude", "mcp", "remove", "bicameral", "--scope", "project"], - capture_output=True, text=True, timeout=10, cwd=str(repo_path), + capture_output=True, + text=True, + timeout=10, + cwd=str(repo_path), ) result = subprocess.run( ["claude", "mcp", "add-json", "bicameral", "--scope", "project", config_json], - capture_output=True, text=True, timeout=10, cwd=str(repo_path), + capture_output=True, + text=True, + timeout=10, + cwd=str(repo_path), ) if result.returncode == 0: print(f" {agent['name']}: installed via CLI") @@ -323,8 +330,15 @@ def 
_install_for_agent( for k, v in config["env"].items(): env_args.extend(["--env", f"{k}={v}"]) result = subprocess.run( - ["codex", "mcp", "add", "bicameral"] + env_args + ["--"] + [config["command"]] + config["args"], - capture_output=True, text=True, timeout=10, cwd=str(repo_path), + ["codex", "mcp", "add", "bicameral"] + + env_args + + ["--"] + + [config["command"]] + + config["args"], + capture_output=True, + text=True, + timeout=10, + cwd=str(repo_path), ) if result.returncode == 0: print(f" {agent['name']}: installed via CLI") @@ -332,44 +346,105 @@ def _install_for_agent( # Fallback: write config file directly if agent.get("config_format") == "toml": - _write_toml_config(repo_path, config_path, data_path=data_path, mode=mode, telemetry=telemetry) + _write_toml_config( + repo_path, config_path, data_path=data_path, mode=mode, telemetry=telemetry + ) else: - _write_json_config(repo_path, config_path, data_path=data_path, mode=mode, telemetry=telemetry) + _write_json_config( + repo_path, config_path, data_path=data_path, mode=mode, telemetry=telemetry + ) print(f" {agent['name']}: wrote {config_path}") return True -_BICAMERAL_SESSION_END_COMMAND = ( - "[ -d .bicameral ] && claude -p '/bicameral:capture-corrections' || true" -) +def _build_session_end_command(mcp_config_path: str | None = None) -> str: + """Build the SessionEnd hook command, optionally with `--mcp-config` flags. + + Production end-users have ``bicameral`` registered in their default + Claude Code MCP config (via the setup wizard's `claude mcp add`), so + the spawned subprocess inherits it without an explicit flag. Test + harnesses that drive ``claude -p`` against a non-default ledger + (e.g. 
``tests/e2e/run_e2e_flows.py`` pointing SURREAL_URL at a + test-results path) must pass ``--mcp-config`` so the spawned + subprocess writes to the same ledger that the parent session and + post-hoc validators use; otherwise capture-corrections lands its + ``source=agent_session`` decisions in ``~/.bicameral/ledger.db`` + instead of the harness's test ledger. + + The no-args call returns the canonical command prescribed by + ``skills/bicameral-capture-corrections/SKILL.md:207`` byte-exact — + that's what end-user installs ship. + """ + import shlex + + extra_flags = "" + if mcp_config_path: + extra_flags = f" --mcp-config {shlex.quote(str(mcp_config_path))} --strict-mcp-config" + return ( + '[ -d .bicameral ] && [ -z "$BICAMERAL_SESSION_END_RUNNING" ] && ' + "BICAMERAL_SESSION_END_RUNNING=1 " + f"claude -p '/bicameral:capture-corrections --auto-ingest'{extra_flags} || true" + ) + + +# Canonical no-args form — what `_install_claude_hooks` writes to a fresh +# end-user's ``.claude/settings.json``. Re-derived from the helper so the +# function is the single source of truth. +_BICAMERAL_SESSION_END_COMMAND = _build_session_end_command() # Fires after every Bash tool use. When the command is a git write-op -# (commit / merge / pull / rebase --continue), prints a trigger line that -# causes the agent to invoke /bicameral:sync — running the full -# link_commit → compliance check flow so status is authoritative immediately. 
-_BICAMERAL_POST_COMMIT_COMMAND = ( - "python3 -c \"" - "import json,sys; " - "d=json.load(sys.stdin); " - "c=d.get('tool_input',{}).get('command',''); " - "ops=('git commit','git merge ','git pull','git rebase --continue'); " - "[print('bicameral: new commit detected — run /bicameral:sync to resolve compliance and get authoritative reflected/drifted status') " - "for _ in [1] if any(op in c for op in ops)]\"" -) +# (commit / merge / pull / rebase --continue), emits a hookSpecificOutput +# envelope whose additionalContext nudges the agent to invoke +# /bicameral:sync — running the full link_commit → compliance check +# flow so status is authoritative immediately. +# +# Was a plain-stdout python -c one-liner. Per Claude Code 2.x hook docs +# (https://code.claude.com/docs/en/hooks), plain stdout from PostToolUse +# is dropped to the debug log — only UserPromptSubmit / UserPromptExpansion +# / SessionStart treat raw stdout as agent-visible context. Symptom: the +# agent committed but never followed through with link_commit because +# the reminder never reached the model. Console script writes the proper +# envelope; source: scripts/hooks/post_commit_sync_reminder.py. +_BICAMERAL_POST_COMMIT_COMMAND = "bicameral-mcp-post-commit-sync-reminder" + +# UserPromptSubmit hook: deterministic regex over a verb list elevates +# bicameral.preflight above the agent's default tool-selection priority +# whenever a prompt indicates code-implementation intent. Console script +# is exposed via pyproject.toml [project.scripts] so it resolves on PATH +# regardless of cwd. Closes #146 for end-user installs (the dogfood path +# in the bicameral repo's own .claude/settings.json invokes the source +# file directly via python3). 
+_BICAMERAL_PREFLIGHT_REMINDER_COMMAND = "bicameral-mcp-preflight-reminder" + +# PostToolUse hook scoped to the bicameral.preflight tool: when preflight +# surfaces ≥1 decision, prints a system-reminder templating the +# correction-capture loop (Step 5.6 of bicameral-preflight) so the agent +# reliably calls bicameral.ingest(source=agent_session) + +# bicameral.resolve_collision when the user's prompt contradicts a +# surfaced decision. Closes #154 for end-user installs (the dogfood path +# invokes the source file directly via python3). +_BICAMERAL_COLLISION_CAPTURE_REMINDER_COMMAND = "bicameral-mcp-collision-capture-reminder" +_BICAMERAL_PREFLIGHT_TOOL_NAME = "mcp__bicameral__bicameral_preflight" def _install_claude_hooks(repo_path: Path) -> bool: """Merge bicameral hooks into the project-level .claude/settings.json. - Installs two hooks: + Installs four hooks: - PostToolUse/Bash: reminds the agent to call link_commit immediately after git write-ops (commit / merge / pull / rebase --continue). + - PostToolUse/bicameral_preflight: reminds the agent to capture + refinements via ingest(agent_session) + resolve_collision when + preflight surfaces decisions that the user's prompt contradicts. - SessionEnd: runs bicameral-capture-corrections to catch uningested mid-session corrections (only fires when .bicameral/ exists). + - UserPromptSubmit: deterministic verb-list classifier injects a + <system-reminder> elevating bicameral.preflight above the agent's + default tool-selection priority on code-implementation prompts. Idempotent — safe to call on every setup run. Returns True if any new - entry was written, False if both were already present. + entry was written, False if all four were already present. 
""" settings_path = repo_path / ".claude" / "settings.json" settings_path.parent.mkdir(parents=True, exist_ok=True) @@ -386,9 +461,7 @@ def _install_claude_hooks(repo_path: Path) -> bool: # ── PostToolUse / Bash — git write-op reminder ─────────────────── post_tool_use: list = hooks.setdefault("PostToolUse", []) - bash_entry = next( - (e for e in post_tool_use if e.get("matcher") == "Bash"), None - ) + bash_entry = next((e for e in post_tool_use if e.get("matcher") == "Bash"), None) if bash_entry is None: bash_entry = {"matcher": "Bash", "hooks": []} post_tool_use.append(bash_entry) @@ -400,11 +473,35 @@ def _install_claude_hooks(repo_path: Path) -> bool: bash_entry["hooks"] = non_bic + [new_post_hook] wrote_anything = True + # ── PostToolUse / bicameral_preflight — collision capture reminder ─ + preflight_entry = next( + (e for e in post_tool_use if e.get("matcher") == _BICAMERAL_PREFLIGHT_TOOL_NAME), + None, + ) + if preflight_entry is None: + preflight_entry = {"matcher": _BICAMERAL_PREFLIGHT_TOOL_NAME, "hooks": []} + post_tool_use.append(preflight_entry) + old_pre_hooks = preflight_entry.get("hooks", []) + non_bic_pre = [ + h + for h in old_pre_hooks + if "bicameral" not in h.get("command", "") + and "post_preflight_capture_reminder" not in h.get("command", "") + ] + new_pre_hook = { + "type": "command", + "command": _BICAMERAL_COLLISION_CAPTURE_REMINDER_COMMAND, + } + if non_bic_pre != old_pre_hooks or new_pre_hook not in old_pre_hooks: + preflight_entry["hooks"] = non_bic_pre + [new_pre_hook] + wrote_anything = True + # ── SessionEnd — capture uningested corrections ────────────────── session_end: list = hooks.setdefault("SessionEnd", []) # Remove any stale bicameral SessionEnd entries, then write current. 
non_bic_se = [ - e for e in session_end + e + for e in session_end if not any("bicameral" in h.get("command", "") for h in e.get("hooks", [])) ] new_se_entry = {"hooks": [{"type": "command", "command": _BICAMERAL_SESSION_END_COMMAND}]} @@ -412,6 +509,23 @@ def _install_claude_hooks(repo_path: Path) -> bool: hooks["SessionEnd"] = non_bic_se + [new_se_entry] wrote_anything = True + # ── UserPromptSubmit — preflight auto-fire reinforcement ───────── + user_prompt_submit: list = hooks.setdefault("UserPromptSubmit", []) + non_bic_ups = [ + e + for e in user_prompt_submit + if not any( + "bicameral" in h.get("command", "") or "preflight_reminder" in h.get("command", "") + for h in e.get("hooks", []) + ) + ] + new_ups_entry = { + "hooks": [{"type": "command", "command": _BICAMERAL_PREFLIGHT_REMINDER_COMMAND}] + } + if non_bic_ups != user_prompt_submit or new_ups_entry not in user_prompt_submit: + hooks["UserPromptSubmit"] = non_bic_ups + [new_ups_entry] + wrote_anything = True + if wrote_anything: settings_path.write_text(json.dumps(existing, indent=2) + "\n") return wrote_anything @@ -495,7 +609,9 @@ def _select_collaboration_mode() -> str: result = questionary.select( "Collaboration mode:", choices=[ - questionary.Choice("Team — decisions shared via git (append-only event files)", value="team"), + questionary.Choice( + "Team — decisions shared via git (append-only event files)", value="team" + ), questionary.Choice("Solo — decisions stored locally", value="solo"), ], default="team", @@ -559,7 +675,9 @@ def _select_telemetry() -> bool: result = questionary.select( "Enable anonymous telemetry?", choices=[ - questionary.Choice("Yes — share anonymous usage stats to improve Bicameral", value=True), + questionary.Choice( + "Yes — share anonymous usage stats to improve Bicameral", value=True + ), questionary.Choice("No — keep telemetry off", value=False), ], default=True, @@ -690,7 +808,7 @@ def run_setup(repo_hint: str | None = None, history_hint: str | None = None) -> # Step 
3: Runner check command, _ = _detect_runner() if command not in ("bicameral-mcp",): - print(f"\n Note: bicameral-mcp binary not found on PATH.") + print("\n Note: bicameral-mcp binary not found on PATH.") print(f" Using '{command} -m bicameral_mcp' as runner.") print(" Install for a cleaner setup: pip install bicameral-mcp") @@ -708,7 +826,9 @@ def run_setup(repo_hint: str | None = None, history_hint: str | None = None) -> # Step 5: Install MCP config for each agent print() for agent_key in agents: - _install_for_agent(agent_key, repo_path, data_path=data_path, mode=collab_mode, telemetry=telemetry) + _install_for_agent( + agent_key, repo_path, data_path=data_path, mode=collab_mode, telemetry=telemetry + ) # Step 6: Install skills + hooks (Claude Code only) if "claude" in agents: @@ -716,12 +836,16 @@ def run_setup(repo_hint: str | None = None, history_hint: str | None = None) -> if num_skills: print(f" Claude Code: installed {num_skills} slash commands") if _install_claude_hooks(repo_path): - print(" Claude Code: installed hooks → link_commit on commit · capture-corrections on session end") + print( + " Claude Code: installed hooks → link_commit on commit · capture-corrections on session end" + ) # Step 7: Git post-commit hook (Guided mode only) if guided: if _install_git_post_commit_hook(repo_path): - print(" Git: installed post-commit hook → bicameral-mcp link_commit HEAD after every commit") + print( + " Git: installed post-commit hook → bicameral-mcp link_commit HEAD after every commit" + ) else: print(" Git: post-commit hook already present — skipped") @@ -759,6 +883,7 @@ def run_config_wizard() -> int: """ import subprocess import sys + try: import yaml except ImportError: @@ -818,7 +943,9 @@ def run_config_wizard() -> int: ) result = subprocess.run( [sys.executable, "-c", script], - capture_output=True, text=True, timeout=30, + capture_output=True, + text=True, + timeout=30, ) skills_n = int(result.stdout.strip() or "0") if result.returncode == 0 else 0 @@ 
-842,10 +969,13 @@ def _print_change(label: str, old, new) -> None: def _select_collaboration_mode_with_default(current: str) -> str: import questionary + if not _is_interactive(): return current choices = [ - questionary.Choice("Team — decisions shared via git (append-only event files)", value="team"), + questionary.Choice( + "Team — decisions shared via git (append-only event files)", value="team" + ), questionary.Choice("Solo — decisions stored locally", value="solo"), ] result = questionary.select( @@ -858,6 +988,7 @@ def _select_collaboration_mode_with_default(current: str) -> str: def _select_guided_mode_with_default(current: bool) -> bool: import questionary + if not _is_interactive(): return current choices = [ @@ -874,6 +1005,7 @@ def _select_guided_mode_with_default(current: bool) -> bool: def _select_telemetry_with_default(current: bool) -> bool: import questionary + if not _is_interactive(): return current choices = [ @@ -895,6 +1027,7 @@ def run_reset_wizard() -> int: then asks for explicit confirmation before wiping. """ import asyncio + import questionary print() @@ -924,6 +1057,7 @@ def run_reset_wizard() -> int: # Step 2: dry-run import os + from context import BicameralContext from handlers.reset import handle_reset diff --git a/skills/bicameral-preflight/SKILL.md b/skills/bicameral-preflight/SKILL.md index 17282cb2..6be030df 100644 --- a/skills/bicameral-preflight/SKILL.md +++ b/skills/bicameral-preflight/SKILL.md @@ -59,6 +59,22 @@ If uncertain whether the user will write code, **fire anyway** — the handler is gated on actionable signal and will stay silent if nothing relevant is found. The cost of a false fire is one silent no-op. +### Hook reinforcement + +The trigger described above is reinforced by a `UserPromptSubmit` hook +configured in [`.claude/settings.json`](../../.claude/settings.json). 
+The hook reads the user prompt, runs a deterministic regex over the +canonical verb list at +[`scripts/hooks/preflight_intent.py`](../../scripts/hooks/preflight_intent.py), +and — on match — injects a `<system-reminder>` block elevating +`bicameral.preflight` above the agent's default tool-selection priority. + +For v0 the verb list is duplicated by intent: the SKILL.md +`description` field above embeds the list as a string literal so +Claude Code skill discovery can read it, while the Python module is +the canonical source for the hook. Both must be edited together to +evolve the trigger surface; future configurability will deduplicate. + ## Telemetry > **Guard**: Only call `skill_begin` and `skill_end` if telemetry is enabled. Telemetry is enabled by default; disabled by setting `BICAMERAL_TELEMETRY=0` (or `false`/`off`/`no`). If disabled, skip both calls and omit all `diagnostic` tracking. @@ -123,10 +139,18 @@ case proceed directly to step 2. ### 2. Call `bicameral.preflight` for region-anchored and HITL state +**Discover first, then preflight.** Before this call, use Read / Grep / Glob to +resolve the user's request to concrete file paths. The user often names a +*feature* ("the reorder feature", "the rate limiter") rather than a *file*; the +caller LLM is responsible for that mapping — the server does deterministic +retrieval, not semantic guessing. A topic-only call falls back to fuzzy text +similarity over decision descriptions; passing `file_paths` engages the +high-precision `binds_to` graph lookup. + ``` bicameral.preflight( topic="<the 1-line topic>", - file_paths=["<repo-relative path>", ...], # include if you've scoped the files + file_paths=["<repo-relative path>", ...], # discovered in step 1 ) ``` @@ -144,8 +168,25 @@ those into your in-scope set. The response also carries an optional `sync_metrics` field — skip rendering it. If `response.product_stage` is non-null, surface it verbatim to the user as a brief note (shown once per device only). 
-**Omit `file_paths`** if you haven't scoped the files yet (early "how should I -approach X?" queries). The handler still runs sync and HITL checks. +**`file_paths` may be omitted only** for genuinely abstract queries with no +file referent yet (e.g. *"how should I approach building a retry helper?"* — +no existing files to point at). For implementation prompts that name or imply +a feature backed by existing code, populate `file_paths` from your discovery. +The handler still runs sync and HITL checks either way; passing `file_paths` +just unlocks the precision channel. + +The server expands caller-supplied `file_paths` by 1 hop along the +code-locator graph's **import edges** (file-level structural +dependency), so a decision bound to `app/src/lib/git/reorder.ts` still +surfaces when the caller passes the structurally-near +`app/src/ui/multi-commit-operation/reorder.tsx` (because the latter +imports the former). You should still pass concrete paths discovered +in step 1 — the expansion lifts the recall ceiling on near-misses, it +doesn't replace caller-side discovery. Decisions reached only via the +expansion carry `confidence=0.7` in the response (vs `0.9` for direct +pins), and `sources_chained` includes `"graph"` (alongside `"region"`) +when expansion contributed at least one hit. Caller can de-prioritize +expanded matches without losing them. ### 2.5 Resolve pending compliance checks if present @@ -311,6 +352,105 @@ A one-line forward narration helps: > from idempotency.ts. I'll flag the event.id deduplication question > for you to answer before I commit." +### 5.6 Capture refinements — ask the user, then act mechanically + +When preflight surfaced ≥1 decision and the user's request operates on or +near the same feature surface, **do not judge contradiction yourself.** +LLM contradiction detection has been observed to silently miss +structural-mismatch refinements (e.g. 
user asks for a "programmatic API +to reorder commits" while a prior decision describes "drag-to-reorder +UI" — the conflict is real but not lexical, and the agent rationalizes +"these can coexist"). Per #175, the judgment moves to the user. + +#### 5.6.1 Disambiguate via `AskUserQuestion` + +Fires whenever `response.fired == True` and `len(response.decisions) >= 1`, +regardless of guided mode (capture is the headline product behavior, not +opt-in). Ask once per surfaced decision the user's request plausibly +touches; skip for surfaced decisions that are clearly unrelated to the +prompt domain. + +```python +AskUserQuestion({ + "question": ( + "Your request appears to operate on the same feature surface as " + "surfaced decision <decision_id> ('<one-line description>'). " + "Treat this work as a refinement of that prior plan?" + ), + "multiSelect": False, + "options": [ + { + "label": "Yes — supersede prior plan", + "description": "<paraphrase user's direction; replaces the prior decision wholesale>", + }, + { + "label": "Yes — keep both (addition or scoping)", + "description": "<paraphrase; adds to or narrows the prior decision; both remain>", + }, + { + "label": "No — unrelated to prior plan", + "description": "Continue without capture", + }, + ], +}) +``` + +#### 5.6.2 Mechanical capture (after user disambiguation) + +Based on the user's selection, branch: + +- **"supersede"** → execute the two-call capture below with `action="supersede"`. +- **"keep both"** → execute the two-call capture below with `action="keep_both"`. +- **"unrelated"** → skip capture; proceed to implementation. Narrate one + line ("noted — surfaced context isn't applicable here") and move on. + +For the two "yes" branches: + +1. **Ingest the refinement** with `source=agent_session`, scoped to the + same `feature_group` as the surfaced decision: + +``` +bicameral.ingest(payload={ + "query": "<surfaced decision's topic>", + "source": "agent_session", + "title": "<short label, e.g. 
'reorder-programmatic-api'>", + "date": "<today ISO date>", + "decisions": [{ "description": "<user's direction, stated as a decision>" }] +}, feature_group="<same feature group as the surfaced decision>") +``` + +2. **Wire it to the seeded decision** via `bicameral.resolve_collision`: + +``` +bicameral.resolve_collision( + new_id="<just-ingested refinement id>", + old_id="<surfaced decision id>", + action="supersede" | "keep_both" | "link_parent" +) +``` + +`link_parent` is also available (selectable at the `AskUserQuestion` +step if the surfaced decision is an L1 parent and the user's direction +is an L2 child) — wires `parent_decision_id`, no supersede edge, no +status change. + +The user has answered the disambiguation question, so capture is +mechanical from this point. PM ratifies in the inbox. + +Narrate one line: *"Captured refinement: '<paraphrase>' — wired as +<action> of <feature> roadmap entry."* + +#### Hook reinforcement + +A PostToolUse hook scoped to `mcp__bicameral__bicameral_preflight` injects a +`<system-reminder>` after every preflight call that surfaces ≥1 decision. The +reminder templates Step 5.6.1's `AskUserQuestion` shape with the surfaced +`decision_id` + description filled in, so the question fires reliably even +when the agent's natural inclination would be to skip the disambiguation. +Source: `scripts/hooks/post_preflight_capture_reminder.py`; wired by +`setup_wizard._install_claude_hooks` and the e2e harness's +`materialize_settings_with_hooks`. + ### 6. Honor blocking hints (guided mode vs normal mode) The agent's `guided_mode` setting controls whether action hints are diff --git a/telemetry.py b/telemetry.py index 9c291fac..efe2d72c 100644 --- a/telemetry.py +++ b/telemetry.py @@ -42,7 +42,6 @@ import json import logging -import os import threading import uuid from pathlib import Path @@ -60,6 +59,7 @@ def _is_enabled() -> bool: the env-var override (BICAMERAL_TELEMETRY=0) continues to work. 
""" from consent import telemetry_allowed + return telemetry_allowed() @@ -83,6 +83,7 @@ def _send_bg(payload: dict) -> None: """POST to the relay in a daemon thread. Never raises.""" try: import urllib.request + data = json.dumps(payload).encode() req = urllib.request.Request( _RELAY_URL, @@ -98,7 +99,9 @@ def _send_bg(payload: dict) -> None: logger.debug("[telemetry] relay POST failed (non-fatal): %s", exc) -def send_event(version: str, diagnostic: dict | None = None, **properties: str | int | float | bool) -> None: +def send_event( + version: str, diagnostic: dict | None = None, **properties: str | int | float | bool +) -> None: """Send a telemetry event. Fire-and-forget. Never raises. The relay only requires `distinct_id` and `version` — all other kwargs are @@ -118,6 +121,7 @@ def send_event(version: str, diagnostic: dict | None = None, **properties: str | # Privacy-preserving: only the skill/tool name + 1 are written, no payload. try: from local_counters import increment as _local_increment + skill_name = properties.get("skill") or properties.get("tool") if isinstance(skill_name, str): _local_increment(skill_name) diff --git a/tests/_extract_headless.py b/tests/_extract_headless.py index cc28b5e2..27dc2e8f 100644 --- a/tests/_extract_headless.py +++ b/tests/_extract_headless.py @@ -20,6 +20,7 @@ them with "OAuth authentication is currently not supported" (401). Standard API keys (sk-ant-api03...) authenticate via x-api-key. 
""" + from __future__ import annotations import hashlib @@ -148,7 +149,7 @@ def _extract_step1_excerpt(skill_md: str) -> str: next_header = _STEP_HEADER_RE.search(body, step1_match.end()) end = next_header.start() if next_header else len(body) - return body[step1_match.start():end].strip() + return body[step1_match.start() : end].strip() def _cache_path(skill_sha: str, transcript_sha: str, model: str) -> Path: diff --git a/tests/_extraction_matcher.py b/tests/_extraction_matcher.py index 027407cc..94ed7426 100644 --- a/tests/_extraction_matcher.py +++ b/tests/_extraction_matcher.py @@ -27,6 +27,7 @@ - Offline tests use the rapidfuzz fallback in _extraction_metrics.py by passing matcher="rapidfuzz" explicitly, so no network is needed. """ + from __future__ import annotations import hashlib @@ -280,9 +281,7 @@ def llm_match( "Set the env var, or pass matcher='rapidfuzz' explicitly." ) - tool_input = _call_matcher_api( - actual, expected, model=chosen_model, api_key=chosen_key - ) + tool_input = _call_matcher_api(actual, expected, model=chosen_model, api_key=chosen_key) pairs = _parse_matches(tool_input, n_actual=len(actual), n_expected=len(expected)) if use_cache: diff --git a/tests/_extraction_metrics.py b/tests/_extraction_metrics.py index de4346ab..8b1e4be6 100644 --- a/tests/_extraction_metrics.py +++ b/tests/_extraction_metrics.py @@ -38,6 +38,7 @@ fixture-less transcripts don't break CI before the ground-truth set is bootstrapped. """ + from __future__ import annotations import json @@ -67,9 +68,7 @@ def _descs(items: list[dict]) -> list[str]: return [str(d.get("description", "")).strip() for d in items if d.get("description")] -def _rapidfuzz_match( - actual: list[str], expected: list[str] -) -> list[tuple[int, int | None]]: +def _rapidfuzz_match(actual: list[str], expected: list[str]) -> list[tuple[int, int | None]]: """Rapidfuzz 1:1 matching. Returns (actual_idx, expected_idx | None) pairs. 
For each actual in order, pick the best remaining expected by @@ -143,6 +142,7 @@ def compute_extraction_metrics( # Import inside the function so offline tests that force # matcher="rapidfuzz" don't drag in httpx / network code. from _extraction_matcher import llm_match # type: ignore[import-not-found] + pairs = llm_match(actual, expected) elif chosen == "rapidfuzz": pairs = _rapidfuzz_match(actual, expected) diff --git a/tests/bench_drift.py b/tests/bench_drift.py index e56477fc..6e03cba3 100644 --- a/tests/bench_drift.py +++ b/tests/bench_drift.py @@ -108,7 +108,9 @@ async def _collect_real_symbols(adapter, repo_path: Path, n_files_target: int) - files: list[Path] = [] for d in seed_dirs: if d.exists(): - files.extend(sorted(p for p in d.rglob("*.py") if p.is_file() and "__pycache__" not in p.parts)) + files.extend( + sorted(p for p in d.rglob("*.py") if p.is_file() and "__pycache__" not in p.parts) + ) collected: list[dict] = [] seen_pairs: set[str] = set() @@ -129,11 +131,13 @@ async def _collect_real_symbols(adapter, repo_path: Path, n_files_target: int) - if key in seen_pairs: continue seen_pairs.add(key) - collected.append({ - "file_path": rel, - "symbol_name": sym, - "line_number": line, - }) + collected.append( + { + "file_path": rel, + "symbol_name": sym, + "line_number": line, + } + ) return collected @@ -146,26 +150,30 @@ def _build_payload(symbols: list[dict], batch_idx: int, batch_size: int) -> dict mappings = [] for i in range(batch_size): sym = symbols[(batch_idx * batch_size + i) % len(symbols)] - mappings.append({ - "span": { - "span_id": f"bench-{batch_idx}-{i}", - "source_type": "transcript", - "text": f"Bench decision {batch_idx}-{i} about {sym['symbol_name']}", - "speaker": "bench", - "source_ref": f"bench-meeting-{batch_idx}", - }, - "intent": f"Bench decision {batch_idx}-{i}: maintain {sym['symbol_name']} in {sym['file_path']}", - "symbols": [sym["symbol_name"]], - "code_regions": [{ - "file_path": sym["file_path"], - "symbol": 
sym["symbol_name"], - "type": "function", - "start_line": sym["line_number"], - "end_line": sym["line_number"] + 20, - "purpose": f"bench batch {batch_idx} item {i}", - }], - "dependency_edges": [], - }) + mappings.append( + { + "span": { + "span_id": f"bench-{batch_idx}-{i}", + "source_type": "transcript", + "text": f"Bench decision {batch_idx}-{i} about {sym['symbol_name']}", + "speaker": "bench", + "source_ref": f"bench-meeting-{batch_idx}", + }, + "intent": f"Bench decision {batch_idx}-{i}: maintain {sym['symbol_name']} in {sym['file_path']}", + "symbols": [sym["symbol_name"]], + "code_regions": [ + { + "file_path": sym["file_path"], + "symbol": sym["symbol_name"], + "type": "function", + "start_line": sym["line_number"], + "end_line": sym["line_number"] + 20, + "purpose": f"bench batch {batch_idx} item {i}", + } + ], + "dependency_edges": [], + } + ) return { "query": f"bench batch {batch_idx}", "repo": ".", @@ -189,12 +197,16 @@ async def _run_bench(ctx) -> None: adapter = get_code_locator() # --- Setup: collect real symbols, ingest 100 decisions in batches of 10 --- - symbols = await _collect_real_symbols(adapter, Path(ctx.repo_path), n_files_target=N_FILES_TARGET) + symbols = await _collect_real_symbols( + adapter, Path(ctx.repo_path), n_files_target=N_FILES_TARGET + ) assert len(symbols) >= 25, f"Only got {len(symbols)} symbols; need >= 25 for realistic bench" batch_size = 10 n_batches = N_DECISIONS // batch_size - print(f"\n[bench] Ingesting {N_DECISIONS} decisions across {len(symbols)} unique symbols ({n_batches} batches of {batch_size})") + print( + f"\n[bench] Ingesting {N_DECISIONS} decisions across {len(symbols)} unique symbols ({n_batches} batches of {batch_size})" + ) setup_start = time.perf_counter() for b in range(n_batches): @@ -262,11 +274,15 @@ async def _run_bench(ctx) -> None: print("DRIFT BENCHMARK BASELINE — V1 A1") print("=" * 68) print(f"Setup: {N_DECISIONS} decisions, {len(symbols)} symbols, {len(file_paths)} files") - print(f"Setup 
ingest: {setup_elapsed:.2f}s total ({setup_elapsed/N_DECISIONS*1000:.1f}ms / decision)") + print( + f"Setup ingest: {setup_elapsed:.2f}s total ({setup_elapsed / N_DECISIONS * 1000:.1f}ms / decision)" + ) print() print(f"{'handler':<25} {'p50 (ms)':>10} {'p95 (ms)':>10} {'max (ms)':>10} {'n':>5}") print("-" * 68) for name, p in report["handlers"].items(): - print(f"{name:<25} {p['p50']*1000:>10.1f} {p['p95']*1000:>10.1f} {p['max']*1000:>10.1f} {p['n']:>5}") + print( + f"{name:<25} {p['p50'] * 1000:>10.1f} {p['p95'] * 1000:>10.1f} {p['max'] * 1000:>10.1f} {p['n']:>5}" + ) print("=" * 68) print(f"Artifact: {out_path}") diff --git a/tests/conftest.py b/tests/conftest.py index 46856c4f..4042b11f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,10 +26,7 @@ def _isolate_consent_state(tmp_path_factory): third-party fixture plugin. """ home = tmp_path_factory.mktemp("bicameral_home") - saved = { - k: os.environ.get(k) - for k in ("HOME", "USERPROFILE", "BICAMERAL_SKIP_CONSENT_NOTICE") - } + saved = {k: os.environ.get(k) for k in ("HOME", "USERPROFILE", "BICAMERAL_SKIP_CONSENT_NOTICE")} os.environ["HOME"] = str(home) os.environ["USERPROFILE"] = str(home) os.environ["BICAMERAL_SKIP_CONSENT_NOTICE"] = "1" @@ -48,7 +45,9 @@ def pytest_configure(config): config.addinivalue_line("markers", "phase2: requires SurrealDBLedgerAdapter + SurrealDB") config.addinivalue_line("markers", "phase3: full E2E — requires both Phase 1 + Phase 2") config.addinivalue_line("markers", "alpha_flow: Jacob North Star regression suite — v0.7 gate") - config.addinivalue_line("markers", "bench: drift benchmark harness (V1 A1) — skipped by default, run with -m bench") + config.addinivalue_line( + "markers", "bench: drift benchmark harness (V1 A1) — skipped by default, run with -m bench" + ) @pytest.fixture(autouse=True) @@ -69,6 +68,7 @@ def _default_authoritative_ref_to_current_branch(monkeypatch): the start of the test, which unsets this default for that test only. 
""" import subprocess + try: result = subprocess.run( ["git", "rev-parse", "--abbrev-ref", "HEAD"], @@ -84,11 +84,12 @@ def _default_authoritative_ref_to_current_branch(monkeypatch): monkeypatch.setenv("BICAMERAL_AUTHORITATIVE_REF", current_branch) - @pytest.fixture def repo_path() -> str: """Repo root. Defaults to the MCP repo itself for Phase 1+ tests.""" - return os.getenv("REPO_PATH", str(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))) + return os.getenv( + "REPO_PATH", str(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + ) @pytest.fixture @@ -100,6 +101,7 @@ def surreal_url() -> str: def ctx(): """Build a BicameralContext from current env (SURREAL_URL, REPO_PATH).""" from context import BicameralContext + return BicameralContext.from_env() diff --git a/tests/e2e/README.md b/tests/e2e/README.md new file mode 100644 index 00000000..204fd333 --- /dev/null +++ b/tests/e2e/README.md @@ -0,0 +1,104 @@ +# v0 user flow e2e + +End-to-end validation of `BicameralAI/bicameral#108`'s six canonical user +flows, driven by **real Claude Code CLI sessions** with `bicameral-mcp` +registered as an MCP server. Test fixture: a pinned commit of +`github.com/desktop/desktop`, with `docs/process/roadmap.md` as ingest +content. + +This is the canonical CI test for the spec. The handler-replay simulation +at `scripts/sim_issue_108_flows.py` complements it for fast local iteration +on handler logic without burning Claude API calls. 
+ +## What it tests + +Each flow corresponds to a section of [bicameral#108 spec](https://github.com/BicameralAI/bicameral/issues/108): + +| Flow | Spec section | Asserts | +|---|---|---| +| 1 | Record decisions from a meeting | `bicameral.ingest` called with mappings | +| 2 | Begin to write code (preflight) | `bicameral.preflight` called with `file_paths` | +| 3 | Commit code → reflected | `bicameral.link_commit` + `bicameral.resolve_compliance` (with verdicts) | +| 4 | End coding session | `bicameral.ingest` called with `source="agent_session"` | +| 5 | Review what's been tracked | `bicameral.history` called (with seed ingest + ratify) | + +Each flow is a separate `claude -p` invocation with a fresh `memory://` +ledger. Within a session, prompts may chain multiple tool calls — the +asserter walks the entire stream-json transcript. + +## How it works + +``` +prompts/flow-N-*.md → claude -p → stream-json transcript → assert + │ + ├─ --mcp-config bicameral.mcp.json (registers bicameral-mcp) + ├─ --strict-mcp-config (no other MCP servers loaded) + ├─ --allowed-tools mcp__bicameral Read Grep + ├─ --add-dir <desktop_clone> (skill Read access) + └─ --output-format stream-json --verbose +``` + +`run_e2e_flows.py` orchestrates all five flows, captures transcripts to +`test-results/e2e/flow-N.ndjson`, and asserts on the tool-use blocks. + +## Running locally + +```bash +# 1. Install bicameral-mcp + Claude Code CLI +cd pilot/mcp +pip install -e ".[test]" +npm install -g @anthropic-ai/claude-code + +# 2. Authenticate Claude Code CLI (interactive — once) +claude auth + +# 3. Clone the test fixture +git clone --depth=1 https://github.com/desktop/desktop /tmp/desktop-clone +cd /tmp/desktop-clone && git checkout -b main && cd - + +# 4. Run all five flows +DESKTOP_REPO_PATH=/tmp/desktop-clone python tests/e2e/run_e2e_flows.py +``` + +Cost per run: ~$0.50–$2.00 across all five flows depending on how much the +LLM exercises in each session. 
Each run is bounded by `--max-budget-usd 2.0` +per flow. + +## CI + +GitHub Actions workflow: `.github/workflows/v0-user-flow-e2e.yml`. + +- Triggers on PRs touching `tests/e2e/**`, `handlers/**`, `ledger/**`, + `contracts.py`, `skills/bicameral-*/**`, or the workflow itself. +- Runs in the `production` GitHub environment for `CLAUDE_CODE_OAUTH_TOKEN`. +- Pinned `desktop/desktop` commit in the workflow file (update by editing + the env var). +- Uploads `test-results/e2e/*.ndjson` as job artifacts (30-day retention) + for failure forensics. + +## Updating + +When the spec changes, update both: + +1. The relevant `prompts/flow-N-*.md` (natural-language user prompt) +2. The matching `assert_flow_N` in `run_e2e_flows.py` + +When `desktop/desktop`'s `roadmap.md` or `cherry-pick.ts` shape drifts in +ways that break the prompts or bind targets, bump the pinned commit in +the workflow + adjust prompts. + +## Why not handler-replay only? + +The handler-replay sim (`scripts/sim_issue_108_flows.py`) directly imports +handler functions and calls them. It's fast and useful for iterating on +handler logic, but it bypasses three layers we need to validate: + +- **MCP protocol** — JSON-RPC over stdio, tool schema marshalling +- **Skill files** — `.claude/skills/bicameral-*/SKILL.md` parsing, trigger + matching, prompt construction +- **Caller LLM** — natural-language → tool-call sequencing, auto-chains + (preflight → capture-corrections → context-sentry → ingest → judge_gaps) + +This e2e suite covers all three. Together they form the spec's two-level +validation: handler invariants (replay sim) + user-experience contract +(this directory). diff --git a/tests/e2e/_harness_setup.py b/tests/e2e/_harness_setup.py new file mode 100644 index 00000000..036358f8 --- /dev/null +++ b/tests/e2e/_harness_setup.py @@ -0,0 +1,246 @@ +"""Shared test-harness setup helpers. 
+ +Used by: + - tests/e2e/run_e2e_flows.py (headless ``claude -p`` assertion test) + - tests/e2e/record_demo_interactive.sh (interactive tmux-driven recording) + +Both code paths must produce IDENTICAL artifacts (materialized MCP config, +materialized claude settings with hooks, bootstrapped ``.bicameral/``) so the +agent sees the same hook substrate and same MCP config regardless of which +entry point invoked it. This module is the single source of truth for that +materialization — no inline duplication in either consumer. + +A CLI entry point exists so shell scripts can invoke the same logic as the +Python harness without re-implementing it inline. See ``__main__``. +""" + +from __future__ import annotations + +import argparse +import json +import pathlib +import shutil +import subprocess +import sys + + +def materialize_mcp_config( + template: pathlib.Path, + out_dir: pathlib.Path, + desktop_repo_path: str, + ledger_dir: pathlib.Path, +) -> pathlib.Path: + """Read the MCP config template, substitute env-var placeholders, write + a runtime copy to ``<out_dir>/bicameral.mcp.materialized.json``. + + The template uses ``${DESKTOP_REPO_PATH}`` and ``${LEDGER_DIR}`` so the + same template works locally (any clone path) and in CI (the workflow's + clone path). Claude Code's MCP spawn behaviour for env replacement vs + merge is implementation-defined; passing REPO_PATH explicitly via the + config avoids that ambiguity. + """ + raw = template.read_text(encoding="utf-8") + materialized = raw.replace("${DESKTOP_REPO_PATH}", desktop_repo_path).replace( + "${LEDGER_DIR}", str(ledger_dir) + ) + out = out_dir / "bicameral.mcp.materialized.json" + out.write_text(materialized, encoding="utf-8") + return out + + +def materialize_settings_with_hooks( + out_dir: pathlib.Path, + mcp_config_path: pathlib.Path, + mcp_root: pathlib.Path, +) -> pathlib.Path: + """Write a project-style ``settings.json`` carrying the four hooks + bicameral's setup-wizard installs in real projects. 
The PostToolUse and + UserPromptSubmit commands are byte-exact strings imported from + ``setup_wizard`` — single source of truth, no drift. + + The SessionEnd command is built via ``setup_wizard._build_session_end_command`` + with ``mcp_config_path`` set. Production end-users have ``bicameral`` + registered in their default Claude Code MCP config so the spawned + subprocess inherits it without an explicit flag; test harnesses + override ``SURREAL_URL`` via the materialized MCP config to point at + a test-results ledger, so we MUST pass that config explicitly to the + subprocess or its ``capture-corrections`` writes land in the user's + default ledger and post-hoc validators find zero rows. + + Hooks installed: + - PostToolUse/Bash: bicameral-sync listens for "new commit detected" + output to auto-fire ``link_commit``. + - PostToolUse/bicameral_preflight: collision-capture reminder fires + when preflight surfaces ≥1 decision, templating the Step 5.6 + ingest(agent_session) + resolve_collision call so the agent + captures user refinements that contradict surfaced decisions. + - SessionEnd: spawns a subprocess running + ``/bicameral:capture-corrections --auto-ingest`` (with the test + MCP config) to scan the just-ended session for uningested + mid-session corrections. + - UserPromptSubmit: deterministic verb-list classifier injects a + <system-reminder> elevating bicameral.preflight above the agent's + default tool-selection priority on code-implementation prompts. 
+ """ + if str(mcp_root) not in sys.path: + sys.path.insert(0, str(mcp_root)) + from setup_wizard import ( # noqa: E402 + _BICAMERAL_COLLISION_CAPTURE_REMINDER_COMMAND, + _BICAMERAL_POST_COMMIT_COMMAND, + _BICAMERAL_PREFLIGHT_REMINDER_COMMAND, + _BICAMERAL_PREFLIGHT_TOOL_NAME, + _build_session_end_command, + ) + + session_end_command = _build_session_end_command(mcp_config_path=str(mcp_config_path)) + + settings = { + "hooks": { + "PostToolUse": [ + { + "matcher": "Bash", + "hooks": [{"type": "command", "command": _BICAMERAL_POST_COMMIT_COMMAND}], + }, + { + "matcher": _BICAMERAL_PREFLIGHT_TOOL_NAME, + "hooks": [ + { + "type": "command", + "command": _BICAMERAL_COLLISION_CAPTURE_REMINDER_COMMAND, + } + ], + }, + ], + "SessionEnd": [ + { + "hooks": [{"type": "command", "command": session_end_command}], + } + ], + "UserPromptSubmit": [ + { + "hooks": [ + {"type": "command", "command": _BICAMERAL_PREFLIGHT_REMINDER_COMMAND} + ], + } + ], + } + } + out = out_dir / "claude-settings-with-hook.json" + out.write_text(json.dumps(settings, indent=2), encoding="utf-8") + return out + + +def clean_ledger(ledger_dir: pathlib.Path) -> None: + """Wipe the persistent ledger between harness runs. + + State must persist across the 5 sequential claude sessions within a run + (so the PM in flow 5 sees decisions from flows 1/2/4), but must NOT leak + across runs (so each run is reproducible and CI is deterministic). + """ + if ledger_dir.exists(): + shutil.rmtree(ledger_dir, ignore_errors=True) + + +def reset_desktop_repo(desktop_repo_path: str) -> None: + """Reset desktop-clone to its pinned HEAD between runs. Flow 3 makes a + real commit; without a reset, the second-onwards run starts from a + polluted base. 
+ """ + repo = pathlib.Path(desktop_repo_path) + if not (repo / ".git").exists(): + return + for args in (("git", "reset", "--hard", "FETCH_HEAD"), ("git", "reset", "--hard", "HEAD")): + try: + subprocess.run(args, cwd=repo, check=True, capture_output=True, timeout=20) + return + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + continue + + +def bootstrap_bicameral_dir(desktop_repo_path: str, mcp_root: pathlib.Path) -> None: + """Create a minimal ``.bicameral/`` inside ``desktop_repo_path`` so the + SessionEnd hook's ``[ -d .bicameral ]`` guard passes when the parent + claude session exits. Without this, the hook short-circuits silently + and Flow 4's path-X-(b) ledger validation has nothing to observe. + + Reuses ``setup_wizard._write_collaboration_config`` to write the same + minimal ``config.yaml`` (mode=solo, guided=false, telemetry=false) a + fresh end-user install would produce — single source of truth. + + Wiped + recreated each run so flows do not inherit cross-run state. + """ + if str(mcp_root) not in sys.path: + sys.path.insert(0, str(mcp_root)) + from setup_wizard import _write_collaboration_config # noqa: E402 + + bicameral_dir = pathlib.Path(desktop_repo_path) / ".bicameral" + if bicameral_dir.exists(): + shutil.rmtree(bicameral_dir, ignore_errors=True) + _write_collaboration_config( + data_path=pathlib.Path(desktop_repo_path), + mode="solo", + guided=False, + telemetry=False, + ) + + +def setup_all( + desktop_repo_path: str, + results_dir: pathlib.Path, + mcp_config_template: pathlib.Path, + mcp_root: pathlib.Path, + clean: bool = True, +) -> dict[str, pathlib.Path]: + """Run every setup step in the canonical order. Returns the resulting + artifact paths so consumers can wire them through to the agent invocation. + + When ``clean=True`` (default), wipes the ledger and resets the desktop + repo first. The harness uses this; the recording script uses it too — + state must persist across flows within a run, but not across runs. 
+ """ + results_dir.mkdir(parents=True, exist_ok=True) + ledger_dir = results_dir / "ledger.db" + if clean: + clean_ledger(ledger_dir) + reset_desktop_repo(desktop_repo_path) + bootstrap_bicameral_dir(desktop_repo_path, mcp_root) + mcp_config_path = materialize_mcp_config( + mcp_config_template, results_dir, desktop_repo_path, ledger_dir + ) + settings_path = materialize_settings_with_hooks(results_dir, mcp_config_path, mcp_root) + return {"mcp_config": mcp_config_path, "settings": settings_path, "ledger": ledger_dir} + + +def main() -> int: + """CLI entrypoint for shell consumers (record_demo_interactive.sh). + + Prints the resulting artifact paths as ``<key>\\t<path>`` lines on + stdout so the shell can parse them with ``awk`` or ``cut`` if it + needs to thread them through to subsequent commands. + """ + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--desktop-repo-path", required=True) + p.add_argument("--results-dir", required=True) + p.add_argument("--mcp-config-template", required=True) + p.add_argument("--mcp-root", required=True) + p.add_argument( + "--no-clean", + action="store_true", + help="skip ledger wipe + desktop-clone reset (default: wipe + reset)", + ) + args = p.parse_args() + + paths = setup_all( + desktop_repo_path=args.desktop_repo_path, + results_dir=pathlib.Path(args.results_dir), + mcp_config_template=pathlib.Path(args.mcp_config_template), + mcp_root=pathlib.Path(args.mcp_root), + clean=not args.no_clean, + ) + for key, path in paths.items(): + print(f"{key}\t{path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/e2e/_ledger_helpers.py b/tests/e2e/_ledger_helpers.py new file mode 100644 index 00000000..8d4be1bf --- /dev/null +++ b/tests/e2e/_ledger_helpers.py @@ -0,0 +1,24 @@ +"""Pure helpers for ledger-based flow validation. + +Extracted from run_e2e_flows.py so unit tests can import without +triggering the harness's top-level env-var / CLI-presence guards. 
+""" + +from __future__ import annotations + + +def count_agent_session_decisions(snapshot: dict) -> int | None: + """Count decisions with source_type='agent_session' in a ledger snapshot. + + Returns None if the snapshot reports an error (caller treats as + INCONCLUSIVE, not FAIL — the assertion is unreliable when the ledger + isn't queryable). Returns 0 when there are no agent_session rows. The + 'agent_session' source_type is the canonical tag written by both + in-session capture-corrections (path-A) and the SessionEnd subprocess + (path-B); this helper does not discriminate between them, only counts + the product-outcome signal. + """ + if "error" in snapshot: + return None + decisions = snapshot.get("decisions") or [] + return sum(1 for d in decisions if d.get("source_type") == "agent_session") diff --git a/tests/e2e/bicameral.mcp.json b/tests/e2e/bicameral.mcp.json new file mode 100644 index 00000000..e08b1508 --- /dev/null +++ b/tests/e2e/bicameral.mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "bicameral": { + "command": "bicameral-mcp", + "args": [], + "env": { + "SURREAL_URL": "surrealkv://${LEDGER_DIR}", + "REPO_PATH": "${DESKTOP_REPO_PATH}" + } + } + } +} diff --git a/tests/e2e/demo_renderer.py b/tests/e2e/demo_renderer.py new file mode 100755 index 00000000..18936668 --- /dev/null +++ b/tests/e2e/demo_renderer.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# Pretty-print Claude Code stream-json to xterm and detect scene boundaries. +# +# Reads stream-json from stdin (one JSON object per line). 
Writes: +# - human-readable output to stdout (visible in the recorded xterm) +# - raw stream-json to $DEMO_TRANSCRIPT +# - scene-boundary timestamps to $DEMO_SCENES_FILE +# +# Scene boundaries (option a — tool-call ordering, no LLM-emitted sentinels): +# t1 (Scene 1 → Scene 2): first mcp__bicameral__bicameral_preflight call +# t2 (Scene 2 → Scene 3): first mcp__bicameral__bicameral_history call +# AFTER any mcp__bicameral__bicameral_link_commit call + +from __future__ import annotations + +import json +import os +import sys +import time +from pathlib import Path + +TRANSCRIPT = Path(os.environ.get("DEMO_TRANSCRIPT", "/tmp/demo-transcript.ndjson")) +SCENES_FILE = Path(os.environ.get("DEMO_SCENES_FILE", "/tmp/demo-scenes.txt")) + + +def _record_scene(name: str) -> None: + with SCENES_FILE.open("a") as f: + f.write(f"{name}={time.time():.3f}\n") + + +def _tool_bare(name: str) -> str: + return name.split("__")[-1] if "__" in name else name + + +def _input_summary(payload: dict) -> str: + if not isinstance(payload, dict) or not payload: + return "" + parts: list[str] = [] + for k, v in list(payload.items())[:3]: + s = str(v) + if len(s) > 60: + s = s[:57] + "..." 
+ parts.append(f"{k}={s}") + return " ".join(parts) + + +def _flush(line: str = "") -> None: + sys.stdout.write(line + "\n") + sys.stdout.flush() + + +def main() -> int: + SCENES_FILE.write_text("") + TRANSCRIPT.write_text("") + _record_scene("recording_start") + + saw_link_commit = False + saw_preflight = False + saw_post_history = False + + raw = TRANSCRIPT.open("a") + + for line in sys.stdin: + if not line.strip(): + continue + + raw.write(line) + raw.flush() + + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + + t = obj.get("type") + + if t == "system" and obj.get("subtype") == "init": + _flush(f"[demo] session started — model={obj.get('model', '?')}") + continue + + if t == "assistant": + msg = obj.get("message") or {} + for block in msg.get("content") or []: + btype = block.get("type") + if btype == "text": + text = block.get("text", "").rstrip() + if text: + _flush() + _flush(text) + elif btype == "tool_use": + name = block.get("name", "") + bare = _tool_bare(name) + summary = _input_summary(block.get("input") or {}) + _flush(f"\n ▸ tool: {bare} {summary}".rstrip()) + + if not saw_preflight and name.endswith("bicameral_preflight"): + saw_preflight = True + _record_scene("scene_1_to_2") + if name.endswith("bicameral_link_commit"): + saw_link_commit = True + if ( + not saw_post_history + and saw_link_commit + and name.endswith("bicameral_history") + ): + saw_post_history = True + _record_scene("scene_2_to_3") + continue + + if t == "user": + msg = obj.get("message") or {} + for block in msg.get("content") or []: + if isinstance(block, dict) and block.get("type") == "tool_result": + content = block.get("content") or "" + if isinstance(content, list): + content = "".join( + part.get("text", "") if isinstance(part, dict) else str(part) + for part in content + ) + snippet = str(content).replace("\n", " ") + if len(snippet) > 220: + snippet = snippet[:217] + "..." 
+ _flush(f" ◂ result: {snippet}") + continue + + if t == "result": + duration = obj.get("duration_ms", "?") + cost = obj.get("total_cost_usd", "?") + _flush(f"\n[demo] session complete — duration={duration}ms cost=${cost}") + + _record_scene("recording_end") + raw.close() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/e2e/prompts/composite-demo.md b/tests/e2e/prompts/composite-demo.md new file mode 100644 index 00000000..55d38d05 --- /dev/null +++ b/tests/e2e/prompts/composite-demo.md @@ -0,0 +1,85 @@ +# Composite v0 user-flow demo (single session, three scenes) + +This is a continuous demo session that will be split in post into a "PM +view" video (pm.mp4) and a "Dev view" video (dev.mp4). Walk through +the three scenes below in order. Do not skip steps. Do not abbreviate. + +Before you begin: call `bicameral.dashboard` so the dashboard sidecar +binds and the right pane of the recording has live ledger updates to +show. + +--- + +## SCENE 1 — Post-meeting (PM persona) + +You are the PM. The team just reviewed the GitHub Desktop roadmap. +Ingest the following decisions into the ledger via `bicameral.ingest`: + +1. **High signal notifications (versions 2.9.10 and 3.0.0)** — Receive + a notification when checks fail. Receive a notification when your + pull request is reviewed. +2. **Improved commit history (version 2.9.0)** — Reorder commits via + drag/drop. Squash commits via drag/drop. Amend last commit. Create + a branch from a previous commit. +3. **Cherry-picking commits from one branch to another (version 2.7.1)** + — Cherry-pick commits with a context menu and interactively. Bind + this decision to `app/src/lib/git/cherry-pick.ts` (specifically the + `CherryPickResult` enum near the top of the file). + +Source: `desktop/desktop:docs/process/roadmap.md`. + +After `bicameral.ingest` returns, ratify the decisions you just +ingested via `bicameral.ratify`. 
Briefly confirm what landed (decision +IDs and signoff state) so the viewer understands the ledger now has +proposed-then-ratified entries. + +--- + +## SCENE 2 — Implementation (Dev persona) + +You are now the dev. Walk through the implementation arc end-to-end: + +1. Call `bicameral.preflight` on `app/src/lib/git/cherry-pick.ts` to + surface relevant decisions before editing. Read the response — it + should remind you about the cherry-pick decision from Scene 1. + +2. Use the `Edit` tool to add a single-line comment near the top of + `app/src/lib/git/cherry-pick.ts` referencing the cherry-pick + roadmap decision (e.g., + `// Cherry-pick: roadmap v2.7.1 — context menu + interactive`). + Keep it minimal and non-disruptive. + +3. Stage and commit the change with `Bash`: + - `git add app/src/lib/git/cherry-pick.ts` + - `git commit -m "demo: annotate CherryPickResult with roadmap decision"` + +4. Call `bicameral.link_commit` on `HEAD` to detect drift against any + decisions bound to that file. + +5. For each pending compliance check that `link_commit` surfaces, call + `bicameral.resolve_compliance` with a verdict + (compliant / drifted / not_relevant). Use the file's content as + evidence. + +6. If any non-trivial decisions emerged mid-session (corrections, + constraint clarifications), capture them with `bicameral.ingest` + using `source=agent_session`. + +--- + +## SCENE 3 — Post-implementation (PM persona) + +You are the PM again. The dev just landed their changes. Show how +the ledger evolved: + +1. Call `bicameral.history`. The cherry-pick decision should now show + `status=reflected` (or `compliant`) where it was previously + pending or ungrounded. + +2. Render a brief markdown table grouped by feature area, showing each + decision's two axes — code-compliance status and human signoff + state — so the viewer can scan it. + +3. 
Ratify the post-implementation state of the cherry-pick decision + via `bicameral.ratify` to acknowledge that what shipped matches + what was decided. diff --git a/tests/e2e/prompts/flow-1-ingest.md b/tests/e2e/prompts/flow-1-ingest.md new file mode 100644 index 00000000..1d517167 --- /dev/null +++ b/tests/e2e/prompts/flow-1-ingest.md @@ -0,0 +1,9 @@ +Just out of roadmap review. Three things we agreed to track: + +- High-signal notifications (2.9.10 / 3.0.0): notify on failed checks, notify on PR review. +- Improved commit history (2.9.0): drag-to-reorder, drag-to-squash, amend last commit, branch from a previous commit. +- Cherry-pick between branches (2.7.1): context-menu and an interactive variant. + +Source: desktop/desktop:docs/process/roadmap.md. + +Already aligned with the team — please log these and sign them off on our end. If any have an obvious code home, bind them too so we can catch drift later. diff --git a/tests/e2e/prompts/flow-2-preflight.md b/tests/e2e/prompts/flow-2-preflight.md new file mode 100644 index 00000000..052957fd --- /dev/null +++ b/tests/e2e/prompts/flow-2-preflight.md @@ -0,0 +1,3 @@ +Add a programmatic API for reordering commits — it takes an ordered list of commit SHAs and rewrites the branch history to match that order. Wire it so any UI surface can call it with a sorted list and apply the new order. + +I'll handle the call-site cleanup separately. diff --git a/tests/e2e/prompts/flow-3-commit-sync.md b/tests/e2e/prompts/flow-3-commit-sync.md new file mode 100644 index 00000000..742bc981 --- /dev/null +++ b/tests/e2e/prompts/flow-3-commit-sync.md @@ -0,0 +1,3 @@ +Need a quick docs commit. Drop a one-line comment above the CherryPickResult enum in cherry-pick.ts pointing back to the roadmap — something like `// Cherry-pick: roadmap v2.7.1 — context menu + interactive`. + +Stage and commit it as `docs: annotate cherry-pick origin`. 
diff --git a/tests/e2e/prompts/flow-4-session-end.md b/tests/e2e/prompts/flow-4-session-end.md new file mode 100644 index 00000000..8bee57e8 --- /dev/null +++ b/tests/e2e/prompts/flow-4-session-end.md @@ -0,0 +1,5 @@ +hmm wait — small thing before we keep going on reorder. + +just realized: the cherry-pick conflict handler shouldn't ever fall back to a stdin prompt. visual conflict UI is the only resolution path, full stop. if it drifts toward a terminal prompt that's a rollback. + +ok back to reorder.ts — keep going on the `reorder()` function for the text-editor flow. diff --git a/tests/e2e/prompts/flow-5-history.md b/tests/e2e/prompts/flow-5-history.md new file mode 100644 index 00000000..1f08b9e3 --- /dev/null +++ b/tests/e2e/prompts/flow-5-history.md @@ -0,0 +1,3 @@ +Doing a Friday review across all the things we're tracking. Walk me through them grouped by feature — for each one, where it stands on the implementation side and whether it's been signed off. + +Anything still on the to-do pile that hasn't moved — flag those, give me a one-sentence read on each, and pick whichever one looks most ready (clear scope, supporting context, no open questions) and sign it off. Then show me the updated view. diff --git a/tests/e2e/record_demo.sh b/tests/e2e/record_demo.sh new file mode 100755 index 00000000..a01a5281 --- /dev/null +++ b/tests/e2e/record_demo.sh @@ -0,0 +1,288 @@ +#!/usr/bin/env bash +# Record a single continuous split-screen demo session of the v0 user flow, +# then post-split the recording into pm.mp4 (PM persona) and dev.mp4 +# (Dev persona). pm.mp4 has a transition slide between the +# pre-implementation and post-implementation chapters. 
+# +# Layout (1920x1080): +# ┌──────────────────────────┬──────────────────────────┐ +# │ xterm │ chromium │ +# │ claude -p <composite> │ http://localhost:<port> │ +# │ (one continuous session │ bicameral dashboard │ +# │ spanning all 3 scenes) │ (live SSE updates) │ +# └──────────────────────────┴──────────────────────────┘ +# +# Single claude session = single MCP process = single in-memory ledger. +# That's what makes Scene 3 (PM post-impl) authentically reflect Scene 2's +# (Dev) commits — the dashboard SSE keeps state across the whole arc. +# +# This script runs only in the GitHub workflow's optional manual-dispatch +# path (`record_demo=true`). It is `continue-on-error` at the workflow +# level — a flake here never gates merge. + +set -euo pipefail + +# ── Config ────────────────────────────────────────────────────────────── +DISPLAY_NUM=99 +RES_W=1920 +RES_H=1080 +HALF_W=$((RES_W / 2)) +RES="${RES_W}x${RES_H}" +FRAMERATE=10 +TRANSITION_DURATION=4 + +E2E_DIR="$(cd "$(dirname "$0")" && pwd)" +MCP_DIR="$(cd "$E2E_DIR/../.." && pwd)" +OUT_DIR="$MCP_DIR/docs/demos/v0-userflow-e2e" +RESULTS_DIR="$MCP_DIR/test-results/e2e" +MCP_CONFIG_TEMPLATE="$E2E_DIR/bicameral.mcp.json" +MCP_CONFIG_MATERIALIZED="$RESULTS_DIR/bicameral.mcp.materialized.json" +PORT_FILE="$HOME/.bicameral/dashboard.port" +COMPOSITE_PROMPT_FILE="$E2E_DIR/prompts/composite-demo.md" +DEMO_RENDERER="$E2E_DIR/demo_renderer.py" + +DESKTOP_REPO_PATH="${DESKTOP_REPO_PATH:-/tmp/desktop-clone}" + +mkdir -p "$OUT_DIR" "$RESULTS_DIR" "$(dirname "$PORT_FILE")" + +if [ ! -d "$DESKTOP_REPO_PATH" ]; then + echo "ERROR: DESKTOP_REPO_PATH=$DESKTOP_REPO_PATH does not exist." >&2 + exit 2 +fi + +for bin in Xvfb fluxbox xterm ffmpeg claude bicameral-mcp python3; do + if ! command -v "$bin" >/dev/null 2>&1; then + echo "ERROR: required binary '$bin' not found on PATH." >&2 + exit 2 + fi +done + +# Pick whichever chromium-compatible browser is available. 
GitHub's +# ubuntu-latest runners ship google-chrome-stable; Linux desktops often +# have chromium via snap. All four accept the same Chromium-style flags. +CHROME_BIN="$(command -v google-chrome-stable \ + || command -v google-chrome \ + || command -v chromium \ + || command -v chromium-browser \ + || true)" +if [ -z "$CHROME_BIN" ]; then + echo "ERROR: no chromium-compatible browser found on PATH." >&2 + echo " tried: google-chrome-stable, google-chrome, chromium, chromium-browser" >&2 + exit 2 +fi +echo "[demo] using browser: $CHROME_BIN" + +# ── Materialize MCP config (mirrors run_e2e_flows.py) ─────────────────── +sed "s|\${DESKTOP_REPO_PATH}|$DESKTOP_REPO_PATH|g" \ + "$MCP_CONFIG_TEMPLATE" > "$MCP_CONFIG_MATERIALIZED" + +# Reset port file so the chromium poll only sees this run's value. +rm -f "$PORT_FILE" + +# ── Start Xvfb + minimal WM ───────────────────────────────────────────── +Xvfb ":${DISPLAY_NUM}" -screen 0 "${RES}x24" -nolisten tcp >/tmp/xvfb.log 2>&1 & +XVFB_PID=$! +export DISPLAY=":${DISPLAY_NUM}" +sleep 1 + +fluxbox >/tmp/fluxbox.log 2>&1 & +FLUXBOX_PID=$! +sleep 1 + +cleanup() { + set +e + kill "$FLUXBOX_PID" "$XVFB_PID" 2>/dev/null + wait 2>/dev/null +} +trap cleanup EXIT + +# ── Recording paths ───────────────────────────────────────────────────── +FULL_MP4="$OUT_DIR/full.mp4" +TRANSCRIPT="$RESULTS_DIR/composite-demo-transcript.ndjson" +SCENES_FILE="$RESULTS_DIR/composite-demo-scenes.txt" + +export DEMO_TRANSCRIPT="$TRANSCRIPT" +export DEMO_SCENES_FILE="$SCENES_FILE" + +PROMPT_BODY="$(cat "$COMPOSITE_PROMPT_FILE")" + +# ── Start ffmpeg recording ────────────────────────────────────────────── +T0=$(date +%s.%N) +ffmpeg -y -f x11grab -video_size "$RES" -framerate "$FRAMERATE" \ + -i ":${DISPLAY_NUM}" \ + -c:v libx264 -preset ultrafast -pix_fmt yuv420p \ + "$FULL_MP4" >/tmp/ffmpeg-record.log 2>&1 & +FFMPEG_PID=$! 
+sleep 1 + +# ── Build claude command piped through the demo renderer ──────────────── +# stream-json gives us the tool-use timeline for scene detection; +# demo_renderer.py pretty-prints it back to readable text in the xterm. +# Bash is allowed for `git add`/`git commit` (per composite-demo.md); +# Edit is allowed so claude can modify cherry-pick.ts live. +CLAUDE_CMD=( + claude -p "$PROMPT_BODY" + --mcp-config "$MCP_CONFIG_MATERIALIZED" + --strict-mcp-config + --allowed-tools "mcp__bicameral,Read,Grep,Edit,Bash" + --add-dir "$DESKTOP_REPO_PATH" + --output-format stream-json + --verbose + --no-session-persistence + --max-budget-usd 5.0 + --dangerously-skip-permissions +) + +CLAUDE_LINE="" +for arg in "${CLAUDE_CMD[@]}"; do + CLAUDE_LINE+=$(printf ' %q' "$arg") +done + +# ── Launch xterm running claude → renderer ────────────────────────────── +( + cd "$DESKTOP_REPO_PATH" # so claude's Bash git commands run against the fixture repo + xterm -geometry 100x40+0+0 -fa Monospace -fs 11 \ + -bg black -fg white -title "claude — composite demo (3 scenes)" \ + -e bash -lc "${CLAUDE_LINE# } | python3 ${DEMO_RENDERER}; echo; echo '[demo] all scenes complete — recording wraps in 4s'; sleep 4" \ + >/tmp/xterm-composite.log 2>&1 & + echo $! > /tmp/xterm-composite.pid +) +XTERM_PID=$(cat /tmp/xterm-composite.pid) + +# ── Poll for dashboard.port (up to 60s) and launch chromium ───────────── +PORT="" +for _ in $(seq 1 60); do + if [ -f "$PORT_FILE" ]; then + PORT="$(tr -d '[:space:]' < "$PORT_FILE" || true)" + [ -n "$PORT" ] && break + fi + sleep 1 +done + +CHROMIUM_PID="" +if [ -n "$PORT" ]; then + "$CHROME_BIN" --no-sandbox --disable-gpu \ + --window-size="${HALF_W},${RES_H}" \ + --window-position="${HALF_W},0" \ + --user-data-dir="/tmp/chromium-composite" \ + --no-first-run --no-default-browser-check \ + --new-window "http://localhost:${PORT}" \ + >/tmp/chromium-composite.log 2>&1 & + CHROMIUM_PID=$! 
+else + echo " warning: dashboard port never appeared; recording xterm-only" >&2 +fi + +# ── Wait for claude to finish (cap 25 min) ────────────────────────────── +COMPOSITE_TIMEOUT=1500 +WAITED=0 +while kill -0 "$XTERM_PID" 2>/dev/null; do + sleep 2 + WAITED=$((WAITED + 2)) + if [ "$WAITED" -ge "$COMPOSITE_TIMEOUT" ]; then + echo " warning: composite demo exceeded ${COMPOSITE_TIMEOUT}s — killing xterm" >&2 + kill "$XTERM_PID" 2>/dev/null || true + break + fi +done + +# Brief pause so dashboard SSE settles into its final state on the right. +sleep 4 + +# ── Stop ffmpeg cleanly so the moov atom is flushed ───────────────────── +kill -INT "$FFMPEG_PID" 2>/dev/null || true +wait "$FFMPEG_PID" 2>/dev/null || true + +if [ -n "$CHROMIUM_PID" ]; then + kill "$CHROMIUM_PID" 2>/dev/null || true + wait "$CHROMIUM_PID" 2>/dev/null || true +fi + +if [ ! -s "$FULL_MP4" ]; then + echo "ERROR: $FULL_MP4 missing or empty — nothing to split" >&2 + exit 1 +fi + +echo "=== full.mp4 written ($(stat -c%s "$FULL_MP4" 2>/dev/null || stat -f%z "$FULL_MP4") bytes) ===" +echo "=== Scene markers ===" +cat "$SCENES_FILE" 2>/dev/null || echo "(no scenes file)" + +# ── Extract scene boundaries (epoch → seconds-from-T0) ────────────────── +to_offset() { + python3 - "$T0" "$1" <<'PY' +import sys +t0 = float(sys.argv[1]) +t = float(sys.argv[2]) +print(f"{max(0.0, t - t0):.3f}") +PY +} + +SCENE_1_TO_2_EPOCH="$(grep '^scene_1_to_2=' "$SCENES_FILE" 2>/dev/null | tail -1 | cut -d= -f2 || true)" +SCENE_2_TO_3_EPOCH="$(grep '^scene_2_to_3=' "$SCENES_FILE" 2>/dev/null | tail -1 | cut -d= -f2 || true)" + +# ── Fallback path: if scene markers are missing, keep full.mp4 as the +# only artifact — pm/dev split is impossible without timestamps. 
──────── +if [ -z "$SCENE_1_TO_2_EPOCH" ] || [ -z "$SCENE_2_TO_3_EPOCH" ]; then + echo "WARNING: scene boundary markers missing — emitting full.mp4 only" >&2 + echo " (pm.mp4 / dev.mp4 will not be generated)" + ls -la "$OUT_DIR" + exit 0 +fi + +T1="$(to_offset "$SCENE_1_TO_2_EPOCH")" +T2="$(to_offset "$SCENE_2_TO_3_EPOCH")" +echo "Scene boundaries (s from T0): t1=$T1 t2=$T2" + +# ── Trim full.mp4 into three pieces (re-encoded for frame-accurate cuts) ─ +PM_PRE="$RESULTS_DIR/pm-pre.mp4" +DEV_OUT="$OUT_DIR/dev.mp4" +PM_POST="$RESULTS_DIR/pm-post.mp4" + +# Common encoder flags so all pieces share codec/format for safe concat. +ENC_FLAGS=( + -c:v libx264 -preset ultrafast -pix_fmt yuv420p + -r "$FRAMERATE" + -an +) + +ffmpeg -y -i "$FULL_MP4" -ss 0 -to "$T1" "${ENC_FLAGS[@]}" "$PM_PRE" \ + >>/tmp/ffmpeg-split.log 2>&1 +ffmpeg -y -i "$FULL_MP4" -ss "$T1" -to "$T2" "${ENC_FLAGS[@]}" "$DEV_OUT" \ + >>/tmp/ffmpeg-split.log 2>&1 +ffmpeg -y -i "$FULL_MP4" -ss "$T2" "${ENC_FLAGS[@]}" "$PM_POST" \ + >>/tmp/ffmpeg-split.log 2>&1 + +# ── Generate transition slide between PM-pre and PM-post ──────────────── +TRANSITION="$RESULTS_DIR/transition.mp4" +FONT_BOLD="/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" +FONT_REG="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf" + +# Three lines centered on a deep navy background. Font sizes chosen for +# 1920x1080 readability; colors match a darkmode-dashboard palette so +# the transition feels of-a-piece with the rest of the demo. 
+ffmpeg -y \ + -f lavfi -i "color=c=#0a0e27:s=${RES_W}x${RES_H}:d=${TRANSITION_DURATION}:r=${FRAMERATE}" \ + -vf "drawtext=fontfile='${FONT_BOLD}':text='— Pre-implementation complete —':fontsize=58:fontcolor=#8aa0c8:x=(w-text_w)/2:y=(h-text_h)/2-180, + drawtext=fontfile='${FONT_BOLD}':text='Dev now implements the change':fontsize=78:fontcolor=#ffffff:x=(w-text_w)/2:y=(h-text_h)/2-60, + drawtext=fontfile='${FONT_REG}':text='(see dev.mp4 — preflight, edit, commit, link_commit, resolve_compliance)':fontsize=30:fontcolor=#8aa0c8:x=(w-text_w)/2:y=(h-text_h)/2+40, + drawtext=fontfile='${FONT_BOLD}':text='Returning to PM after the implementation has landed':fontsize=46:fontcolor=#ffd76a:x=(w-text_w)/2:y=(h-text_h)/2+160" \ + "${ENC_FLAGS[@]}" -t "$TRANSITION_DURATION" "$TRANSITION" \ + >>/tmp/ffmpeg-transition.log 2>&1 + +# ── Concat pm.mp4 = pm-pre + transition + pm-post ─────────────────────── +PM_CONCAT_LIST="$RESULTS_DIR/pm-concat.txt" +{ + echo "file '$PM_PRE'" + echo "file '$TRANSITION'" + echo "file '$PM_POST'" +} > "$PM_CONCAT_LIST" + +PM_OUT="$OUT_DIR/pm.mp4" +ffmpeg -y -f concat -safe 0 -i "$PM_CONCAT_LIST" \ + "${ENC_FLAGS[@]}" "$PM_OUT" >>/tmp/ffmpeg-concat.log 2>&1 + +# Clean up the scratch trims; keep full.mp4 + dev.mp4 + pm.mp4 in OUT_DIR. +rm -f "$PM_PRE" "$PM_POST" "$TRANSITION" "$PM_CONCAT_LIST" + +echo "=== Demo recording + split complete ===" +ls -la "$OUT_DIR" diff --git a/tests/e2e/record_demo_interactive.sh b/tests/e2e/record_demo_interactive.sh new file mode 100755 index 00000000..b6f95109 --- /dev/null +++ b/tests/e2e/record_demo_interactive.sh @@ -0,0 +1,637 @@ +#!/usr/bin/env bash +# Interactive demo recording — tmux-driven real claude TUI, per-scene sessions. +# +# Implementation of thoughts/shared/plans/2026-05-01-interactive-recording-spec.md. +# Replaces the headless `claude -p` + demo_renderer.py path with five real +# interactive Claude Code sessions (one per flow), driven by `tmux send-keys` / +# bracketed paste. 
State carries across scenes via the shared surrealkv ledger +# (matching run_e2e_flows.py's persistence contract). +# +# Layout (1920x1080): +# ┌──────────────────────────┬──────────────────────────┐ +# │ xterm │ chromium │ +# │ attached to tmux pane │ http://localhost:<port> │ +# │ running interactive │ bicameral dashboard │ +# │ claude TUI │ (live SSE updates) │ +# └──────────────────────────┴──────────────────────────┘ +# +# Output (in $OUT_DIR): +# - full-int.mp4 — raw continuous capture of all 5 scenes (no transition) +# - scene-1.mp4 … scene-5.mp4 — per-scene splits +# - pm.mp4 — scene-1 + transition slide + scene-5 +# - dev.mp4 — scene-2 + scene-3 + scene-4 +# +# Legacy `record_demo.sh` is intentionally retained as a fallback path; the +# workflow's `recording` job has `continue-on-error: true`, so a flake here +# leaves the assertion artifacts intact. +# +# Prereqs (Linux runner): Xvfb, fluxbox, xterm, ffmpeg, tmux, claude CLI, +# bicameral-mcp, python3, chromium-compatible browser, DejaVu fonts. + +set -euo pipefail + +# ── Config ────────────────────────────────────────────────────────────── +DISPLAY_NUM=99 +RES_W=1920 +RES_H=1080 +HALF_W=$((RES_W / 2)) +RES="${RES_W}x${RES_H}" +FRAMERATE=10 +TRANSITION_DURATION=4 + +# Per-scene polling caps (see spec §6.1, §6.3, §6.4). +READY_TIMEOUT=90 # claude TUI must show input box within this — longer + # because fresh-runner state walks 5+ onboarding dialogs +IDLE_MAX_WAIT=300 # 5 min cap per scene for agent finish +IDLE_STABLE_FOR=8 # input box must persist for N consecutive samples +SESSION_DEAD_GRACE=60 # post-/exit grace for SessionEnd hook to run +PORT_POLL_TIMEOUT=45 # post-paste wait for dashboard.port to appear + +E2E_DIR="$(cd "$(dirname "$0")" && pwd)" +MCP_DIR="$(cd "$E2E_DIR/../.." 
&& pwd)" +OUT_DIR="$MCP_DIR/docs/demos/v0-userflow-e2e" +RESULTS_DIR="$MCP_DIR/test-results/e2e" +LEDGER_DIR="$RESULTS_DIR/ledger.db" +MCP_CONFIG_TEMPLATE="$E2E_DIR/bicameral.mcp.json" +MCP_CONFIG_MATERIALIZED="$RESULTS_DIR/bicameral.mcp.materialized.json" +PROMPTS_DIR="$E2E_DIR/prompts" +PORT_FILE="$HOME/.bicameral/dashboard.port" + +DESKTOP_REPO_PATH="${DESKTOP_REPO_PATH:-/tmp/desktop-clone}" + +mkdir -p "$OUT_DIR" "$RESULTS_DIR" "$(dirname "$PORT_FILE")" + +if [ ! -d "$DESKTOP_REPO_PATH" ]; then + echo "ERROR: DESKTOP_REPO_PATH=$DESKTOP_REPO_PATH does not exist." >&2 + exit 2 +fi + +for bin in Xvfb fluxbox xterm ffmpeg claude bicameral-mcp python3 tmux; do + if ! command -v "$bin" >/dev/null 2>&1; then + echo "ERROR: required binary '$bin' not found on PATH." >&2 + exit 2 + fi +done + +CHROME_BIN="$(command -v google-chrome-stable \ + || command -v google-chrome \ + || command -v chromium \ + || command -v chromium-browser \ + || true)" +if [ -z "$CHROME_BIN" ]; then + echo "ERROR: no chromium-compatible browser found on PATH." >&2 + exit 2 +fi +echo "[demo] using browser: $CHROME_BIN" + +# ── Auth: ANTHROPIC_API_KEY (NOT CLAUDE_CODE_OAUTH_TOKEN) ────────────── +# Verified locally and matches GH issue #32463: interactive `claude` reads +# but does NOT honour `CLAUDE_CODE_OAUTH_TOKEN`. It DOES honour +# `ANTHROPIC_API_KEY`, but on first run it shows a "Detected a custom API +# key in your environment / Do you want to use this API key?" picker that +# we have to dismiss in `wait_for_claude_ready`. The assertions job keeps +# using OAuth (its `claude -p` path honours that env var fine). +if [ -z "${ANTHROPIC_API_KEY:-}" ]; then + echo "[demo] WARNING: ANTHROPIC_API_KEY unset — interactive claude will hit the 'Select login method' picker with no way to advance" >&2 +fi + +# ── Setup substrate — single source of truth shared with run_e2e_flows.py. 
# `_harness_setup.py` materializes the MCP config, writes claude-settings
# with all three hooks (PostToolUse / SessionEnd / UserPromptSubmit) wired
# via setup_wizard, bootstraps `.bicameral/` inside DESKTOP_REPO_PATH so
# the SessionEnd guard passes, wipes the ledger, and resets the desktop
# clone. The recording job and the assertion job both call this — no
# inline duplication, no drift between the two paths. ──────────────────
SETTINGS_FILE="$RESULTS_DIR/claude-settings-with-hook.json"
python3 "$E2E_DIR/_harness_setup.py" \
  --desktop-repo-path "$DESKTOP_REPO_PATH" \
  --results-dir "$RESULTS_DIR" \
  --mcp-config-template "$MCP_CONFIG_TEMPLATE" \
  --mcp-root "$MCP_DIR" \
  >/dev/null
rm -f "$PORT_FILE"

# ── Start Xvfb + minimal WM ─────────────────────────────────────────────
Xvfb ":${DISPLAY_NUM}" -screen 0 "${RES}x24" -nolisten tcp >/tmp/xvfb.log 2>&1 &
XVFB_PID=$!
export DISPLAY=":${DISPLAY_NUM}"
sleep 1

fluxbox >/tmp/fluxbox.log 2>&1 &
FLUXBOX_PID=$!
sleep 1

CHROMIUM_PID=""
CURRENT_PORT=""
FFMPEG_PID=""
XTERM_PIDS=()

# cleanup — EXIT trap. Stops ffmpeg with SIGINT, then tears down chromium,
# every `scene-*` tmux session, the xterms, fluxbox and Xvfb. Runs with
# errexit off so one failed kill cannot short-circuit the rest of the teardown.
cleanup() {
  set +e
  local s p
  if [ -n "$FFMPEG_PID" ]; then
    kill -INT "$FFMPEG_PID" 2>/dev/null
    wait "$FFMPEG_PID" 2>/dev/null
  fi
  if [ -n "$CHROMIUM_PID" ]; then
    kill "$CHROMIUM_PID" 2>/dev/null
    wait "$CHROMIUM_PID" 2>/dev/null
  fi
  for s in $(tmux list-sessions -F '#S' 2>/dev/null | grep '^scene-' || true); do
    tmux kill-session -t "$s" 2>/dev/null
  done
  # `${arr[@]+...}` keeps the expansion safe under `set -u` when no xterm was
  # ever launched (bash < 4.4 treats an empty array as unbound).
  for p in ${XTERM_PIDS[@]+"${XTERM_PIDS[@]}"}; do
    kill "$p" 2>/dev/null
  done
  kill "$FLUXBOX_PID" "$XVFB_PID" 2>/dev/null
  wait 2>/dev/null
}
trap cleanup EXIT

# ── Recording paths ─────────────────────────────────────────────────────
FULL_MP4="$OUT_DIR/full-int.mp4"
SCENE_BOUNDS_FILE="$RESULTS_DIR/scene-bounds-int.txt"
: > "$SCENE_BOUNDS_FILE"

# ── Helpers ──────────────────────────────────────────────────────────────

# now_offset — seconds elapsed since ffmpeg started (T0), printed with
# millisecond precision and clamped at 0.
now_offset() {
  python3 - "$T0" "$(date +%s.%N)" <<'PY'
import sys
print(f"{max(0.0, float(sys.argv[2]) - float(sys.argv[1])):.3f}")
PY
}

# wait_for_claude_ready <session>
#   Walks the first-run onboarding dialog stack on a fresh CI runner.
#   Verified locally against claude 2.1.126 with HOME=tmpdir, ANTHROPIC_API_KEY
#   set: dismissals reach the `^❯ ` input prompt at t≈7s.
#
#   Sequence (each fires at most once per session):
#     1. Theme picker ("Choose the text style ... run /theme")
#        — Enter (default option 2 = Dark mode is preselected)
#     2. API key picker ("Detected a custom API key in your environment")
#        — '1' (override the preselected "No (recommended)" with "Yes")
#     3. Security notes ("Security notes: ... Press Enter to continue…")
#        — Enter
#     4. Trust folder ("Quick safety check ... trust this folder")
#        — Enter (default option 1 = Yes is preselected)
#     5. New MCP server prompt ("New MCP server found in .mcp.json")
#        — Enter (default option 1 = Use this and all future)
#     6. Bypass-permissions warning ("Claude Code running in Bypass Permissions mode")
#        — '2' (override the preselected "No, exit" with "Yes, I accept")
#
#   Detection: search WHOLE pane (not `tail -3`) — claude renders dialogs at a
#   fixed row near the middle of a tall pane. The `^❯` anchor at column 0
#   matches only the actual input prompt, not the menu rows ` ❯ 2. ...` which
#   have a leading space.
#
#   Returns 0 once the input prompt is visible; 1 if the session dies or
#   READY_TIMEOUT (seconds) elapses first.
wait_for_claude_ready() {
  local session=$1
  local elapsed=0
  local pane idx handled
  local -A dismissed=()

  # Parallel tables, one column per dialog. Order matters: the first dialog
  # that is still pending and visible wins the current iteration.
  local -a dialog_ids=(theme api_key security trust mcp bypass)
  local -a dialog_patterns=(
    'Choose the text style|run /theme'
    'Detected a custom API key'
    'Security notes:'
    'trust this folder'
    'New MCP server found'
    'Bypass Permissions mode'
  )
  local -a dialog_keys=(Enter 1 Enter Enter Enter 2)

  while [ "$elapsed" -lt "$READY_TIMEOUT" ]; do
    if ! tmux has-session -t "$session" 2>/dev/null; then
      echo " warning: $session died before TUI was ready" >&2
      return 1
    fi
    pane="$(tmux capture-pane -t "$session" -p 2>/dev/null || true)"

    # Ready. Here-strings instead of `printf | grep -q` so an early grep exit
    # can never surface as a pipefail failure.
    if grep -q '^❯' <<<"$pane"; then
      return 0
    fi

    # Onboarding dialogs — each at most once per session.
    handled=0
    for idx in "${!dialog_ids[@]}"; do
      if [ -z "${dismissed[${dialog_ids[$idx]}]:-}" ] && \
         grep -qE "${dialog_patterns[$idx]}" <<<"$pane"; then
        tmux send-keys -t "$session" "${dialog_keys[$idx]}"
        dismissed[${dialog_ids[$idx]}]=1
        handled=1
        break
      fi
    done

    if [ "$handled" -eq 1 ]; then
      # Give the TUI time to redraw before sampling the pane again.
      sleep 2
      elapsed=$((elapsed+2))
      continue
    fi

    sleep 1
    elapsed=$((elapsed+1))
  done
  echo " warning: claude TUI never showed input prompt for $session" >&2
  return 1
}

# type_prompt <session> <body> [total_seconds]
#   Types body character-by-character so the recording shows a human-paced
#   typing animation (default ~3s total regardless of length, like the user
#   asked).
#   Embedded newlines are inserted via M-Enter (Alt+Return) — the
#   only escape that preserves newlines in claude TUI's input box without
#   submitting (verified locally). Final Enter submits.
#
#   Args: $1 tmux session, $2 prompt text, $3 total typing time in seconds
#   (default 3). The per-character delay is total/len, floored at 5 ms.
#   An empty body is a no-op (nothing typed, no Enter sent).
type_prompt() {
  local session=$1
  local body=$2
  local total_secs=${3:-3}
  local len=${#body}
  if [ "$len" -le 0 ]; then return; fi
  local delay
  delay=$(python3 -c "print(round(max(0.005, ${total_secs} / ${len}), 4))")
  local i ch
  for ((i=0; i<len; i++)); do
    ch="${body:$i:1}"
    case "$ch" in
      $'\n')
        tmux send-keys -t "$session" M-Enter
        ;;
      ';')
        # tmux treats an argument ending in an unescaped ';' as a command
        # separator, so a bare ";" would be swallowed instead of typed.
        tmux send-keys -t "$session" -l -- '\;'
        ;;
      *)
        # `--` guarantees the character is never parsed as a send-keys option.
        tmux send-keys -t "$session" -l -- "$ch"
        ;;
    esac
    sleep "$delay"
  done
  sleep 0.3
  tmux send-keys -t "$session" Enter
}

# wait_for_agent_idle <session>
#   Claude TUI keeps the `❯ ` input prompt rendered at a fixed row even while
#   streaming, so the prompt-visible test is necessary but not sufficient. The
#   real signal that the agent stopped is pane stability — when the streaming
#   output stops mutating for IDLE_STABLE_FOR consecutive samples, we're idle.
#
#   Samples once per second. Returns 0 when idle, 1 if the session dies or
#   IDLE_MAX_WAIT seconds pass without a stable pane.
wait_for_agent_idle() {
  local session=$1
  local stable_count=0
  local i=0
  local prev=""
  while [ $i -lt $IDLE_MAX_WAIT ]; do
    if ! tmux has-session -t "$session" 2>/dev/null; then
      echo " warning: $session died mid-response" >&2
      return 1
    fi
    local pane
    pane="$(tmux capture-pane -t "$session" -p 2>/dev/null || true)"
    # A sample only counts when the pane is unchanged AND shows the prompt.
    if [ "$pane" = "$prev" ] && printf '%s' "$pane" | grep -q '^❯'; then
      stable_count=$((stable_count+1))
      if [ $stable_count -ge $IDLE_STABLE_FOR ]; then
        return 0
      fi
    else
      stable_count=0
    fi
    prev=$pane
    sleep 1
    i=$((i+1))
  done
  echo " warning: agent_idle timed out after ${IDLE_MAX_WAIT}s for $session" >&2
  return 1
}

# wait_for_session_dead <session>
#   After /exit, claude runs the SessionEnd hook (capture-corrections may fire)
#   before the process actually exits. Wait for natural death; force-kill only
#   after the grace period to avoid polluting the ledger mid-hook.
# Polls once per second until the tmux session is gone. After
# SESSION_DEAD_GRACE polls it warns on stderr and kills the session itself.
wait_for_session_dead() {
  local target=$1
  local waited
  for ((waited = 1; ; waited++)); do
    tmux has-session -t "$target" 2>/dev/null || return 0
    sleep 1
    if ((waited >= SESSION_DEAD_GRACE)); then
      echo " warning: $target didn't exit after ${SESSION_DEAD_GRACE}s — force-killing" >&2
      tmux kill-session -t "$target" 2>/dev/null
      return 0
    fi
  done
}

# poll_port_file — checks PORT_FILE once per second, up to PORT_POLL_TIMEOUT
# times, for the port the dashboard sidecar bound. Prints the port (all
# whitespace stripped) and returns 0; prints nothing and returns 1 on timeout.
poll_port_file() {
  local tries port
  for ((tries = 0; tries < PORT_POLL_TIMEOUT; tries++)); do
    if [ -f "$PORT_FILE" ]; then
      port="$(tr -d '[:space:]' < "$PORT_FILE" || true)"
      if [ -n "$port" ]; then
        printf '%s' "$port"
        return 0
      fi
    fi
    sleep 1
  done
  return 1
}

# refresh_chromium_for_port <port>
#   Points the right-hand browser window at the given dashboard port. Every
#   scene starts a fresh MCP process and therefore a fresh port, so the old
#   chromium is killed and a new one launched (spec §6.5 option A). The short
#   flicker doubles as a visual scene boundary; option B (standalone dashboard
#   sidecar) is a deferred follow-up. No-op when a live chromium already
#   shows that port.
refresh_chromium_for_port() {
  local port=$1
  local browser_alive=0
  if [ -n "$CHROMIUM_PID" ] && kill -0 "$CHROMIUM_PID" 2>/dev/null; then
    browser_alive=1
  fi
  if [ "$port" = "$CURRENT_PORT" ] && [ "$browser_alive" -eq 1 ]; then
    return 0
  fi

  if [ -n "$CHROMIUM_PID" ]; then
    kill "$CHROMIUM_PID" 2>/dev/null || true
    wait "$CHROMIUM_PID" 2>/dev/null || true
  fi

  local -a chrome_args=(
    --no-sandbox --disable-gpu
    --window-size="${HALF_W},${RES_H}"
    --window-position="${HALF_W},0"
    --user-data-dir="/tmp/chromium-int-${port}"
    --no-first-run --no-default-browser-check
    --new-window "http://localhost:${port}"
  )
  "$CHROME_BIN" "${chrome_args[@]}" >>/tmp/chromium-int.log 2>&1 &
  CHROMIUM_PID=$!
  CURRENT_PORT=$port
}

# ── Start ffmpeg (continuous capture) ────────────────────────────────────
# T0 is the reference instant for every now_offset call, so it is taken
# immediately before the grabber is launched.
T0=$(date +%s.%N)
ffmpeg -y -f x11grab -video_size "$RES" -framerate "$FRAMERATE" \
  -i ":${DISPLAY_NUM}" \
  -c:v libx264 -preset ultrafast -pix_fmt yuv420p \
  "$FULL_MP4" >/tmp/ffmpeg-int.log 2>&1 &
FFMPEG_PID=$!
sleep 1

# ── Per-scene loop ──────────────────────────────────────────────────────
# One tmux+claude session per flow, mirroring run_e2e_flows.py exactly. State
# persists via the shared surrealkv ledger; what differs from headless is the
# real TUI rendering and the human-paced typed input.
SCENES=(
  "1:flow-1-ingest.md"
  "2:flow-2-preflight.md"
  "3:flow-3-commit-sync.md"
  "4:flow-4-session-end.md"
  "5:flow-5-history.md"
)

# Dashboard preamble — kept out of the flow prompt files so the assertion
# harness (which doesn't record) can reuse them as-is. Each scene's MCP
# process has its own port; this preamble triggers the dashboard tool so
# the port file is written and we can point chromium at it.
DASHBOARD_PREAMBLE='Before doing anything else, call bicameral.dashboard so a live dashboard sidecar is bound to this MCP process. Then continue with the request below.

'

# run_scene <N> <prompt-file>
#   Records one scene: starts claude in a detached tmux session, attaches an
#   xterm to it, types the prompt, points chromium at the scene's dashboard
#   port, waits for the agent to go idle, then sends /exit. Appends
#   scene_<N>_start / scene_<N>_end offsets to SCENE_BOUNDS_FILE on every
#   path, including failures. Returns 1 when tmux cannot start or the TUI
#   never becomes ready, 0 otherwise.
run_scene() {
  local N=$1
  local FILE=$2
  local SESSION="scene-${N}"
  local PROMPT_FILE="$PROMPTS_DIR/$FILE"
  local CLAUDE_LOG="$RESULTS_DIR/claude-scene-${N}.stderr"
  local CLAUDE_EXIT="$RESULTS_DIR/claude-scene-${N}.exit"
  local PANE_DUMP="$RESULTS_DIR/scene-${N}-pane.txt"
  local RUNNER="$RESULTS_DIR/claude-scene-${N}.sh"
  # Scene-private scratch values; declared local so they do not leak into the
  # top-level script between scenes.
  local PROMPT_BODY PORT
  echo "=== Scene ${N} (${FILE}) ==="

  # New MCP process per scene → port may change. Wipe stale port file so the
  # poll below only sees this scene's value.
  rm -f "$PORT_FILE" "$CLAUDE_LOG" "$CLAUDE_EXIT"

  echo "scene_${N}_start=$(now_offset)" >> "$SCENE_BOUNDS_FILE"

  # Per-scene runner: redirects claude's stderr to a log and writes its exit
  # code to a sibling file, so a startup failure (bad flag, missing API key,
  # MCP crash) leaves actionable diagnostics instead of a silent dead pane.
  # `--no-session-persistence` and `--max-budget-usd` are intentionally NOT
  # passed — both are documented as `--print`-only and cause an immediate
  # exit-1 in interactive mode (verified locally against claude 2.1.x).
  cat > "$RUNNER" <<EOF
#!/usr/bin/env bash
cd "$DESKTOP_REPO_PATH"
exec 2>"$CLAUDE_LOG"
claude \\
  --mcp-config "$MCP_CONFIG_MATERIALIZED" \\
  --strict-mcp-config \\
  --settings "$SETTINGS_FILE" \\
  --allowed-tools mcp__bicameral,Read,Grep,Edit,Bash \\
  --add-dir "$DESKTOP_REPO_PATH" \\
  --dangerously-skip-permissions
echo "exit=\$?" > "$CLAUDE_EXIT"
EOF
  chmod +x "$RUNNER"

  tmux new-session -d -s "$SESSION" -x 110 -y 40 "$RUNNER" || {
    echo " ERROR: tmux new-session failed for $SESSION" >&2
    echo "scene_${N}_end=$(now_offset)" >> "$SCENE_BOUNDS_FILE"
    return 1
  }

  xterm -geometry 100x40+0+0 -fa Monospace -fs 11 \
    -bg black -fg white -title "claude — scene ${N}: ${FILE}" \
    -e bash -lc "tmux attach -t $SESSION; sleep 2" \
    >/tmp/xterm-scene-${N}.log 2>&1 &
  XTERM_PIDS+=($!)

  if ! wait_for_claude_ready "$SESSION"; then
    {
      echo "--- last pane capture ---"
      tmux capture-pane -t "$SESSION" -p 2>/dev/null || echo "(no pane — session dead)"
      echo "--- claude stderr ---"
      cat "$CLAUDE_LOG" 2>/dev/null || echo "(no stderr log)"
      echo "--- claude exit ---"
      cat "$CLAUDE_EXIT" 2>/dev/null || echo "(no exit file — process may still be alive)"
    } > "$PANE_DUMP"
    echo " ERROR: scene ${N} did not reach ready state — diagnostics in $PANE_DUMP" >&2
    tmux kill-session -t "$SESSION" 2>/dev/null || true
    echo "scene_${N}_end=$(now_offset)" >> "$SCENE_BOUNDS_FILE"
    return 1
  fi

  PROMPT_BODY="${DASHBOARD_PREAMBLE}$(cat "$PROMPT_FILE")"
  type_prompt "$SESSION" "$PROMPT_BODY" 3

  if PORT="$(poll_port_file)"; then
    refresh_chromium_for_port "$PORT"
  else
    echo " warning: scene ${N} dashboard.port never appeared — right pane may be stale" >&2
  fi

  wait_for_agent_idle "$SESSION" || true

  # Pause so the dashboard SSE settles into its final state for this scene
  # (also masks the chromium reload flicker on the next scene behind a still
  # frame of the closing state).
  sleep 3

  # Snapshot the full scrollback while the session is still alive. Once claude
  # exits the tmux session is gone, capture-pane fails, and the redirect would
  # leave an empty dump behind.
  tmux capture-pane -t "$SESSION" -p -S - 2>/dev/null > "$PANE_DUMP" || true

  # Trigger SessionEnd hook (capture-corrections may auto-fire here), then
  # wait for the tmux session to die naturally.
  tmux send-keys -t "$SESSION" '/exit' Enter
  wait_for_session_dead "$SESSION"

  echo "scene_${N}_end=$(now_offset)" >> "$SCENE_BOUNDS_FILE"
  return 0
}

# `set +e` around each scene so a single failure doesn't abort the whole run —
# we still want the partial recording + diagnostics for the scenes that did
# work. Failed scenes still emit start/end bounds (zero-length window) so the
# downstream split logic walks them as empty cuts.
for entry in "${SCENES[@]}"; do
  N="${entry%%:*}"
  FILE="${entry#*:}"
  set +e
  run_scene "$N" "$FILE"
  rc=$?
  set -e
  if [ $rc -ne 0 ]; then
    echo " (scene ${N} failed; continuing to next)" >&2
  fi
done

# Tail pause so ffmpeg captures a clean closing frame after scene 5.
sleep 3

# ── Stop ffmpeg cleanly ──────────────────────────────────────────────────
# SIGINT (not TERM/KILL) is what the script uses to end the capture; the PID
# is cleared afterwards so the EXIT trap does not signal it a second time.
kill -INT "$FFMPEG_PID" 2>/dev/null || true
wait "$FFMPEG_PID" 2>/dev/null || true
FFMPEG_PID=""

if [ -n "$CHROMIUM_PID" ]; then
  kill "$CHROMIUM_PID" 2>/dev/null || true
  wait "$CHROMIUM_PID" 2>/dev/null || true
  CHROMIUM_PID=""
fi

if [ ! -s "$FULL_MP4" ]; then
  echo "ERROR: $FULL_MP4 missing or empty — nothing to split" >&2
  exit 1
fi

echo "=== full-int.mp4 written ($(stat -c%s "$FULL_MP4" 2>/dev/null || stat -f%z "$FULL_MP4") bytes) ==="
echo "=== Scene boundaries (offsets from T0) ==="
cat "$SCENE_BOUNDS_FILE"

# ── Read boundary timestamps ─────────────────────────────────────────────
# get_bound <key> — prints the last value recorded for <key> in
# SCENE_BOUNDS_FILE, or nothing when the key is absent. The trailing
# `|| true` matters: under `set -euo pipefail` a non-matching grep would
# otherwise abort the script at the assignment, before the missing-boundary
# fallback below gets a chance to run.
get_bound() { grep "^${1}=" "$SCENE_BOUNDS_FILE" | tail -1 | cut -d= -f2 || true; }

T_S1="$(get_bound scene_1_start)"
T_E1="$(get_bound scene_1_end)"
T_S2="$(get_bound scene_2_start)"
T_E2="$(get_bound scene_2_end)"
T_S3="$(get_bound scene_3_start)"
T_E3="$(get_bound scene_3_end)"
T_S4="$(get_bound scene_4_start)"
T_E4="$(get_bound scene_4_end)"
T_S5="$(get_bound scene_5_start)"
T_E5="$(get_bound scene_5_end)"

# Fallback path: if any boundary is missing, keep full-int.mp4 only — the
# split is meaningless without a complete set of timestamps.
for v in "$T_S1" "$T_E1" "$T_S2" "$T_E2" "$T_S3" "$T_E3" "$T_S4" "$T_E4" "$T_S5" "$T_E5"; do
  if [ -z "$v" ]; then
    echo "WARNING: scene boundary missing — emitting full-int.mp4 only" >&2
    ls -la "$OUT_DIR"
    exit 0
  fi
done

# ── Trim into per-scene mp4s (re-encoded for safe concat) ───────────────
ENC_FLAGS=(
  -c:v libx264 -preset ultrafast -pix_fmt yuv420p
  -r "$FRAMERATE"
  -an
)

# Failed scenes produce a zero-length (or near-zero) window. Skip them so we
# don't emit empty mp4s that break the downstream concat.
# cut_scene <from> <to> <dst>
#   Re-encodes the [from, to] window of FULL_MP4 into <dst>. Windows shorter
#   than 0.5s are skipped with a note on stderr. Whenever no usable clip is
#   produced (skip or ffmpeg failure) <dst> is removed, so later steps can
#   rely on "file exists" meaning "scene was cut".
cut_scene() {
  local start=$1 end=$2 out=$3
  local window
  window="$(python3 -c "print(max(0.0, float('$end') - float('$start')))")"

  if ! python3 -c "import sys; sys.exit(0 if float('$window') >= 0.5 else 1)"; then
    echo " skip: $(basename "$out") window=${window}s (scene likely failed)" >&2
    rm -f "$out"
    return
  fi

  ffmpeg -y -i "$FULL_MP4" -ss "$start" -to "$end" "${ENC_FLAGS[@]}" "$out" \
    >>/tmp/ffmpeg-int-split.log 2>&1 || rm -f "$out"
}

# One output path per scene, each cut right after its path is defined.
S1="$OUT_DIR/scene-1.mp4"
cut_scene "$T_S1" "$T_E1" "$S1"
S2="$OUT_DIR/scene-2.mp4"
cut_scene "$T_S2" "$T_E2" "$S2"
S3="$OUT_DIR/scene-3.mp4"
cut_scene "$T_S3" "$T_E3" "$S3"
S4="$OUT_DIR/scene-4.mp4"
cut_scene "$T_S4" "$T_E4" "$S4"
S5="$OUT_DIR/scene-5.mp4"
cut_scene "$T_S5" "$T_E5" "$S5"

# ── Generate transition slide (matches legacy aesthetic) ─────────────────
# A solid-colour lavfi source of TRANSITION_DURATION seconds with four
# centred drawtext lines, encoded with the same ENC_FLAGS as the scene cuts.
FONT_BOLD="/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
FONT_REG="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
TRANSITION="$RESULTS_DIR/transition-int.mp4"

ffmpeg -y \
  -f lavfi -i "color=c=#0a0e27:s=${RES_W}x${RES_H}:d=${TRANSITION_DURATION}:r=${FRAMERATE}" \
  -vf "drawtext=fontfile='${FONT_BOLD}':text='— Pre-implementation complete —':fontsize=58:fontcolor=#8aa0c8:x=(w-text_w)/2:y=(h-text_h)/2-180,
       drawtext=fontfile='${FONT_BOLD}':text='Dev now implements the change':fontsize=78:fontcolor=#ffffff:x=(w-text_w)/2:y=(h-text_h)/2-60,
       drawtext=fontfile='${FONT_REG}':text='(see dev.mp4 — preflight, commit-sync, session-end capture)':fontsize=30:fontcolor=#8aa0c8:x=(w-text_w)/2:y=(h-text_h)/2+40,
       drawtext=fontfile='${FONT_BOLD}':text='Returning to PM after the implementation has landed':fontsize=46:fontcolor=#ffd76a:x=(w-text_w)/2:y=(h-text_h)/2+160" \
  "${ENC_FLAGS[@]}" -t "$TRANSITION_DURATION" "$TRANSITION" \
  >>/tmp/ffmpeg-int-transition.log 2>&1

# pm/dev concat — only include scene mp4s that actually exist (a failed scene
# leaves no file behind; see cut_scene). Skip the concat entirely if every
# input is missing.
+write_concat_list() { + local list=$1 + shift + : > "$list" + for f in "$@"; do + if [ -s "$f" ]; then + echo "file '$f'" >> "$list" + fi + done +} + +run_concat() { + local list=$1 out=$2 + if [ ! -s "$list" ]; then + echo " warning: $(basename "$out") concat list empty — skipping" >&2 + return 0 + fi + ffmpeg -y -f concat -safe 0 -i "$list" "${ENC_FLAGS[@]}" "$out" \ + >>/tmp/ffmpeg-int-concat.log 2>&1 +} + +PM_OUT="$OUT_DIR/pm.mp4" +PM_LIST="$RESULTS_DIR/pm-int-concat.txt" +write_concat_list "$PM_LIST" "$S1" "$TRANSITION" "$S5" +run_concat "$PM_LIST" "$PM_OUT" + +DEV_OUT="$OUT_DIR/dev.mp4" +DEV_LIST="$RESULTS_DIR/dev-int-concat.txt" +write_concat_list "$DEV_LIST" "$S2" "$S3" "$S4" +run_concat "$DEV_LIST" "$DEV_OUT" + +# Clean up scratch files; keep per-scene mp4s + pm.mp4 + dev.mp4 + full-int.mp4. +rm -f "$PM_LIST" "$DEV_LIST" "$TRANSITION" + +echo "=== Interactive recording + split complete ===" +ls -la "$OUT_DIR" diff --git a/tests/e2e/run_e2e_flows.py b/tests/e2e/run_e2e_flows.py new file mode 100644 index 00000000..c7858abd --- /dev/null +++ b/tests/e2e/run_e2e_flows.py @@ -0,0 +1,1528 @@ +""" +v0 user flow e2e — Claude Code CLI session orchestrator. + +Drives a real Claude Code CLI session per flow (5 sessions total), with +bicameral-mcp registered as the only MCP server, and asserts on the +stream-json transcript that the right MCP tools were called with the +right shapes. + +Each flow: + 1. Reads ``prompts/flow-N-*.md`` (natural-language user prompt) + 2. Invokes ``claude -p <prompt> --mcp-config bicameral.mcp.json + --strict-mcp-config --output-format stream-json --add-dir <desktop_clone>`` + 3. Streams stdout to ``test-results/e2e/flow-N.ndjson`` + 4. Walks the transcript for tool_use blocks under ``mcp__bicameral__*`` + 5. Asserts per-flow invariants and prints PASS/FAIL + +The point: this exercises the full skill + MCP layer the way a user +experiences it. 
The handler-replay sim at ``scripts/sim_issue_108_flows.py`` +remains useful for fast dev iteration on handler logic. + +Required env: + CLAUDE_CODE_OAUTH_TOKEN Claude Code CLI auth (set by GitHub Actions + ``production`` environment in CI). + DESKTOP_REPO_PATH Path to a local clone of github.com/desktop/desktop. + +CI: see .github/workflows/v0-user-flow-e2e.yml. +""" + +from __future__ import annotations + +import json +import os +import pathlib +import shutil +import subprocess +import sys +from collections.abc import Callable +from dataclasses import dataclass, field + +E2E_ROOT = pathlib.Path(__file__).resolve().parent +PROMPTS_DIR = E2E_ROOT / "prompts" +MCP_CONFIG_TEMPLATE = E2E_ROOT / "bicameral.mcp.json" +RESULTS_DIR = pathlib.Path(__file__).resolve().parents[2] / "test-results" / "e2e" +RESULTS_DIR.mkdir(parents=True, exist_ok=True) + +# Wall-clock cap for a single `claude -p` flow invocation. Was 300s; CI +# repeatedly tripped that limit on Flow 2 (the longest flow — chained +# preflight → ingest(agent_session) → resolve_collision sequence after the +# #154 hook landed). Last clean dev-branch Flow 2 measured 289.7s — only +# ~3% headroom on the old cap. Bumped to 600s to give the post-hook +# sequence plenty of margin without inflating the recording job's wall +# beyond what GitHub Actions tolerates. +CLAUDE_SESSION_TIMEOUT_S = 600 + +# Persistent ledger shared across the 5 flow sessions in a single run, wiped +# at the start of each run so flow-1 seeds → flow-2 refines → flow-3 reflects +# → flow-4 captures → flow-5 ratifies, all against the same ledger state. 
+LEDGER_DIR = RESULTS_DIR / "ledger.db" + +DESKTOP_REPO_PATH = os.environ.get("DESKTOP_REPO_PATH", "").strip() +if not DESKTOP_REPO_PATH: + sys.stderr.write( + "ERROR: DESKTOP_REPO_PATH env var not set.\n" + "CI sets this automatically; locally:\n" + " git clone --depth=1 https://github.com/desktop/desktop /tmp/desktop-clone\n" + " DESKTOP_REPO_PATH=/tmp/desktop-clone python tests/e2e/run_e2e_flows.py\n" + ) + sys.exit(2) + +if not shutil.which("claude"): + sys.stderr.write( + "ERROR: 'claude' CLI not found on PATH.\n" + "Install via: npm install -g @anthropic-ai/claude-code\n" + ) + sys.exit(2) + +if not shutil.which("bicameral-mcp"): + sys.stderr.write( + "ERROR: 'bicameral-mcp' command not found on PATH.\nInstall via: pip install -e .\n" + ) + sys.exit(2) + + +# Setup helpers live in _harness_setup.py — single source of truth shared with +# tests/e2e/record_demo_interactive.sh so the recording job and the assertion +# job materialize byte-identical hook substrate. See _harness_setup.py docstring. 
+sys.path.insert(0, str(E2E_ROOT)) +# fmt: off +# isort: off +from _harness_setup import ( # noqa: E402,I001 # path tweak above + bootstrap_bicameral_dir as _bootstrap_helper, + clean_ledger as _clean_ledger_helper, + materialize_mcp_config, + materialize_settings_with_hooks, + reset_desktop_repo as _reset_desktop_helper, +) +# fmt: on +# isort: on + +_MCP_ROOT = pathlib.Path(__file__).resolve().parents[2] + + +def _clean_ledger() -> None: + _clean_ledger_helper(LEDGER_DIR) + + +def _reset_desktop_repo() -> None: + _reset_desktop_helper(DESKTOP_REPO_PATH) + + +def _bootstrap_bicameral_dir() -> None: + _bootstrap_helper(DESKTOP_REPO_PATH, _MCP_ROOT) + + +MCP_CONFIG_PATH = materialize_mcp_config( + template=MCP_CONFIG_TEMPLATE, + out_dir=RESULTS_DIR, + desktop_repo_path=DESKTOP_REPO_PATH, + ledger_dir=LEDGER_DIR, +) +SETTINGS_PATH = materialize_settings_with_hooks( + out_dir=RESULTS_DIR, + mcp_config_path=MCP_CONFIG_PATH, + mcp_root=_MCP_ROOT, +) + + +@dataclass +class FlowSpec: + """Each flow declares its layer so failures can be triaged honestly. + + - ``mcp_layer`` flows use prompts that explicitly invoke MCP tools (ingest, + link_commit, ratify, etc.). They validate that the tool surface works. + Failure here = real broken tool. + - ``agentic_layer`` flows use natural-developer-voice prompts and rely on + bicameral skills to AUTO-FIRE on intent (e.g. preflight on "refactor X", + capture-corrections at session end). Failure here is an advisory regression + signal: skills aren't reliably triggering in headless ``claude -p`` mode. + The interactive recording path (tmux-driven real TUI) is the primary + validator for this layer; this harness tracks the gap. 
+ """ + + flow_id: str + prompt_file: str + asserter: Callable[[list[dict]], tuple[bool, str]] + category: str # "mcp_layer" | "agentic_layer" + advisory: str = "" # rendered when the flow FAILs to explain what it means + skip: bool = False # if True, do not invoke claude — mark SKIP and render advisory + # Flows sharing a session_group run inside one continuous claude session + # (chained via --session-id + --resume) so that multi-turn skills like + # bicameral-capture-corrections have real transcript history to scan and + # the SessionEnd hook fires once per group at the final flow's exit. + # None = standalone session (default; also disables session persistence). + session_group: str | None = None + # If set, do NOT invoke claude — reuse the tool_calls captured by the + # named earlier flow and run this asserter against them. Lets two flows + # grade independent properties of the same claude session (e.g. Flow 2 + # = auto-fire scope, Flow 2a = full correction-capture loop) without + # paying for a duplicate API call. + reuses_flow: str | None = None + + +@dataclass +class FlowResult: + flow_id: str + prompt_file: str + verdict: str # "PASS" | "FAIL" | "ERROR" | "SKIP" + body: str + category: str = "mcp_layer" + advisory: str = "" + tool_calls: list[dict] = field(default_factory=list) + transcript_path: str = "" + + +RESULTS: list[FlowResult] = [] + + +def section(result: FlowResult) -> None: + RESULTS.append(result) + line = result.body.splitlines()[0] if result.body else "" + print(f"[{result.flow_id}] {result.verdict} — {line[:100]}") + + +# ── Post-hoc ledger validation ───────────────────────────────────────── + + +def _snapshot_ledger() -> dict: + """Snapshot ledger state for before/after comparison. Returns counts of + decisions by status and total compliance_check rows. Uses raw client to + bypass the schema-migration crash documented in iteration 1. + + Returns ``{"total_decisions": N, "by_status": {status: N}, "compliance_checks": N}``. 
+ On any error, returns ``{"error": str}`` — caller decides how to handle. + """ + import asyncio + import os + + os.environ["SURREAL_URL"] = f"surrealkv://{LEDGER_DIR}" + try: + from ledger.client import LedgerClient # noqa: E402 + + async def _q() -> dict: + client = LedgerClient(url=f"surrealkv://{LEDGER_DIR}") + await client.connect() + try: + drows = ( + await client.query( + "SELECT decision_id, description, status FROM decision LIMIT 200" + ) + ) or [] + ccrows = ( + await client.query( + "SELECT decision_id, region_id, content_hash, verdict " + "FROM compliance_check LIMIT 500" + ) + ) or [] + buckets: dict[str, int] = {} + for r in drows: + buckets[(r.get("status") or "unknown")] = ( + buckets.get(r.get("status") or "unknown", 0) + 1 + ) + return { + "total_decisions": len(drows), + "by_status": buckets, + "compliance_checks": len(ccrows), + "compliance_rows": ccrows, + "decisions": drows, + } + finally: + await client.close() + + return asyncio.run(_q()) + except Exception as exc: + return {"error": repr(exc)} + + +def _count_agent_session_decisions(snapshot: dict) -> int | None: + """Wrapper around the pure helper in ``_ledger_helpers``. The helper + lives in its own module so unit tests can import it without triggering + the harness's top-level env-var / CLI-presence guards. + """ + from _ledger_helpers import count_agent_session_decisions + + return count_agent_session_decisions(snapshot) + + +def _validate_flow4_via_ledger() -> None: + """Path-X-(b) validation per #147: open the ledger after the harness + completes and check for decisions written with source_type='agent_session'. + + The SessionEnd hook spawns a separate ``claude -p`` subprocess whose + tool calls are NOT visible in the parent stream-json; the subprocess + writes to the ledger with source_type='agent_session', so its effect + IS observable post-hoc. This function merges that signal into Flow 4's + FlowResult, in-place. 
+ + Behavior matrix: + - Asserter PASS + ledger has agent_session: append confirmation note; + verdict unchanged. + - Asserter FAIL + ledger has agent_session: UPGRADE to PASS with note + 'in-stream signal absent but SessionEnd subprocess effect observed + in ledger (path-X-b)'. + - Asserter result + ledger error: append INCONCLUSIVE note; verdict + unchanged. + - Asserter PASS + ledger has zero agent_session: verdict unchanged. + - Asserter FAIL + ledger has zero agent_session: verdict unchanged + (real failure; both observable signals absent). + """ + flow4 = next((r for r in RESULTS if r.flow_id == "Flow 4"), None) + if flow4 is None: + return + + print("\n=== Flow 4 — querying ledger state for path-X-(b) signal ===") + after = _snapshot_ledger() + count = _count_agent_session_decisions(after) + + if count is None: + flow4.body += ( + f"\n— Ledger validation —\nINCONCLUSIVE: ledger query failed: {after.get('error')}\n" + ) + return + + if count > 0: + if flow4.verdict != "PASS": + flow4.verdict = "PASS" + flow4.body += ( + f"\n— Ledger validation —\n" + f"PASS: {count} decision(s) with source_type='agent_session' " + f"present in ledger after harness completion (path-X-b: SessionEnd " + f"subprocess and/or in-session capture-corrections wrote them).\n" + ) + else: + flow4.body += ( + "\n— Ledger validation —\n" + "path-X-b absent: zero decisions with source_type='agent_session' " + "after harness completion. SessionEnd subprocess either did not " + "fire, did not detect uningested corrections, or failed silently.\n" + ) + + +def _validate_flow3_via_ledger(session_id: str, baseline: dict) -> None: + """Validate the V1 lifecycle outcome by opening the ledger directly + after the chained dev_session has fully completed. 
+ + Per bicameral-mcp #135, the post-commit hook is sync-only — ``link_commit`` + runs server-side via ``ensure_ledger_synced`` on the NEXT bicameral tool + call after HEAD moves (naturally happens during Flow 4's preflight, since + it's chained in the same session). Without a caller-LLM, ``resolve_compliance`` + can't fire from the hook, so the V1 success outcome we can validate + headless is: at least one decision flipped to ``status='pending'`` + after Flow 3's commit. + + This is Flow 3's REAL assertion — the per-flow stream-json check (did + git commit happen?) is a precondition. The ledger state IS the verdict. + This function finds the existing Flow 3 ``FlowResult`` and merges the + ledger findings into its body + verdict. No separate row is added. + """ + flow3 = next((r for r in RESULTS if r.flow_id == "Flow 3"), None) + if flow3 is None: + sys.stderr.write("Ledger validation: no Flow 3 result to merge into.\n") + return + + print("\n=== Flow 3 — querying ledger state for V1 lifecycle outcome ===") + + after = _snapshot_ledger() + if "error" in after: + flow3.verdict = "ERROR" + flow3.body += ( + f"\n— Ledger validation —\nfailed to open ledger at {LEDGER_DIR}: {after['error']}\n" + ) + return + if "error" in baseline: + flow3.verdict = "ERROR" + flow3.body += f"\n— Ledger validation —\nbaseline snapshot failed: {baseline['error']}\n" + return + + # The honest V1-lifecycle assertion: by the end of the dev_session run + # (and the runs that follow it within the same harness invocation), at + # least one decision should have transitioned from `pending` to a + # verdict state (`reflected` or `drifted`). That transition proves the + # full lifecycle — ensure_ledger_synced → link_commit → resolve_compliance + # → status verdict — completed somewhere in the run. The transition can + # be triggered by ANY bicameral tool call after HEAD moves; in practice + # it's often Flow 5's `bicameral.history` that provokes the chain. 
We + # don't try to attribute the transition to a specific flow — what + # matters is the V1 outcome materialised at all. + # + # Per #135 (post-commit hook is sync-only), the resolve_compliance step + # requires a caller-LLM. So this assertion implicitly tests the chain + # ALL THE WAY through, not just the sync. The compliance_check row + # count delta is reported alongside as an additional signal. + cc_before = baseline.get("compliance_checks", 0) + cc_after = after.get("compliance_checks", 0) + cc_delta = cc_after - cc_before + + pending_before = baseline.get("by_status", {}).get("pending", 0) + pending_after = after.get("by_status", {}).get("pending", 0) + reflected_before = baseline.get("by_status", {}).get("reflected", 0) + reflected_after = after.get("by_status", {}).get("reflected", 0) + drifted_before = baseline.get("by_status", {}).get("drifted", 0) + drifted_after = after.get("by_status", {}).get("drifted", 0) + + verdicts_written = (reflected_after - reflected_before) + (drifted_after - drifted_before) + pending_drained = pending_before - pending_after + + # Flow 3's verdict is now purely ledger-based per the user-flow design: + # the commit-happened stream-json check is informational, not a gate. + # The V1 lifecycle is what we care about; whichever flow triggers it + # is fine. + ledger_passed = verdicts_written > 0 or cc_delta > 0 + final_verdict = "PASS" if ledger_passed else "FAIL" + + if verdicts_written > 0: + ledger_detail = ( + f"✓ {verdicts_written} verdict(s) written during the run " + f"(reflected: {reflected_before}→{reflected_after}, " + f"drifted: {drifted_before}→{drifted_after}, " + f"pending: {pending_before}→{pending_after}). " + f"V1 lifecycle (ingest → bind → link_commit → resolve_compliance " + f"→ verdict) completed end-to-end." 
+ ) + elif cc_delta > 0: + ledger_detail = ( + f"⚠ compliance_check rows grew by {cc_delta} ({cc_before}→{cc_after}) " + f"but no verdicts written — sync mechanism fired but resolve_compliance " + f"never ran. The caller-LLM step in the V1 chain didn't trigger; " + f"per #135 this is expected without an in-session bicameral call " + f"that surfaces pending checks to the agent." + ) + else: + ledger_detail = ( + f"✗ no compliance_check rows written ({cc_before}→{cc_after}) and " + f"no verdicts written. Either the bound decisions never had their " + f"sync triggered (no bicameral call after HEAD moves) or Flow 1's " + f"binding didn't land properly." + ) + + status_before = baseline.get("by_status", {}) + status_after = after.get("by_status", {}) + all_statuses = sorted(set(status_before) | set(status_after)) + status_lines = "\n".join( + f" {s:<22} {status_before.get(s, 0)} → {status_after.get(s, 0)}" for s in all_statuses + ) + commit_note = ( + "agent committed in Flow 3 (precondition met)" + if flow3.verdict == "PASS" + else "agent did NOT commit in Flow 3 (precondition NOT met — informational)" + ) + flow3.body += ( + f"\n— Ledger state (before → after dev_session) —\n" + f"session_id: {session_id[:8]}…\n" + f"ledger: {LEDGER_DIR}\n" + f"total decisions: {baseline.get('total_decisions', 0)} → {after.get('total_decisions', 0)}\n" + f"compliance_checks: {cc_before} → {cc_after} (Δ={cc_delta:+d})\n" + f"verdicts written: {verdicts_written}\n" + f"by status:\n{status_lines}\n\n" + f"stream-json precondition: {commit_note}\n" + f"ledger assertion: {ledger_detail}\n" + ) + # Flow 3's final verdict is the ledger result, not the commit precondition. + # The lifecycle outcome matters; the path through it is incidental. 
+ flow3.verdict = final_verdict + + +# ── Claude Code CLI invocation ────────────────────────────────────────── + + +def run_claude_session( + flow_id: str, + prompt: str, + session_id: str | None = None, + is_first_in_group: bool = True, +) -> tuple[list[dict], pathlib.Path, int]: + """Invoke ``claude -p`` with stream-json output. Return (tool_calls, transcript_path, exit_code). + + stream-json emits one JSON object per line on stdout — system init, user + prompts, assistant turns (with tool_use blocks), tool results, and a final + result object. We capture all lines for the audit trail and extract + tool_use blocks for assertions. + + When ``session_id`` is provided: + - First flow in the group uses ``--session-id <uuid>`` to claim the UUID + and create a persistent session on disk. + - Subsequent flows use ``--resume <uuid>`` to extend the same session + (full transcript history available to skills/hooks). + - ``--no-session-persistence`` is dropped (it would block the chain). + + When ``session_id`` is None: standalone session, persistence disabled. + """ + transcript_path = RESULTS_DIR / f"{flow_id}.ndjson" + + cmd = [ + "claude", + "-p", + prompt, + "--mcp-config", + str(MCP_CONFIG_PATH), + "--strict-mcp-config", + "--settings", + str(SETTINGS_PATH), + # Bash + Edit required for Flow 3's commit. Read/Grep for inspection. + "--allowed-tools", + "mcp__bicameral,Read,Grep,Edit,Bash", + "--output-format", + "stream-json", + "--verbose", # required by stream-json for full event detail + "--max-budget-usd", + "2.0", + "--dangerously-skip-permissions", + ] + if session_id is None: + cmd.append("--no-session-persistence") + elif is_first_in_group: + cmd.extend(["--session-id", session_id]) + else: + cmd.extend(["--resume", session_id]) + + chain_tag = "" + if session_id is not None: + chain_tag = f" [session={session_id[:8]} {'first' if is_first_in_group else 'resume'}]" + # cwd MUST be DESKTOP_REPO_PATH. 
def run_scaffolding_turn(session_id: str, label: str, prompt: str) -> int:
    """Inject a scaffolding turn into a chained session to seed state.

    Used when an upstream flow's auto-fire failed and we want to unblock
    downstream flows by manually triggering the missing tool call. The
    scaffolding turn IS allowed to name tools — its purpose is session-state
    recovery, not auto-fire validation. The upstream flow's verdict still
    measures auto-fire reliability honestly.

    The raw stdout is written to ``RESULTS_DIR / scaffolding-<label>.ndjson``
    for diagnostics. Nothing is added to RESULTS and nothing is asserted.

    Returns:
        The ``claude`` process exit code.

    Raises:
        subprocess.TimeoutExpired: if the turn exceeds the 180 s timeout
            (not caught here).
    """
    log_path = RESULTS_DIR / f"scaffolding-{label}.ndjson"
    cmd = [
        "claude",
        "-p",
        prompt,
        "--mcp-config",
        str(MCP_CONFIG_PATH),
        "--strict-mcp-config",
        "--settings",
        str(SETTINGS_PATH),
        "--allowed-tools",
        "mcp__bicameral,Read,Grep,Edit,Bash",
        "--output-format",
        "stream-json",
        "--verbose",
        "--max-budget-usd",
        "1.0",
        "--dangerously-skip-permissions",
        "--resume",
        session_id,
    ]
    print(f"\n=== Scaffolding ({label}) — injecting into session={session_id[:8]} ===")
    proc = subprocess.run(
        cmd,
        cwd=DESKTOP_REPO_PATH,
        capture_output=True,
        text=True,
        timeout=180,
    )
    log_path.write_text(proc.stdout, encoding="utf-8")
    tool_calls = _extract_tool_calls(proc.stdout)
    bicameral_calls = _bicameral_tool_calls(tool_calls)
    bcall_names = _short_names(bicameral_calls)
    print(
        f" scaffolding tool calls: {len(tool_calls)} total, "
        f"{len(bicameral_calls)} bicameral → {bcall_names}"
    )
    if proc.returncode != 0:
        sys.stderr.write(
            f"[scaffolding {label}] claude CLI exit={proc.returncode}\n"
            f" stderr (last 500 chars): {proc.stderr[-500:]}\n"
        )
    return proc.returncode


def _extract_tool_calls(stream_json: str) -> list[dict]:
    """Collect every ``tool_use`` block from assistant turns, in stream order.

    stream-json shape: one JSON object per line. Assistant messages contain
    ``message.content`` arrays with ``{"type":"tool_use","name":"...","input":{...}}``.
    ALL tools are returned (Bash, Edit, AskUserQuestion, ...), not only
    ``mcp__bicameral__*`` — the asserters need the non-bicameral calls for
    ordering checks. Tool *results* are not captured.

    Returns:
        A list of ``{"name": str, "input": dict, "id": str}`` dicts. Lines
        that are blank, not valid JSON, or not assistant event objects are
        skipped silently.
    """
    calls: list[dict] = []
    for raw in stream_json.splitlines():
        line = raw.strip()
        if not line:
            continue
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue

        # A line may be valid JSON without being an event object
        # (``42``, ``[]``, ``null``) — calling .get on those would raise.
        if not isinstance(event, dict) or event.get("type") != "assistant":
            continue
        message = event.get("message")
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, list):
            continue
        for block in content:
            if isinstance(block, dict) and block.get("type") == "tool_use":
                tool_input = block.get("input")
                calls.append(
                    {
                        # Coerce nulls so downstream str/dict methods are safe.
                        "name": block.get("name") or "",
                        "input": tool_input if isinstance(tool_input, dict) else {},
                        "id": block.get("id") or "",
                    }
                )
    return calls


def _bicameral_tool_calls(calls: list[dict]) -> list[dict]:
    """Return only the calls addressed to the ``mcp__bicameral__`` server."""
    return [c for c in calls if c["name"].startswith("mcp__bicameral__")]


def _short_names(calls: list[dict]) -> list[str]:
    """Return each call's tool name with the ``mcp__<server>__`` prefix dropped."""
    return [c["name"].split("__")[-1] for c in calls]


def _calls_named(calls: list[dict], suffix: str) -> list[dict]:
    """Return calls whose tool name ends with the given suffix (server-name-agnostic)."""
    # The former extra ``endswith(f"_{suffix}")`` clause was implied by this one.
    return [c for c in calls if c["name"].endswith(suffix)]


def _command_of(call: dict) -> str:
    """Return a Bash call's ``command`` as a string ('' when absent or null)."""
    command = (call.get("input") or {}).get("command")
    return command if isinstance(command, str) else ""


def _is_git_commit(call: dict) -> bool:
    """True when the call is a Bash invocation whose command contains ``git commit``."""
    return call.get("name") == "Bash" and "git commit" in _command_of(call)


# ── Per-flow assertions ─────────────────────────────────────────────────


def _ingest_payload(call: dict) -> dict:
    """Extract the inner payload from an ingest (or bind) tool call.

    The MCP tool schema wraps the IngestPayload in a ``payload`` key; when
    that key is missing, empty, or not a dict, the raw input is used.
    """
    inp = call.get("input")
    if not isinstance(inp, dict):
        return {}
    payload = inp.get("payload")
    return payload if isinstance(payload, dict) and payload else inp


def _ingest_items(call: dict) -> list[dict]:
    """Return the ingested items, listed under ``decisions`` (the natural-LLM
    spelling) or ``mappings`` (the internal field); ``[]`` when neither is a
    non-empty list.
    """
    payload = _ingest_payload(call)
    items = payload.get("decisions") or payload.get("mappings") or []
    return items if isinstance(items, list) else []


# Feature-area binding sets for Flow 1. Each seeded decision can legitimately
# anchor to any of several files in the desktop/desktop tree — the asserter
# checks that *some* file in each area is bound, not which specific one.
# Previously the asserter required the exact paths "cherry-pick.ts" and
# "reorder.ts"; LLM nondeterminism on borderline cases (e.g. binding the
# UI-layer commit-list.tsx instead of the git-layer reorder.ts) flaked the
# test even though the functional outcome — drift detection has a code
# anchor for each feature — was satisfied.
#
# The "Improved commit history" decision bundles four ops (drag-to-reorder,
# drag-to-squash, amend, branch-from), so any of the files backing those is
# a legitimate anchor. cherry-pick has both lib and UI surfaces and either
# is acceptable.
_CHERRY_PICK_AREA_PATHS: tuple[str, ...] = (
    "cherry-pick.ts",
    "cherry-pick.tsx",
)
_COMMIT_HISTORY_AREA_PATHS: tuple[str, ...] = (
    # git-layer (canonical anchors for drift on the actual operations)
    "/git/reorder.ts",
    "/git/squash.ts",
    "/git/commit.ts",
    # ui-layer (legitimate when the decision is framed as a UX feature)
    "/history/commit-list.tsx",
    "/history/commit-list-item.tsx",
    "/multi-commit-operation/reorder.tsx",
    "/multi-commit-operation/squash.tsx",
    "/dispatcher/dispatcher.ts",
    # models / store layer (when bound as data-shape contracts)
    "/models/multi-commit-operation.ts",
    "/models/retry-actions.ts",
    "/stores/app-store.ts",
)


def _bound_to_area(bind_targets: list[str], area_paths: tuple[str, ...]) -> bool:
    """Return True iff any bound path matches any acceptable substring for the area.

    Backslashes are normalised to ``/`` before matching so Windows-style
    paths still hit the ``/dir/file`` patterns; non-string entries are ignored.
    """
    for target in bind_targets:
        if not isinstance(target, str):
            continue
        normalised = target.replace("\\", "/")
        if any(sub in normalised for sub in area_paths):
            return True
    return False


def _entry_path(entry) -> str:
    """Return ``file_path`` (or ``path``) from a code-region / span dict, else ''."""
    if not isinstance(entry, dict):
        return ""
    return entry.get("file_path") or entry.get("path") or ""


def assert_flow_1(calls: list[dict]) -> tuple[bool, str]:
    """Flow 1: PM ingests the seed roadmap decisions, anchors at least one
    file in each of the cherry-pick and commit-history feature areas, and
    ratifies. Subsequent flows depend on a CLEAN, RATIFIED, BOUND ledger as
    their baseline.

    Anchoring path: bindings may be embedded inline via
    ``mappings[].code_regions[].file_path`` on the ingest call, or supplied
    by a follow-up ``bicameral.bind`` call (``spans`` / ``bindings``). This
    asserter accepts EITHER path and reports which one supplied the paths.

    The check is feature-area-scoped, not file-scoped: any of the files
    listed in ``_CHERRY_PICK_AREA_PATHS`` / ``_COMMIT_HISTORY_AREA_PATHS``
    counts as a legitimate anchor for the corresponding decision.

    Returns:
        ``(passed, detail)``. Ratify ordering relative to ingest is not
        checked — only that at least one ratify call exists.
    """
    bcalls = _bicameral_tool_calls(calls)
    names = _short_names(bcalls)

    ingest_calls = _calls_named(bcalls, "bicameral_ingest")
    if not ingest_calls:
        return False, (f"expected bicameral.ingest; saw {len(bcalls)} bicameral calls: {names}")

    # Inline bindings: ingest.mappings[].code_regions[].file_path
    bind_targets: list[str] = []
    total_items = 0
    for call in ingest_calls:
        items = _ingest_items(call)
        total_items += len(items)
        for item in items:
            regions = item.get("code_regions") if isinstance(item, dict) else None
            for region in regions or []:
                path = _entry_path(region)
                if path:
                    bind_targets.append(path)
    inline_count = len(bind_targets)

    if total_items < 1:
        payload = _ingest_payload(ingest_calls[0])
        return False, (
            f"ingest called without decisions/mappings (payload keys: {list(payload.keys())})"
        )

    # Follow-up bindings: explicit bicameral.bind calls (still valid for the
    # ungrounded-then-bind path).
    bind_calls = _calls_named(bcalls, "bicameral_bind")
    for call in bind_calls:
        bpayload = _ingest_payload(call)
        for span in bpayload.get("spans") or bpayload.get("bindings") or []:
            path = _entry_path(span)
            if path:
                bind_targets.append(path)

    has_cp_area = _bound_to_area(bind_targets, _CHERRY_PICK_AREA_PATHS)
    has_commit_history_area = _bound_to_area(bind_targets, _COMMIT_HISTORY_AREA_PATHS)
    if not (has_cp_area and has_commit_history_area):
        missing = [
            label
            for label, present in (
                ("cherry-pick area", has_cp_area),
                ("commit-history area", has_commit_history_area),
            )
            if not present
        ]
        return False, (
            f"bind missing feature area(s): {missing}; checked "
            f"ingest.mappings[].code_regions and bicameral.bind calls; saw bound "
            f"paths: {bind_targets}; expected at least one path per missing area "
            f"matching cherry-pick: {list(_CHERRY_PICK_AREA_PATHS)} or "
            f"commit-history: {list(_COMMIT_HISTORY_AREA_PATHS)}; sequence: {names}"
        )

    # Ratify: PM blesses the just-ingested decisions. Flow 5 walks the
    # `proposed` queue — flow 1's seeds must NOT remain in `proposed` or
    # they'd contaminate flow 5's "what's queued for adoption" view.
    ratify_calls = _calls_named(bcalls, "bicameral_ratify")
    if not ratify_calls:
        return False, (
            f"expected bicameral.ratify after ingest (PM blesses adoption); saw: {names}"
        )

    if not bind_calls:
        binding_path = "inline code_regions"
    elif inline_count:
        binding_path = "inline + follow-up bind"
    else:
        # Every path came from bicameral.bind — don't claim inline bindings.
        binding_path = "follow-up bind"
    return True, (
        f"ingest({total_items} items, {binding_path}) → cherry-pick + commit-history "
        f"feature areas bound (paths: {bind_targets}); "
        f"ratify({len(ratify_calls)}); sequence: {names}"
    )


def _preflight_paths(call: dict) -> list[str]:
    """Return a preflight call's ``file_paths`` as a list of strings.

    Tolerates a bare string (treated as one path) and drops non-string items.
    """
    raw = (call.get("input") or {}).get("file_paths") or []
    if isinstance(raw, str):
        raw = [raw]
    if not isinstance(raw, list):
        return []
    return [p for p in raw if isinstance(p, str)]


def assert_flow_2(calls: list[dict]) -> tuple[bool, str]:
    """Flow 2: dev requests a refactor that contradicts the seeded REORDER
    decision. This asserter validates ONLY the auto-fire scope of #146 — did
    ``bicameral.preflight`` fire on the affected file before the agent
    side-effected the codebase?

    Read is deliberately allowed before/in-parallel-with preflight; the
    contract that matters is "preflight gates writes." A write op here is
    ``Edit`` / ``Write`` / ``NotebookEdit`` or a Bash command containing
    ``git commit``.

    The first preflight call whose ``file_paths`` mention ``reorder.ts`` is
    the one that must precede the first write. (Inspecting only the very
    first preflight call would fail a run that preflighted another file
    first and then correctly preflighted ``reorder.ts`` before writing.)

    The correction-capture loop is asserted separately by Flow 2a, which
    reuses this flow's transcript.
    """
    bcalls = _bicameral_tool_calls(calls)
    names = _short_names(bcalls)

    # 1. preflight fired (hook-driven auto-trigger on "refactor" verb)
    preflight_calls = _calls_named(bcalls, "bicameral_preflight")
    if not preflight_calls:
        return False, f"expected preflight (auto-fired); saw: {names}"

    target_preflight = next(
        (c for c in preflight_calls if any("reorder.ts" in p for p in _preflight_paths(c))),
        None,
    )
    if target_preflight is None:
        file_paths = [p for c in preflight_calls for p in _preflight_paths(c)]
        return False, (
            f"preflight called without reorder.ts in file_paths (the file the dev "
            f"asked to refactor); got: {file_paths}"
        )

    # 2. that preflight precedes the first WRITE op. Reads are allowed in
    # parallel — they don't side-effect.
    first_preflight_idx = next(i for i, c in enumerate(calls) if c is target_preflight)
    write_tools = ("Edit", "Write", "NotebookEdit")
    first_write_idx = next(
        (
            i
            for i, c in enumerate(calls)
            if c.get("name") in write_tools or _is_git_commit(c)
        ),
        None,
    )
    if first_write_idx is not None and first_preflight_idx > first_write_idx:
        return False, (
            f"preflight did not precede first write op (auto-fire contract violated); "
            f"first preflight at idx {first_preflight_idx}, first write at idx {first_write_idx}"
        )

    return True, (f"preflight auto-fired on reorder.ts; preceded first write op; sequence: {names}")


def _question_texts(inp: dict) -> list[str]:
    """Collect question / option label / option description strings from an
    AskUserQuestion input, tolerating both a top-level ``question`` and a
    nested ``questions[]`` shape. Malformed entries are skipped.
    """
    texts: list[str] = []

    def _add_options(options) -> None:
        for opt in options if isinstance(options, list) else []:
            if not isinstance(opt, dict):
                continue
            for key in ("label", "description"):
                if isinstance(opt.get(key), str):
                    texts.append(opt[key])

    if isinstance(inp.get("question"), str):
        texts.append(inp["question"])
    nested_questions = inp.get("questions")
    for nested in nested_questions if isinstance(nested_questions, list) else []:
        if not isinstance(nested, dict):
            continue
        if isinstance(nested.get("question"), str):
            texts.append(nested["question"])
        _add_options(nested.get("options"))
    _add_options(inp.get("options"))
    return texts


def assert_flow_2a(calls: list[dict]) -> tuple[bool, str]:
    """Flow 2a: contradiction-capture disambiguation. Reuses Flow 2's tool
    calls (same claude session). The contract under #175 (D path): when
    preflight surfaces ≥1 decision, the agent must not silently judge
    contradiction — it must call ``AskUserQuestion`` with a disambiguation
    shape (Step 5.6.1) so the user picks ``supersede`` / ``keep_both`` /
    ``unrelated``. The ingest+resolve_collision sequence (Step 5.6.2) only
    fires AFTER the user answers, so it can't be driven in headless
    ``claude -p``; the testable signal in CI is the question itself.

    What this asserter checks:

      - ``bicameral_preflight`` was called (precondition). Whether preflight
        actually SURFACED a decision is not checked: ``calls`` only carries
        tool_use inputs, never tool_result payloads, so the server response
        is not observable here.
      - At least one ``AskUserQuestion`` call appears AFTER the first
        preflight call, and its question/option text contains one of the
        keywords in ``SHAPE_KEYWORDS`` (case-insensitive substring match).

    Not required (versus pre-#175): ``bicameral.ingest(source=agent_session)``
    and ``bicameral.resolve_collision`` — both depend on the user's answer.
    See #175 for the design discussion.
    """
    bcalls = _bicameral_tool_calls(calls)
    names = _short_names(bcalls)

    preflight_calls = _calls_named(bcalls, "bicameral_preflight")
    if not preflight_calls:
        return False, (
            f"precondition NOT met — Flow 2 did not call bicameral_preflight; sequence: {names}"
        )

    # AskUserQuestion calls that fall AFTER the first preflight call.
    first_preflight_idx = next(i for i, c in enumerate(calls) if c is preflight_calls[0])
    ask_user_calls = [
        c
        for i, c in enumerate(calls)
        if i > first_preflight_idx and c.get("name") == "AskUserQuestion"
    ]
    if not ask_user_calls:
        return False, (
            f"expected AskUserQuestion (Step 5.6.1 disambiguation) after preflight surfaced "
            f"decisions; saw none. sequence: {names}"
        )

    # Loose shape check; the asserter doesn't try to grade whether the agent
    # picked the right surfaced decision (that's product judgment, not a
    # contract test).
    SHAPE_KEYWORDS = (
        "supersede",
        "keep both",
        "keep_both",
        "unrelated",
        "refinement of",
        "refinement of that",
        "decision:",
    )
    matched = None
    for call in ask_user_calls:
        inp = call.get("input")
        haystack = " | ".join(_question_texts(inp if isinstance(inp, dict) else {})).lower()
        if any(keyword in haystack for keyword in SHAPE_KEYWORDS):
            matched = call
            break

    if matched is None:
        return False, (
            f"AskUserQuestion was called {len(ask_user_calls)} time(s) after preflight, but "
            f"none matched the Step 5.6.1 disambiguation shape (expected one of: "
            f"{SHAPE_KEYWORDS}); sequence: {names}"
        )

    return True, (
        f"AskUserQuestion fired after preflight with disambiguation shape "
        f"(Step 5.6.1 signal); sequence: {names}"
    )


def assert_flow_3(calls: list[dict]) -> tuple[bool, str]:
    """Flow 3 (chained dev session): dev implements the high-signal
    notification feature and commits. The prompt is intentionally minimal:
    implement + commit, no bicameral verbs, no status checks.

    Per bicameral-mcp #135, the post-commit hook is sync-only by design —
    ``link_commit`` runs server-side on the next bicameral tool call after
    HEAD moves, and ``resolve_compliance`` requires a caller-LLM in-session.

    Per-flow assertion: did the agent issue a Bash call containing
    ``git commit``? That's the only thing this flow controls. The
    ledger-level lifecycle outcome is validated post-hoc by
    ``_validate_flow3_via_ledger`` after the whole ``dev_session`` group
    completes.
    """
    bash_calls = [c for c in calls if c.get("name") == "Bash"]
    commit_calls = [c for c in bash_calls if _is_git_commit(c)]
    if not commit_calls:
        bash_cmds = [_command_of(c)[:60] for c in bash_calls]
        return False, (
            f"expected a `git commit` Bash call (the prompt asks for a commit); "
            f"saw {len(bash_calls)} Bash call(s): {bash_cmds}"
        )
    return True, (
        f"git commit executed ({len(commit_calls)} call(s)). Status flip to "
        "`pending` validated post-hoc via ledger query at end of dev_session."
    )


def assert_flow_4(calls: list[dict]) -> tuple[bool, str]:
    """Flow 4 (chained dev session): mid-flow correction. The user surfaces
    a load-bearing constraint as an aside — using correction markers
    (``wait``, ``shouldn't``, ``wrong``) and NO explicit tracking verbs —
    then asks for code work, which should trigger ``bicameral-preflight``
    and, through it, in-session capture-corrections.

    What this asserter checks (outcome, not path):
      1. ``bicameral_preflight`` fired.
      2. EITHER (path A) an ingest call carries ``source == "agent_session"``
         at payload level or ``span.source_type == "agent_session"`` on any
         item, OR (path B) ``bicameral_search`` was called — taken as the
         signal that capture-corrections ran its Step C dedup and classified
         the constraint as ``ask``.

    The SessionEnd hook's separate subprocess is NOT visible in this
    stream-json; that path is validated by querying the ledger after the
    harness completes — not here.
    """
    bcalls = _bicameral_tool_calls(calls)
    names = _short_names(bcalls)

    if not _calls_named(bcalls, "bicameral_preflight"):
        return False, (
            f"expected bicameral.preflight to fire on the dev's 'continue refactor' "
            f"request (the in-session capture-corrections invocation hangs off "
            f"preflight step 3.5); saw: {names}"
        )

    # Outcome path A — capture-corrections auto-ingested as mechanical.
    agent_session_ingest = None
    for call in _calls_named(bcalls, "bicameral_ingest"):
        top_source = _ingest_payload(call).get("source", "")
        span_sources = [
            (item.get("span") or {}).get("source_type", "")
            for item in _ingest_items(call)
            # Same null/shape tolerance as assert_flow_1's item walk.
            if isinstance(item, dict) and isinstance(item.get("span") or {}, dict)
        ]
        if top_source == "agent_session" or "agent_session" in span_sources:
            agent_session_ingest = call
            break

    # Outcome path B — capture-corrections ran Step C dedup (search).
    search_calls = _calls_named(bcalls, "bicameral_search")

    if agent_session_ingest is None and not search_calls:
        return False, (
            f"preflight fired but neither path-A (agent_session ingest) nor path-B "
            f"(bicameral.search from capture-corrections Step C) was observed — "
            f"capture-corrections did not appear to process the in-session "
            f"corrections. sequence: {names}"
        )

    if agent_session_ingest is not None:
        return True, (
            f"preflight + agent_session ingest fired (path A — mechanical "
            f"auto-ingest); sequence: {names}"
        )
    return True, (
        f"preflight + bicameral.search fired (path B — capture-corrections Step C "
        f"dedup ran; constraint classified as `ask`, awaits user confirmation); "
        f"sequence: {names}"
    )


def assert_flow_5(calls: list[dict]) -> tuple[bool, str]:
    """Flow 5: PM Friday review. Expect ``bicameral.history`` (the review
    query); a ``bicameral.ratify`` call is reported when present but is NOT
    required.

    The ratify call is conditional: if upstream flows produced no new
    proposals there is nothing to ratify, and forcing ratify here would
    report a cascade failure from Flow 2 as if it were a Flow 5 bug.
    Per #108 Flow 5 spec: history + (ratify if proposals exist).
    """
    bcalls = _bicameral_tool_calls(calls)
    names = _short_names(bcalls)

    if not _calls_named(bcalls, "bicameral_history"):
        return False, f"expected bicameral.history; saw: {names}"

    ratify_calls = _calls_named(bcalls, "bicameral_ratify")
    if ratify_calls:
        return True, (
            f"bicameral.history + ratify({len(ratify_calls)}) — PM ratified "
            f"queued proposal(s); sequence: {names}"
        )
    return True, (
        f"bicameral.history fired; no ratify (no proposals in queue — "
        f"Flow 1 ratified its 3 seeds and upstream chain may not have "
        f"produced new proposals); sequence: {names}"
    )
+ # Avoids running flow-2-preflight.md twice and keeps both verdicts + # honest (the same session is judged on two independent properties). + reuses_flow="Flow 2", + advisory=( + "Skill-layer gap: bicameral-preflight surfaces decisions but does " + "not instruct the agent to (a) ingest a refinement with " + "source=agent_session when the user's prompt contradicts a " + "surfaced decision, or (b) call resolve_collision to wire the " + "refinement to the seeded decision. Tracked as P0 — see " + "BicameralAI/bicameral-mcp#154. Independent of #146 auto-fire." + ), + ), + FlowSpec( + flow_id="Flow 3", + prompt_file="flow-3-commit-sync.md", + asserter=assert_flow_3, + category="agentic_layer", + session_group="dev_session", + # link_commit auto-fire is no longer asserted here — that path is + # validated via the interactive recording (tmux real-TUI). This + # flow's role in the chain is to put a real edit + commit into the + # session transcript so Flow 4 has authentic dev-workflow context. + ), + FlowSpec( + flow_id="Flow 4", + prompt_file="flow-4-session-end.md", + asserter=assert_flow_4, + category="agentic_layer", + session_group="dev_session", + advisory=( + "Flow 4 captures an emerging constraint via correction markers " + '("wait", "shouldn\'t") — no collision-detection involved. NOT ' + "the same gap as #154 (which is Flow 2a / contradiction-with-" + "prior-decision specific). The substrate fixes in this PR " + "(.bicameral/ bootstrap + --mcp-config passthrough) close real " + "drift, but path-X-(b) still won't fire end-to-end because the " + "canonical SessionEnd hook command can't pass the parent " + "transcript to the spawned subprocess AND --auto-ingest is the " + "wrong shape for background capture. Both tracked as P1 — see " + "BicameralAI/bicameral-mcp#156 for the design pivot to " + "next-session surfacing via a transcript queue." 
+ ), + ), + FlowSpec( + flow_id="Flow 5", + prompt_file="flow-5-history.md", + asserter=assert_flow_5, + category="mcp_layer", + ), +] + + +# ── Main ──────────────────────────────────────────────────────────────── + + +def main() -> int: + print("=== v0 user flow e2e — Claude Code CLI sessions ===") + print(f"DESKTOP_REPO_PATH: {DESKTOP_REPO_PATH}") + print(f"MCP config: {MCP_CONFIG_PATH}") + print(f"Ledger (persisted): {LEDGER_DIR}") + print(f"Transcripts: {RESULTS_DIR}") + print(f"Flows: {len(FLOW_PLAN)}\n") + + _clean_ledger() + _reset_desktop_repo() + _bootstrap_bicameral_dir() + + # One UUID per session_group, allocated lazily as we encounter the group. + # ``group_seen`` tracks which groups have already had their first flow run + # so subsequent flows know to use --resume rather than --session-id. + import uuid + + group_session_ids: dict[str, str] = {} + group_seen: set[str] = set() + chained_groups = sorted({s.session_group for s in FLOW_PLAN if s.session_group}) + if chained_groups: + print("Chained session groups:") + for g in chained_groups: + sid = str(uuid.uuid4()) + group_session_ids[g] = sid + members = [ + s.flow_id + for s in FLOW_PLAN + if s.session_group == g and not s.skip and not s.reuses_flow + ] + print(f" {g}: {sid[:8]}… → {' → '.join(members)}") + print() + + # Snapshot ledger state *between* Flow 1 and dev_session so the + # post-hoc validation can compute a real delta. Captured lazily — + # taken just before the first dev_session flow runs. + dev_session_baseline: dict | None = None + + for spec in FLOW_PLAN: + # Snapshot baseline once, immediately before the first dev_session + # flow. This means Flow 1's effects are baked in but Flow 2/3/4's + # effects (the ones we want to measure) are not. 
+ if dev_session_baseline is None and spec.session_group == "dev_session" and not spec.skip: + print("\n=== Snapshotting ledger baseline before dev_session ===") + dev_session_baseline = _snapshot_ledger() + if "error" in dev_session_baseline: + sys.stderr.write(f"baseline snapshot failed: {dev_session_baseline['error']}\n") + else: + print( + f" baseline: {dev_session_baseline.get('total_decisions', 0)} decisions, " + f"{dev_session_baseline.get('compliance_checks', 0)} compliance_check rows, " + f"by_status={dev_session_baseline.get('by_status', {})}" + ) + + if spec.skip: + print(f"\n=== {spec.flow_id} — SKIPPED (see advisory) ===") + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="SKIP", + body=( + f"prompt: {spec.prompt_file}\n" + f"category: {spec.category}\n" + f"claude exit: n/a (not invoked)\n" + f"transcript: n/a\n" + f"total tool calls: 0\n" + f"bicameral tool calls: 0\n\n" + f"assertion: skipped — see advisory\n" + ), + category=spec.category, + advisory=spec.advisory, + ) + ) + continue + + if spec.reuses_flow: + # Re-grade an earlier flow's transcript with this asserter. No + # claude invocation; the source flow already paid for the API + # call and emitted the transcript we read here. 
+ source = next((r for r in RESULTS if r.flow_id == spec.reuses_flow), None) + if source is None: + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="ERROR", + body=( + f"reuses_flow={spec.reuses_flow!r} not found in RESULTS — " + f"declare the source flow earlier in FLOW_PLAN" + ), + category=spec.category, + advisory=spec.advisory, + ) + ) + continue + print( + f"\n=== {spec.flow_id} — re-grading {source.flow_id}'s transcript " + f"({len(source.tool_calls)} tool calls) ===" + ) + passed, detail = spec.asserter(source.tool_calls) + bicameral_calls = _bicameral_tool_calls(source.tool_calls) + body = ( + f"prompt: {spec.prompt_file} (reused from {source.flow_id})\n" + f"category: {spec.category}\n" + f"claude exit: n/a (transcript reused)\n" + f"transcript: {source.transcript_path}\n" + f"total tool calls: {len(source.tool_calls)}\n" + f"bicameral tool calls: {len(bicameral_calls)}\n" + f" → {[c['name'].split('__')[-1] for c in bicameral_calls]}\n\n" + f"assertion: {detail}\n" + ) + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="PASS" if passed else "FAIL", + body=body, + category=spec.category, + advisory=spec.advisory, + tool_calls=source.tool_calls, + transcript_path=source.transcript_path, + ) + ) + continue + + prompt_path = PROMPTS_DIR / spec.prompt_file + prompt = prompt_path.read_text(encoding="utf-8") + session_id = group_session_ids.get(spec.session_group) if spec.session_group else None + is_first = spec.session_group is not None and spec.session_group not in group_seen + if spec.session_group is not None: + group_seen.add(spec.session_group) + try: + tool_calls, transcript_path, exit_code = run_claude_session( + spec.flow_id, prompt, session_id=session_id, is_first_in_group=is_first + ) + except subprocess.TimeoutExpired: + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="ERROR", + body=f"claude CLI session timed out 
(>{CLAUDE_SESSION_TIMEOUT_S}s)", + category=spec.category, + advisory=spec.advisory, + ) + ) + continue + except Exception as exc: + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="ERROR", + body=f"claude CLI invocation failed: {exc!r}", + category=spec.category, + advisory=spec.advisory, + ) + ) + continue + + passed, detail = spec.asserter(tool_calls) + bicameral_calls = _bicameral_tool_calls(tool_calls) + + body = ( + f"prompt: {spec.prompt_file}\n" + f"category: {spec.category}\n" + f"claude exit: {exit_code}\n" + f"transcript: {transcript_path.relative_to(RESULTS_DIR.parents[1])}\n" + f"total tool calls: {len(tool_calls)}\n" + f"bicameral tool calls: {len(bicameral_calls)}\n" + f" → {[c['name'].split('__')[-1] for c in bicameral_calls]}\n\n" + f"assertion: {detail}\n" + ) + section( + FlowResult( + flow_id=spec.flow_id, + prompt_file=spec.prompt_file, + verdict="PASS" if passed else "FAIL", + body=body, + category=spec.category, + advisory=spec.advisory, + tool_calls=tool_calls, + transcript_path=str(transcript_path), + ) + ) + + # Cascade-failure decoupling: if Flow 2's preflight auto-fire failed + # in the chained dev_session, inject a manual preflight call so Flow + # 3 / Flow 4 don't inherit a broken state. Flow 2's verdict above + # still measures auto-fire reliability honestly — this scaffolding + # is only state recovery for downstream flows. The scaffolding turn + # is allowed to name the tool because it isn't a tested flow. + if spec.flow_id == "Flow 2" and spec.session_group == "dev_session" and not passed: + run_scaffolding_turn( + session_id=group_session_ids["dev_session"], + label="post-flow2-preflight", + prompt=( + "Quick — please call bicameral.preflight on " + "app/src/lib/git/reorder.ts before we keep going on the " + "refactor. I want to see what existing decisions might apply." + ), + ) + + # Post-hoc ledger validation merges into Flow 3's verdict. 
Runs AFTER + # all flows complete so that ensure_ledger_synced (server-side, fires on + # the next bicameral tool call after HEAD moves) has had a chance to + # apply link_commit and write pending compliance checks. This is Flow 3's + # REAL assertion — the stream-json check (did git commit happen) is just + # a precondition. + if "dev_session" in group_session_ids: + if dev_session_baseline is None: + dev_session_baseline = {"error": "baseline never captured"} + _validate_flow3_via_ledger(group_session_ids["dev_session"], dev_session_baseline) + # Phase 1 of plan-147-flow4-ledger-validation.md: path-X-(b) + # post-hoc ledger query for the SessionEnd subprocess effect. + _validate_flow4_via_ledger() + + _print_report() + + # CI gate: a flow blocks merge ONLY if it FAILs without an `advisory` text. + # Advisory failures document known gaps (with linked issue numbers) — they + # surface loudly in the report but do not red-light CI. This lets the + # harness keep running these assertions every PR (so we notice when a + # gap silently CLOSES) without making every PR also pay for the open gap. + blocking_failures = [r for r in RESULTS if r.verdict in ("FAIL", "ERROR") and not r.advisory] + return 0 if not blocking_failures else 1 + + +def _print_report() -> None: + """Print the per-flow detail, then a sharable summary table that surfaces + the MCP-layer vs agentic-layer split and any advisory text on failures. + The summary is designed to be paste-able into a PR comment or shared + alongside the demo recording so reviewers can see at a glance which + flows validate the tool surface vs which flows still need the agentic + layer to come through. 
+ """ + print("\n\n=== PER-FLOW DETAIL ===\n") + for r in RESULTS: + marker = _verdict_marker(r) + print(f"\n## {r.flow_id} — {marker} {r.verdict} ({r.category})\n") + print(r.body) + + # Header banner + print("\n" + "═" * 78) + print(" e2e SUMMARY — sharable") + print("═" * 78 + "\n") + + # Table + fmt = f"{'Flow':<14} {'Layer':<14} {'Verdict':<10} {'What it validates'}" + print(fmt) + print("-" * 14 + " " + "-" * 14 + " " + "-" * 10 + " " + "-" * 40) + for r in RESULTS: + marker = _verdict_marker(r) + layer_label = { + "mcp_layer": "MCP layer", + "agentic_layer": "Agentic", + "ledger_state": "Ledger", + }.get(r.category, r.category) + what = _flow_one_line(r.flow_id) + print(f"{r.flow_id:<14} {layer_label:<14} {marker} {r.verdict:<8} {what}") + + blocking_failures = [r for r in RESULTS if r.verdict in ("FAIL", "ERROR") and not r.advisory] + advisory_failures = [r for r in RESULTS if r.verdict == "FAIL" and r.advisory] + overall_pass = not blocking_failures + overall_marker = "✅" if overall_pass else "❌" + overall_label = "PASS" if overall_pass else "FAIL" + if overall_pass and advisory_failures: + overall_label = f"PASS ({len(advisory_failures)} advisory failure(s) — see below)" + print(f"\n{overall_marker} Overall: {overall_label}") + + # MCP-layer vs agentic-layer breakdown — SKIP excluded from both totals + # (skipped flows are documented gaps, not pending validation work). 
+ mcp_results = [r for r in RESULTS if r.category == "mcp_layer" and r.verdict != "SKIP"] + agentic_results = [r for r in RESULTS if r.category == "agentic_layer" and r.verdict != "SKIP"] + mcp_pass = sum(1 for r in mcp_results if r.verdict == "PASS") + agentic_pass = sum(1 for r in agentic_results if r.verdict == "PASS") + skipped = [r for r in RESULTS if r.verdict == "SKIP"] + print(f"\n MCP-tool surface: {mcp_pass}/{len(mcp_results)} validating tool callability") + print( + f" Agentic auto-fire: {agentic_pass}/{len(agentic_results)} " + "(skills auto-firing on natural intent — see advisories below)" + ) + if skipped: + print( + f" Skipped: {len(skipped)} " + "(deferred to interactive recording — see advisories)" + ) + + # Advisories — render for flows that have them, regardless of verdict. + # An agentic-layer flow that PASSES still earns its advisory if the prompt + # leaks tool-name hints (compromised pass). SKIP gets its own tag. + advised = [r for r in RESULTS if r.advisory] + if advised: + print("\n" + "─" * 78) + print(" ADVISORIES — flows with caveats / known gaps") + print("─" * 78) + for r in advised: + if r.verdict == "SKIP": + tag = "⏭ SKIPPED" + elif r.verdict == "PASS": + tag = "⚠️ COMPROMISED PASS" + else: + tag = "⚠️ FAILED" + print(f"\n {r.flow_id} — {tag}") + print(f" {r.advisory}") + + # What this means + if any(r.advisory for r in RESULTS): + print("\n" + "─" * 78) + print(" CORRECTION-PATH STATUS") + print("─" * 78) + print( + " The end-to-end correction dynamic ('dev contradicts spec → preflight\n" + " catches → refinement captured → PM ratifies') is NOT validated by\n" + " this headless harness. MCP tool surface is callable and functional;\n" + " agentic auto-fire is the open gap.\n\n" + " Validate the agentic layer via the interactive recording path\n" + " (tmux-driven real claude TUI). See tests/e2e/record_demo.sh." 
+ ) + print() + + +def _verdict_marker(r: FlowResult) -> str: + if r.verdict == "SKIP": + return "⏭ " + if r.verdict == "PASS" and not r.advisory: + return "✅" + if r.verdict == "PASS" and r.advisory: + return "⚠️ " # passes but compromised — caveat in advisories section + if r.verdict == "FAIL" and r.advisory: + return "⚠️ " # advisory failure — known gap, not a tool bug + return "❌" + + +def _flow_one_line(flow_id: str) -> str: + return { + "Flow 1": "ingest decisions from a doc", + "Flow 2": "auto-fire preflight before write op (auto-fire scope)", + "Flow 2a": "full correction-capture loop (ingest agent_session + resolve_collision)", + "Flow 3": "commit on bound file → ledger flips decision to `pending`", + "Flow 4": "in-session correction capture (chained dev_session)", + "Flow 5": "PM Friday review — history + ratify", + }.get(flow_id, "") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/eval/_baseline_io.py b/tests/eval/_baseline_io.py index a29b6763..fd2c3958 100644 --- a/tests/eval/_baseline_io.py +++ b/tests/eval/_baseline_io.py @@ -22,15 +22,15 @@ Noise floors: tokens 10 (deterministic, but tolerate small generator tweaks), latency 0.5ms (OS scheduler + GC jitter on non-realtime kernels). 
""" + from __future__ import annotations import json import os import platform -from datetime import datetime, timezone +from datetime import UTC, datetime, timezone from pathlib import Path - BASELINE_VERSION = "1" RELATIVE_THRESHOLD = 0.20 TOKEN_NOISE_FLOOR = 10 @@ -64,12 +64,14 @@ def load_baselines(path: Path = BASELINE_PATH) -> list[dict]: def write_baselines(rows: list[dict], path: Path = BASELINE_PATH) -> None: """Sorted, stable-key JSONL output to keep diffs minimal.""" + def _sort_key(row: dict) -> tuple: return ( row.get("metric", ""), row.get("recorded_on", ""), row.get("n_features", -1), ) + rows_sorted = sorted(rows, key=_sort_key) body = "\n".join(json.dumps(r, sort_keys=True, ensure_ascii=False) for r in rows_sorted) path.write_text(body + "\n", encoding="utf-8") @@ -154,4 +156,4 @@ def regression_check( def now_iso() -> str: - return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") diff --git a/tests/eval/_skill_judge.py b/tests/eval/_skill_judge.py index dc426ce5..014b245d 100644 --- a/tests/eval/_skill_judge.py +++ b/tests/eval/_skill_judge.py @@ -16,6 +16,7 @@ BICAMERAL_PREFLIGHT_EVAL_MODEL default "claude-sonnet-4-6" BICAMERAL_PREFLIGHT_EVAL_RECORD=1 force-bypass cache, re-record """ + from __future__ import annotations import hashlib @@ -27,7 +28,6 @@ import httpx - REPO_ROOT = Path(__file__).resolve().parents[2] SKILL_MD_PATH = REPO_ROOT / "skills" / "bicameral-preflight" / "SKILL.md" CACHE_DIR = Path(__file__).resolve().parent / "fixtures" / "skill_judge" @@ -129,7 +129,7 @@ def _extract_step1_excerpt(skill_md: str) -> str: next_header = _STEP_HEADER_RE.search(body, step1_match.end()) end = next_header.start() if next_header else len(body) - return body[step1_match.start():end].strip() + return body[step1_match.start() : end].strip() def _cache_path(model: str, skill_sha: str, input_sha: str) -> Path: @@ -169,9 +169,7 @@ 
def _call_messages_api( with httpx.Client(timeout=REQUEST_TIMEOUT_S) as client: resp = client.post(ANTHROPIC_API_URL, headers=headers, json=payload) if resp.status_code >= 400: - raise RuntimeError( - f"Anthropic API error {resp.status_code}: {resp.text[:500]}" - ) + raise RuntimeError(f"Anthropic API error {resp.status_code}: {resp.text[:500]}") data = resp.json() stop_reason = data.get("stop_reason", "") @@ -184,9 +182,7 @@ def _call_messages_api( f"(stop_reason={stop_reason!r}, text={'|'.join(text_parts)[:300]!r})" ) if stop_reason == "max_tokens": - raise RuntimeError( - f"Anthropic response hit max_tokens={MAX_OUTPUT_TOKENS}" - ) + raise RuntimeError(f"Anthropic response hit max_tokens={MAX_OUTPUT_TOKENS}") judgment = tool_use.get("input") if not isinstance(judgment, dict): raise RuntimeError(f"tool_use input is not a dict: {judgment!r}") diff --git a/tests/eval/_synthetic_ledger.py b/tests/eval/_synthetic_ledger.py index 468d70e5..0df1891f 100644 --- a/tests/eval/_synthetic_ledger.py +++ b/tests/eval/_synthetic_ledger.py @@ -10,19 +10,35 @@ fixed corpus and parameterized by index, so the payload feels plausible (not "lorem ipsum") but generation stays deterministic and zero-network. 
""" + from __future__ import annotations import random - GENERATOR_VERSION = "1" _FEATURE_NAMES: list[str] = [ - "auth", "billing", "payments", "logging", "audit", "search", "api", - "webhooks", "retention", "indexing", "ingestion", "drift-detection", - "ratification", "rate-limiting", "caching", "locking", "dedup", "ttl", - "sync", "scheduling", + "auth", + "billing", + "payments", + "logging", + "audit", + "search", + "api", + "webhooks", + "retention", + "indexing", + "ingestion", + "drift-detection", + "ratification", + "rate-limiting", + "caching", + "locking", + "dedup", + "ttl", + "sync", + "scheduling", ] @@ -131,7 +147,9 @@ def _make_decision( if status in {"reflected", "drifted"}: baseline_hash = f"{decision_index:064x}"[-64:] - current_hash = baseline_hash if status == "reflected" else f"{decision_index + 1:064x}"[-64:] + current_hash = ( + baseline_hash if status == "reflected" else f"{decision_index + 1:064x}"[-64:] + ) decision["fulfillments"] = [ { "file_path": f"{feature_id}/handler_{decision_index}.py", @@ -174,24 +192,21 @@ def generate_ledger( if n_features < 0: raise ValueError(f"n_features must be >= 0, got {n_features}") if decisions_per_feature < 0: - raise ValueError( - f"decisions_per_feature must be >= 0, got {decisions_per_feature}" - ) + raise ValueError(f"decisions_per_feature must be >= 0, got {decisions_per_feature}") rng = random.Random(seed) features: list[dict] = [] for i in range(n_features): feature_id = _feature_id(i) - decisions = [ - _make_decision(rng, feature_id, j) - for j in range(decisions_per_feature) - ] - features.append({ - "id": feature_id, - "name": feature_id.replace("-", " ").title(), - "decisions": decisions, - }) + decisions = [_make_decision(rng, feature_id, j) for j in range(decisions_per_feature)] + features.append( + { + "id": feature_id, + "name": feature_id.replace("-", " ").title(), + "decisions": decisions, + } + ) return { "features": features, diff --git a/tests/eval/_token_count.py 
b/tests/eval/_token_count.py index c6cb7936..18a30199 100644 --- a/tests/eval/_token_count.py +++ b/tests/eval/_token_count.py @@ -8,6 +8,7 @@ tiktoken is pinned in ``pyproject.toml`` ``[test]`` extras to avoid silent count drift across CI runs. """ + from __future__ import annotations import functools @@ -17,6 +18,7 @@ @functools.lru_cache(maxsize=1) def _encoder(): import tiktoken + return tiktoken.get_encoding("cl100k_base") diff --git a/tests/eval/preflight_dataset.jsonl b/tests/eval/preflight_dataset.jsonl index 8909c860..55429049 100644 --- a/tests/eval/preflight_dataset.jsonl +++ b/tests/eval/preflight_dataset.jsonl @@ -1,6 +1,6 @@ {"id": "M5", "layer": "handler", "axis": "miss", "catalog_status": "acknowledged", "title": "No file_paths supplied → no region surface (HITL also empty)", "setup": {"region_decisions": [{"decision_id": "decision:auth_jwt_ttl", "description": "JWT tokens expire after 60 minutes", "status": "reflected", "file_path": "auth/jwt.py", "symbol": "verify_token"}]}, "input": {"topic": "update auth configuration", "file_paths": []}, "expect": {"fired": false, "reason": "no_matches", "decisions_count": 0, "collision_pending_count": 0, "context_pending_ready_count": 0}, "xfail": null, "note": "Documents acknowledged behavior: when caller omits file_paths, region anchor is unreachable and only HITL/guided fire."} {"id": "M5_hitl_global", "layer": "handler", "axis": "correct", "catalog_status": "intentional", "title": "Empty file_paths but collision-pending exists → HITL fires globally", "setup": {"region_decisions": [], "collision_pending": [{"decision_id": "decision:billing_dedup", "description": "Use SETNX for payment idempotency", "status": "pending", "signoff": {"state": "collision_pending"}}]}, "input": {"topic": "update auth configuration", "file_paths": []}, "expect": {"fired": true, "reason": "fired", "decisions_count": 0, "collision_pending_count": 1, "context_pending_ready_count": 0}, "xfail": null, "note": "Validates FF4-adjacent 
contract: HITL is global and fires regardless of topic/file_paths."} -{"id": "M6", "layer": "handler", "axis": "miss", "catalog_status": "open", "title": "Transitive miss — decision pinned to a dependency of file_paths", "setup": {"region_decisions": []}, "input": {"topic": "refactor login handler", "file_paths": ["auth/login_handler.py"]}, "expect": {"fired": true, "reason": "fired", "decisions_count": 1}, "xfail": "M6 — get_neighbors transitive expansion not wired in handle_preflight; mitigation queued in catalog implementation queue", "note": "Decision is pinned to auth/jwt.py which login_handler imports; today's handler only sees the direct file."} +{"id": "M6", "layer": "handler", "axis": "miss", "catalog_status": "fixed", "title": "Transitive miss — decision pinned to a dependency of file_paths", "setup": {"region_decisions_pinned_to": {"auth/jwt.py": [{"decision_id": "decision:auth_jwt_validation", "description": "JWT signature validation must use RS256 — never HS256", "status": "reflected", "symbol": "verify_token"}]}, "graph_neighbors": {"auth/login_handler.py": ["auth/jwt.py"]}}, "input": {"topic": "refactor login handler", "file_paths": ["auth/login_handler.py"]}, "expect": {"fired": true, "reason": "fired", "decisions_count": 1}, "xfail": null, "note": "Decision is pinned to auth/jwt.py; login_handler imports jwt, so 1-hop graph expansion adds auth/jwt.py to the lookup set and the decision surfaces. 
Closed by #173/#174 (deterministic 1-hop expansion in _region_anchored_preflight)."} {"id": "M7a_dedup_ledger_change", "layer": "handler", "axis": "miss", "catalog_status": "open", "title": "Dedup window swallows fresh signal after a relevant decision lands", "calls": [{"input": {"topic": "webhook idempotency", "file_paths": ["payments/stripe.py"]}, "setup": {"region_decisions": []}}, {"input": {"topic": "webhook idempotency", "file_paths": ["payments/stripe.py"]}, "setup": {"region_decisions": [{"decision_id": "decision:wh_dedup", "description": "Webhook events deduped via Redis SETNX", "status": "reflected", "file_path": "payments/stripe.py", "symbol": "handle_webhook"}]}}], "expect_final": {"fired": true, "reason": "fired", "decisions_count": 1}, "xfail": "M7 — dedup key is (topic) only; second call hits recently_checked. Fix queued: broaden cache key to (topic, normalized_file_paths, ledger_revision).", "note": "Two-call: first call empty, ledger gains decision, second call within window currently silenced."} {"id": "M7b_dedup_file_paths_shift", "layer": "handler", "axis": "miss", "catalog_status": "open", "title": "Dedup window swallows result when file_paths shifts to a different region", "calls": [{"input": {"topic": "refactor handler", "file_paths": ["auth/login.py"]}, "setup": {"region_decisions": []}}, {"input": {"topic": "refactor handler", "file_paths": ["billing/subscriptions.py"]}, "setup": {"region_decisions": [{"decision_id": "decision:billing_proration", "description": "Pro-rate refunds on plan downgrade", "status": "reflected", "file_path": "billing/subscriptions.py", "symbol": "downgrade"}]}}], "expect_final": {"fired": true, "reason": "fired", "decisions_count": 1}, "xfail": "M7 — same dedup-key issue; file_paths must be part of the cache key.", "note": "Same topic, different file_paths — second call should re-evaluate but is silenced today."} {"id": "M7c_dedup_hitl_change", "layer": "handler", "axis": "miss", "catalog_status": "open", "title": 
"Dedup window ignores HITL state changes within window", "calls": [{"input": {"topic": "feature work session", "file_paths": []}, "setup": {"collision_pending": [{"decision_id": "decision:hitl_open", "description": "Pending collision in payments", "status": "pending", "signoff": {"state": "collision_pending"}}]}}, {"input": {"topic": "feature work session", "file_paths": []}, "setup": {"collision_pending": []}}], "expect_final": {"fired": false, "reason": "no_matches", "collision_pending_count": 0}, "xfail": "M7 — dedup ignores HITL revision. Fix queued: invalidate dedup on HITL state change.", "note": "First call surfaces HITL; HITL resolves; second call should re-evaluate (no signal → silent) but currently returns recently_checked."} diff --git a/tests/eval/run_preflight_cost_eval.py b/tests/eval/run_preflight_cost_eval.py index c03463e4..26ce982c 100644 --- a/tests/eval/run_preflight_cost_eval.py +++ b/tests/eval/run_preflight_cost_eval.py @@ -24,6 +24,7 @@ for the current platform; no assertion runs - No baseline for current platform: skip with re-record instructions """ + from __future__ import annotations import asyncio @@ -56,7 +57,6 @@ from _synthetic_ledger import GENERATOR_VERSION, generate_ledger # noqa: E402 from _token_count import count_tokens, count_tokens_json # noqa: E402 - _C3_WARMUP = 10 _C3_SAMPLES = 100 @@ -139,8 +139,10 @@ def _isolate_handler_environment(monkeypatch, tmp_path): monkeypatch.delenv("BICAMERAL_PREFLIGHT_MUTE", raising=False) monkeypatch.setenv("HOME", str(tmp_path)) import handlers.sync_middleware as sm + monkeypatch.setattr(sm, "ensure_ledger_synced", AsyncMock(return_value=None)) import handlers.preflight as pf + monkeypatch.setattr(pf, "_should_show_product_stage", lambda: False) @@ -202,21 +204,28 @@ def _build_realistic_ctx( ledger._inner = inner import ledger.queries as lq + monkeypatch.setattr( lq, "get_collision_pending_decisions", - AsyncMock(return_value=[ - _make_hitl_row(f"decision:coll-{i}", f"Collision pending 
{i}", "collision_pending") - for i in range(n_collision_pending) - ]), + AsyncMock( + return_value=[ + _make_hitl_row(f"decision:coll-{i}", f"Collision pending {i}", "collision_pending") + for i in range(n_collision_pending) + ] + ), ) monkeypatch.setattr( lq, "get_context_for_ready_decisions", - AsyncMock(return_value=[ - _make_hitl_row(f"decision:ctx-{i}", f"Context pending ready {i}", "context_pending_ready") - for i in range(n_context_pending) - ]), + AsyncMock( + return_value=[ + _make_hitl_row( + f"decision:ctx-{i}", f"Context pending ready {i}", "context_pending_ready" + ) + for i in range(n_context_pending) + ] + ), ) return SimpleNamespace( diff --git a/tests/eval/run_preflight_eval.py b/tests/eval/run_preflight_eval.py index 1a018990..6c126280 100644 --- a/tests/eval/run_preflight_eval.py +++ b/tests/eval/run_preflight_eval.py @@ -13,6 +13,7 @@ Skill-layer scenarios (M1–M4, FF1, FF3 in the catalog) are deferred to phase 2 (LLM-in-the-loop) and are not included here. """ + from __future__ import annotations import asyncio @@ -25,7 +26,6 @@ import pytest - DATASET = Path(__file__).parent / "preflight_dataset.jsonl" CATALOG = Path(__file__).parent.parent.parent / "docs" / "preflight-failure-scenarios.md" @@ -102,11 +102,60 @@ def _make_ctx(*, guided_mode: bool, sync_state: dict) -> SimpleNamespace: def _apply_setup(monkeypatch, setup: dict, ctx: SimpleNamespace) -> None: region_decisions = setup.get("region_decisions") or [] - ctx.ledger.get_decisions_for_files = AsyncMock( - return_value=[_make_decision_dict(d) for d in region_decisions] - ) + pinned_decisions = setup.get("region_decisions_pinned_to") or {} + + if pinned_decisions: + # Path-aware mock — used by M6 (graph expansion). The handler may call + # get_decisions_for_files with the caller's original paths or with + # those paths plus 1-hop neighbors; only return decisions whose + # pinned file is among the paths supplied in *this* call. 
That makes + # the test honest: M6 passes only when the expansion supplies the + # neighbor path that the decision is pinned to. + async def _path_aware_lookup(paths): + out: list[dict] = [] + for fp in paths or []: + for d in pinned_decisions.get(fp, []): + out.append(_make_decision_dict({**d, "file_path": fp})) + return out + + ctx.ledger.get_decisions_for_files = AsyncMock(side_effect=_path_aware_lookup) + else: + ctx.ledger.get_decisions_for_files = AsyncMock( + return_value=[_make_decision_dict(d) for d in region_decisions] + ) + + # Optional graph-neighbor topology for M6-style scenarios. When set, attach + # a stub code_graph adapter to ctx that expands file_paths by 1 hop using + # the supplied dict (file_path → list[neighbor_file_path]). When absent, + # leave ctx without a code_graph attribute — preflight's expansion path + # is defensive (`getattr(ctx, "code_graph", None)`) and falls back to + # exact-match-only retrieval. + graph_neighbors = setup.get("graph_neighbors") or {} + if graph_neighbors: + + class _DatasetCodeGraph: + def expand_file_paths_via_graph( + self, file_paths: list[str], hops: int = 1 + ) -> tuple[list[str], list[str]]: + expanded: list[str] = [] + added: list[str] = [] + seen: set[str] = set() + for fp in file_paths or []: + if fp and fp not in seen: + seen.add(fp) + expanded.append(fp) + for fp in file_paths or []: + for n in graph_neighbors.get(fp, []): + if n and n not in seen: + seen.add(n) + expanded.append(n) + added.append(n) + return expanded, added + + ctx.code_graph = _DatasetCodeGraph() import ledger.queries as lq + monkeypatch.setattr( lq, "get_collision_pending_decisions", @@ -124,8 +173,10 @@ def _isolate_handler_environment(monkeypatch, tmp_path): monkeypatch.delenv("BICAMERAL_PREFLIGHT_MUTE", raising=False) monkeypatch.setenv("HOME", str(tmp_path)) import handlers.sync_middleware as sm + monkeypatch.setattr(sm, "ensure_ledger_synced", AsyncMock(return_value=None)) import handlers.preflight as pf + 
monkeypatch.setattr(pf, "_should_show_product_stage", lambda: False) diff --git a/tests/eval/run_preflight_skill_eval.py b/tests/eval/run_preflight_skill_eval.py index 82290511..b60057aa 100644 --- a/tests/eval/run_preflight_skill_eval.py +++ b/tests/eval/run_preflight_skill_eval.py @@ -20,6 +20,7 @@ miss/false-fire rows (M1-M4, FF1, FF3 in the catalog). A failure here is real signal: the LLM did not recover the failure mode the row models. """ + from __future__ import annotations import json @@ -40,7 +41,6 @@ judge_relevance, ) - DATASET = Path(__file__).parent / "preflight_skill_dataset.jsonl" REQUIRED_KEYS = {"id", "axis", "title", "topic", "ledger", "expect_relevant"} diff --git a/tests/eval/test_cost_baseline_helpers.py b/tests/eval/test_cost_baseline_helpers.py index 02009024..ef828ced 100644 --- a/tests/eval/test_cost_baseline_helpers.py +++ b/tests/eval/test_cost_baseline_helpers.py @@ -4,6 +4,7 @@ - Synthetic ledger generator: determinism, shape, scaling, status distribution - Token counter: basic call, JSON-serialized payloads, monotonicity """ + from __future__ import annotations import sys @@ -28,7 +29,6 @@ ) from _token_count import count_tokens, count_tokens_json # noqa: E402 - # ── Generator: determinism ────────────────────────────────────────────── @@ -50,7 +50,11 @@ def test_generator_diverges_for_different_seeds(): def test_generator_top_level_shape(): ledger = generate_ledger(n_features=10) assert set(ledger.keys()) >= { - "features", "truncated", "total_features", "as_of", "sync_metrics", + "features", + "truncated", + "total_features", + "as_of", + "sync_metrics", "_generator_version", } assert ledger["total_features"] == 10 @@ -78,12 +82,7 @@ def test_generator_decision_shape(): def test_drifted_decision_has_drift_evidence_and_fulfillment(): ledger = generate_ledger(n_features=200, seed=42) - drifted = [ - d - for f in ledger["features"] - for d in f["decisions"] - if d["status"] == "drifted" - ] + drifted = [d for f in ledger["features"] for 
d in f["decisions"] if d["status"] == "drifted"] assert drifted, "expected at least one drifted decision at N=200" for d in drifted: assert d["drift_evidence"], "drifted decisions must carry drift_evidence" @@ -93,10 +92,7 @@ def test_drifted_decision_has_drift_evidence_and_fulfillment(): def test_ungrounded_decision_has_no_fulfillment(): ledger = generate_ledger(n_features=200, seed=42) ungrounded = [ - d - for f in ledger["features"] - for d in f["decisions"] - if d["status"] == "ungrounded" + d for f in ledger["features"] for d in f["decisions"] if d["status"] == "ungrounded" ] assert ungrounded, "expected at least one ungrounded decision at N=200" for d in ungrounded: diff --git a/tests/eval_decision_relevance.py b/tests/eval_decision_relevance.py index ada27cf8..397af463 100644 --- a/tests/eval_decision_relevance.py +++ b/tests/eval_decision_relevance.py @@ -31,6 +31,7 @@ The fixture is the single source of truth for corpus + oracle. Adding a new transcript = one entry in TRANSCRIPT_SOURCES. No runner changes. """ + from __future__ import annotations import argparse @@ -79,9 +80,7 @@ def _build_payload_from_fixture(source_ref: str) -> dict: } -def _build_payload_from_skill_md( - transcript_text: str, source_ref: str -) -> tuple[dict, list[dict]]: +def _build_payload_from_skill_md(transcript_text: str, source_ref: str) -> tuple[dict, list[dict]]: """Call the headless extraction driver (Step 1 of the current SKILL.md) and shape the result as a natural-format ingest payload. 
@@ -136,9 +135,7 @@ async def _ingest_one( if skill_variant == "none": payload = _build_payload_from_fixture(source_ref) elif skill_variant == "from-skill-md": - payload, extracted_decisions = _build_payload_from_skill_md( - transcript_text, source_ref - ) + payload, extracted_decisions = _build_payload_from_skill_md(transcript_text, source_ref) else: raise ValueError(f"unknown skill-variant: {skill_variant!r}") @@ -155,9 +152,7 @@ async def _ingest_one( # its input, so comparing it against itself would be tautological). if skill_variant == "from-skill-md": ground_truth = load_fixture(source_ref) - extraction_metrics = compute_extraction_metrics( - extracted_decisions, ground_truth - ) + extraction_metrics = compute_extraction_metrics(extracted_decisions, ground_truth) else: extraction_metrics = {"skipped": True, "reason": "not applicable in this variant"} @@ -306,11 +301,8 @@ async def run(args) -> tuple[dict, int]: # repo boundaries — precision/recall of the skill is a global property). 
sys.path.insert(0, str(Path(__file__).resolve().parent)) from _extraction_metrics import aggregate_extraction_metrics # type: ignore[import-not-found] - all_extraction_rows = [ - t["extraction_metrics"] - for r in repo_reports - for t in r["transcripts"] - ] + + all_extraction_rows = [t["extraction_metrics"] for r in repo_reports for t in r["transcripts"]] aggregate_extraction = aggregate_extraction_metrics(all_extraction_rows) combined = { @@ -378,8 +370,7 @@ async def run(args) -> tuple[dict, int]: exit_code = 1 if exit_code == 0 and args.min_grounded_pct is not None: print( - f"\n✅ PASS: grounded_pct {aggregate_pct:.3f} " - f"≥ threshold {args.min_grounded_pct:.3f}" + f"\n✅ PASS: grounded_pct {aggregate_pct:.3f} ≥ threshold {args.min_grounded_pct:.3f}" ) return combined, exit_code diff --git a/tests/fixtures/expected/decisions.py b/tests/fixtures/expected/decisions.py index 4f65c5b2..d947bbd1 100644 --- a/tests/fixtures/expected/decisions.py +++ b/tests/fixtures/expected/decisions.py @@ -20,7 +20,13 @@ { "description": "Add 12-second timeout ceiling on payment provider authorize calls; return requires_more status on timeout", "source_ref": "medusa-payment-timeout", - "keywords": ["payment timeout", "authorize call", "12 second", "requires_more", "checkout timeout"], + "keywords": [ + "payment timeout", + "authorize call", + "12 second", + "requires_more", + "checkout timeout", + ], "expected_symbols": [ "PaymentProviderService", ], @@ -31,7 +37,13 @@ { "description": "Background sweeper job via JobSchedulerService: void payment sessions stuck in pending state for more than 5 minutes", "source_ref": "medusa-payment-timeout", - "keywords": ["sweeper job", "pending payment session", "void", "5 minutes", "job scheduler"], + "keywords": [ + "sweeper job", + "pending payment session", + "void", + "5 minutes", + "job scheduler", + ], "expected_symbols": [ "PaymentProviderService", ], @@ -54,7 +66,13 @@ { "description": "Guard against garbage responses from community 
payment providers — throw typed error if authorize returns undefined or malformed object", "source_ref": "medusa-payment-timeout", - "keywords": ["validate provider response", "community provider", "undefined response", "typed error", "authorize response"], + "keywords": [ + "validate provider response", + "community provider", + "undefined response", + "typed error", + "authorize response", + ], "expected_symbols": [ "PaymentProviderService", ], @@ -69,7 +87,13 @@ { "description": "Migrate plugin service classes from TransactionBaseService to AbstractModuleService using @Module decorator", "source_ref": "medusa-plugin-migration", - "keywords": ["plugin migration", "AbstractModuleService", "@Module decorator", "TransactionBaseService", "v2 module"], + "keywords": [ + "plugin migration", + "AbstractModuleService", + "@Module decorator", + "TransactionBaseService", + "v2 module", + ], "expected_symbols": [ "AbstractModuleService", ], @@ -80,7 +104,13 @@ { "description": "Convert plugin subscribers to createWorkflow/createStep pattern; subscribers directory no longer auto-registers in v2", "source_ref": "medusa-plugin-migration", - "keywords": ["subscribers", "createWorkflow", "createStep", "workflow migration", "event subscriber"], + "keywords": [ + "subscribers", + "createWorkflow", + "createStep", + "workflow migration", + "event subscriber", + ], "expected_symbols": [ "createWorkflow", "createStep", @@ -92,7 +122,13 @@ { "description": "Service injection must go through Modules registry — no direct imports of core services from other modules", "source_ref": "medusa-plugin-migration", - "keywords": ["Modules registry", "service injection", "no direct imports", "awilix scoping", "module isolation"], + "keywords": [ + "Modules registry", + "service injection", + "no direct imports", + "awilix scoping", + "module isolation", + ], "expected_symbols": [ "Modules", "OrderService", @@ -105,7 +141,13 @@ { "description": "Run v1 and v2 API routes in parallel for one release 
cycle using middlewares.ts pattern", "source_ref": "medusa-plugin-migration", - "keywords": ["backward compat", "v1 routes", "parallel routes", "middlewares.ts", "legacy API"], + "keywords": [ + "backward compat", + "v1 routes", + "parallel routes", + "middlewares.ts", + "legacy API", + ], "expected_symbols": [ "middlewares", ], @@ -120,7 +162,13 @@ { "description": "Create WebhookEndpoint model with fields: URL, HMAC secret, subscribed event types, per-merchant", "source_ref": "medusa-webhook-notifications", - "keywords": ["WebhookEndpoint", "merchant webhook", "webhook model", "HMAC secret", "event subscription"], + "keywords": [ + "WebhookEndpoint", + "merchant webhook", + "webhook model", + "HMAC secret", + "event subscription", + ], "expected_symbols": [ "AbstractNotificationProviderService", ], @@ -131,7 +179,13 @@ { "description": "Exponential backoff retry: 30s initial delay, max 4h, 6 retries then dead-letter queue to Redis Streams", "source_ref": "medusa-webhook-notifications", - "keywords": ["exponential backoff", "retry webhook", "dead letter queue", "6 retries", "Redis DLQ"], + "keywords": [ + "exponential backoff", + "retry webhook", + "dead letter queue", + "6 retries", + "Redis DLQ", + ], "expected_symbols": [], "expected_file_patterns": ["webhook", "retry"], "prd_failure_mode": "CONSTRAINT_LOST", # Retry policy is an explicit constraint @@ -149,7 +203,12 @@ { "description": "Include idempotency key (UUID per delivery attempt) in webhook payload so merchants can deduplicate", "source_ref": "medusa-webhook-notifications", - "keywords": ["idempotency key", "webhook deduplication", "UUID delivery", "delivery attempt"], + "keywords": [ + "idempotency key", + "webhook deduplication", + "UUID delivery", + "delivery attempt", + ], "expected_symbols": [], "expected_file_patterns": ["webhook"], "prd_failure_mode": "CONSTRAINT_LOST", @@ -163,7 +222,13 @@ { "description": "Synchronous validation hooks in checkout pipeline that can reject operations — plugin 
raises ValidationError that propagates through GraphQL", "source_ref": "saleor-checkout-extensibility", - "keywords": ["checkout validation", "synchronous hooks", "ValidationError", "reject operation", "pre-validation"], + "keywords": [ + "checkout validation", + "synchronous hooks", + "ValidationError", + "reject operation", + "pre-validation", + ], "expected_symbols": [ "PluginsManager", "CheckoutError", @@ -175,7 +240,13 @@ { "description": "Circuit breaker: 3 consecutive validation endpoint timeouts — skip that plugin for subsequent checkouts; per-app per-event-type tracking in Redis sliding window", "source_ref": "saleor-checkout-extensibility", - "keywords": ["circuit breaker", "validation timeout", "3 consecutive failures", "skip plugin", "sliding window"], + "keywords": [ + "circuit breaker", + "validation timeout", + "3 consecutive failures", + "skip plugin", + "sliding window", + ], "expected_symbols": [], "expected_file_patterns": ["checkout", "plugin", "circuit"], "prd_failure_mode": "CONSTRAINT_LOST", @@ -185,7 +256,13 @@ { "description": "Cache checkout validation results in Redis keyed by last_change timestamp with TTL; invalidate on line changes, address updates, or shipping method changes", "source_ref": "saleor-checkout-extensibility", - "keywords": ["cache validation", "last_change", "Redis TTL", "checkout cache", "validation cache"], + "keywords": [ + "cache validation", + "last_change", + "Redis TTL", + "checkout cache", + "validation cache", + ], "expected_symbols": [ "Checkout", ], @@ -196,7 +273,12 @@ { "description": "Plugins receive serialized checkout data, not raw querysets — security boundary to prevent third-party data access", "source_ref": "saleor-checkout-extensibility", - "keywords": ["plugin data access", "serialized data", "security boundary", "not raw queryset"], + "keywords": [ + "plugin data access", + "serialized data", + "security boundary", + "not raw queryset", + ], "expected_symbols": [ "PluginsManager", ], @@ -212,7 
+294,13 @@ { "description": "Channel-scoped JWT permissions: permission claim becomes dict mapping codename to list of channel slugs or ['*'] for global; existing flat format treated as all-channels for backward compat", "source_ref": "saleor-graphql-permissions", - "keywords": ["channel permissions", "JWT scoped", "channel slug", "permission_required", "backward compat"], + "keywords": [ + "channel permissions", + "JWT scoped", + "channel slug", + "permission_required", + "backward compat", + ], "expected_symbols": [ "check_permissions", "effective_permissions", @@ -224,7 +312,11 @@ { "description": "Gate checkoutComplete mutation on channel permission before any side effects — order creation, payment processing, webhooks", "source_ref": "saleor-graphql-permissions", - "keywords": ["checkoutComplete permission", "gate before side effects", "early permission check"], + "keywords": [ + "checkoutComplete permission", + "gate before side effects", + "early permission check", + ], "expected_symbols": [ "checkoutComplete", "check_permissions", @@ -237,7 +329,12 @@ { "description": "App model: add channel_access relationship so third-party apps only access channels they are installed for", "source_ref": "saleor-graphql-permissions", - "keywords": ["app channel access", "channel_access", "third-party app permission", "app installed channels"], + "keywords": [ + "app channel access", + "channel_access", + "third-party app permission", + "app installed channels", + ], "expected_symbols": [ "App", ], @@ -252,7 +349,13 @@ { "description": "Wrap decrease_stock and allocation cleanup in transaction.atomic — currently separate operations causing orphaned allocation records when decrease_stock succeeds but cleanup fails", "source_ref": "saleor-order-workflows", - "keywords": ["transaction.atomic", "decrease_stock", "allocation cleanup", "orphaned allocation", "stock transaction"], + "keywords": [ + "transaction.atomic", + "decrease_stock", + "allocation cleanup", + "orphaned 
allocation", + "stock transaction", + ], "expected_symbols": [ "decrease_stock", "orderFulfill", @@ -264,7 +367,13 @@ { "description": "Defer FULFILLMENT_CREATED webhook dispatch to Django on_commit hook — currently fires before stock operations complete causing stale data in downstream systems", "source_ref": "saleor-order-workflows", - "keywords": ["on_commit", "webhook timing", "FULFILLMENT_CREATED", "defer webhook", "after transaction"], + "keywords": [ + "on_commit", + "webhook timing", + "FULFILLMENT_CREATED", + "defer webhook", + "after transaction", + ], "expected_symbols": [ "fulfillment_created", "FULFILLMENT_CREATED", @@ -277,7 +386,12 @@ { "description": "Fix update_order_status: missing RETURNED status handling causes orders to stay FULFILLED even after all fulfillments are returned", "source_ref": "saleor-order-workflows", - "keywords": ["update_order_status", "RETURNED status", "fulfillment status sync", "order status bug"], + "keywords": [ + "update_order_status", + "RETURNED status", + "fulfillment status sync", + "order status bug", + ], "expected_symbols": [ "update_order_status", ], @@ -288,7 +402,12 @@ { "description": "Database constraint on Stock: quantity cannot go negative; decrease_stock can produce negative values in race condition", "source_ref": "saleor-order-workflows", - "keywords": ["stock constraint", "negative quantity", "race condition", "database constraint"], + "keywords": [ + "stock constraint", + "negative quantity", + "race condition", + "database constraint", + ], "expected_symbols": [ "Stock", "decrease_stock", @@ -304,7 +423,13 @@ { "description": "Custom ProductVariantPriceUpdateStrategy: strip tax in source channel, convert currency using TaxRateService, reapply destination zone rate; iterate per currency per channel not per channel", "source_ref": "vendure-channel-pricing", - "keywords": ["ProductVariantPriceUpdateStrategy", "currency conversion", "tax stripping", "multi-channel pricing", "InjectableStrategy"], + 
"keywords": [ + "ProductVariantPriceUpdateStrategy", + "currency conversion", + "tax stripping", + "multi-channel pricing", + "InjectableStrategy", + ], "expected_symbols": [ "ProductVariantPriceUpdateStrategy", "TaxRateService", @@ -345,7 +470,12 @@ { "description": "struct type custom field warning: stores as simple-json, no SQL-level querying or indexing on sub-fields — do not use struct if you need to filter on nested values", "source_ref": "vendure-custom-fields", - "keywords": ["struct custom field", "simple-json", "no SQL indexing", "nested field warning"], + "keywords": [ + "struct custom field", + "simple-json", + "no SQL indexing", + "nested field warning", + ], "expected_symbols": [], "expected_file_patterns": ["custom", "shared-types"], "prd_failure_mode": "TRIBAL_KNOWLEDGE", @@ -359,7 +489,13 @@ { "description": "Enable bufferUpdates on DefaultSearchPlugin to deduplicate by entity ID during bulk imports; switch from SqlJobQueueStrategy to BullMQJobQueuePlugin", "source_ref": "vendure-search-reindexing", - "keywords": ["bufferUpdates", "BullMQJobQueuePlugin", "search reindex", "SqlJobQueueStrategy", "bulk import"], + "keywords": [ + "bufferUpdates", + "BullMQJobQueuePlugin", + "search reindex", + "SqlJobQueueStrategy", + "bulk import", + ], "expected_symbols": [ "DefaultSearchPlugin", "BullMQJobQueuePlugin", @@ -372,7 +508,12 @@ { "description": "Split workers using activeQueues option: dedicated search worker plus general worker so reindex does not block order confirmation emails", "source_ref": "vendure-search-reindexing", - "keywords": ["activeQueues", "split workers", "dedicated search worker", "worker isolation"], + "keywords": [ + "activeQueues", + "split workers", + "dedicated search worker", + "worker isolation", + ], "expected_symbols": [], "expected_file_patterns": ["search", "worker", "config"], "prd_failure_mode": "CONSTRAINT_LOST", @@ -381,7 +522,12 @@ { "description": "Performance targets: reindex p95 search latency under 200ms (was 800ms 
during reindex), database CPU under 50% during full reindex", "source_ref": "vendure-search-reindexing", - "keywords": ["search latency 200ms", "database CPU reindex", "p95 latency", "reindex performance"], + "keywords": [ + "search latency 200ms", + "database CPU reindex", + "p95 latency", + "reindex performance", + ], "expected_symbols": ["DefaultSearchPlugin"], "expected_file_patterns": ["search-plugin", "search-strategy", "reindex"], "prd_failure_mode": "CONSTRAINT_LOST", @@ -437,7 +583,13 @@ { "description": "Drift detection flow: detect changed files in a commit, look up intents grounded to those files, recompute status via hash comparison, update intent status", "source_ref": "bicameral-mcp-multi-region", - "keywords": ["drift detection", "link_commit", "derive_status", "hash comparison", "detect_drift"], + "keywords": [ + "drift detection", + "link_commit", + "derive_status", + "hash comparison", + "detect_drift", + ], "expected_symbols": [ "handle_link_commit", "handle_detect_drift", @@ -457,7 +609,13 @@ { "description": "Team collaboration mode: dual-write adapter intercepts mutations, emits event files, materializes peer events on startup for multi-user ledger sync", "source_ref": "bicameral-mcp-multi-region", - "keywords": ["team mode", "dual-write", "event sourcing", "TeamWriteAdapter", "materializer"], + "keywords": [ + "team mode", + "dual-write", + "event sourcing", + "TeamWriteAdapter", + "materializer", + ], "expected_symbols": [ "TeamWriteAdapter", "EventFileWriter", diff --git a/tests/fixtures/flow2_prompt.json b/tests/fixtures/flow2_prompt.json new file mode 100644 index 00000000..b29abc4f --- /dev/null +++ b/tests/fixtures/flow2_prompt.json @@ -0,0 +1,3 @@ +{ + "prompt": "I know the roadmap said drag-and-drop to reorder commits, but actually we're switching to a text-editor approach. Please update cherry-pick.ts and reorder.ts." 
+} diff --git a/tests/generate_e2e_report.py b/tests/generate_e2e_report.py index 2ec43a96..9771246b 100644 --- a/tests/generate_e2e_report.py +++ b/tests/generate_e2e_report.py @@ -15,7 +15,7 @@ import json import sys -from datetime import datetime, timezone +from datetime import UTC, datetime, timezone from pathlib import Path E2E_DIR = Path(__file__).parent.parent / "test-results" / "e2e" @@ -112,11 +112,12 @@ def _render_json(data: dict, max_lines: int = 40) -> str: text += f"\n... ({len(raw.split(chr(10))) - max_lines} more lines)" # Basic syntax coloring import re + text = text.replace("&", "&").replace("<", "<").replace(">", ">") text = re.sub(r'"([^"]*)"(?=\s*:)', r'<span style="color:#a88af0">"\1"</span>', text) text = re.sub(r':\s*"([^"]*)"', r': <span style="color:#6af0a0">"\1"</span>', text) - text = re.sub(r':\s*(\d+\.?\d*)', r': <span style="color:#4af0c4">\1</span>', text) - text = re.sub(r':\s*(true|false|null)', r': <span style="color:#f0b94a">\1</span>', text) + text = re.sub(r":\s*(\d+\.?\d*)", r': <span style="color:#4af0c4">\1</span>', text) + text = re.sub(r":\s*(true|false|null)", r': <span style="color:#f0b94a">\1</span>', text) return text @@ -141,19 +142,23 @@ def _render_graph_section(graph: dict) -> str: nid = str(intent.get("id", "")) desc = str(intent.get("description", ""))[:50] status = intent.get("cached_status", "—") - cy_elements.append({ - "data": {"id": nid, "label": desc, "status": status, "type": "intent"}, - "classes": "intent", - }) + cy_elements.append( + { + "data": {"id": nid, "label": desc, "status": status, "type": "intent"}, + "classes": "intent", + } + ) node_id_set.add(nid) for symbol in nodes.get("symbols", []): nid = str(symbol.get("id", "")) name = str(symbol.get("name", nid)) - cy_elements.append({ - "data": {"id": nid, "label": name, "type": "symbol"}, - "classes": "symbol", - }) + cy_elements.append( + { + "data": {"id": nid, "label": name, "type": "symbol"}, + "classes": "symbol", + } + ) node_id_set.add(nid) 
for region in nodes.get("code_regions", []): @@ -161,10 +166,12 @@ def _render_graph_section(graph: dict) -> str: fp = str(region.get("file_path", "?")) sym = str(region.get("symbol", "")) label = f"{sym}\n{fp.split('/')[-1]}" if sym else fp.split("/")[-1] - cy_elements.append({ - "data": {"id": nid, "label": label, "file": fp, "type": "code_region"}, - "classes": "code_region", - }) + cy_elements.append( + { + "data": {"id": nid, "label": label, "file": fp, "type": "code_region"}, + "classes": "code_region", + } + ) node_id_set.add(nid) for edge_type, edge_list in edges.items(): @@ -174,14 +181,16 @@ def _render_graph_section(graph: dict) -> str: src = str(edge.get("out", "")) tgt = str(edge.get("in", "")) if src in node_id_set and tgt in node_id_set: - cy_elements.append({ - "data": { - "id": f"e_{edge_type}_{i}_{_graph_counter}", - "source": src, - "target": tgt, - "label": edge_type, - }, - }) + cy_elements.append( + { + "data": { + "id": f"e_{edge_type}_{i}_{_graph_counter}", + "source": src, + "target": tgt, + "label": edge_type, + }, + } + ) elements_json = json.dumps(cy_elements, default=str) @@ -196,25 +205,30 @@ def _render_graph_section(graph: dict) -> str: for intent in nodes.get("intents", []): desc = str(intent.get("description", ""))[:80] status = intent.get("cached_status", "—") - color = {"reflected": "#6af0a0", "drifted": "#f06a6a", "pending": "#f0b94a", "ungrounded": "#4ab8f0"}.get(status, "#6b7699") + color = { + "reflected": "#6af0a0", + "drifted": "#f06a6a", + "pending": "#f0b94a", + "ungrounded": "#4ab8f0", + }.get(status, "#6b7699") intent_rows += f'<tr><td class="mono">{str(intent.get("id", "?"))[-12:]}</td><td>{desc}</td><td style="color:{color};font-weight:600">{status}</td></tr>\n' region_rows = "" for region in nodes.get("code_regions", []): fp = str(region.get("file_path", "?")) sym = str(region.get("symbol", "?")) - lines = f'{region.get("start_line", "?")}-{region.get("end_line", "?")}' + lines = f"{region.get('start_line', 
'?')}-{region.get('end_line', '?')}" region_rows += f'<tr><td class="mono">{fp}</td><td>{sym}</td><td>{lines}</td></tr>\n' tables_html = "" if intent_rows: - tables_html += f'''<h4 style="color:#a88af0;margin:12px 0 6px">Intents</h4> + tables_html += f"""<h4 style="color:#a88af0;margin:12px 0 6px">Intents</h4> <table class="data-table"><tr><th>ID</th><th>Description</th><th>Status</th></tr> -{intent_rows}</table>''' +{intent_rows}</table>""" if region_rows: - tables_html += f'''<h4 style="color:#4af0c4;margin:12px 0 6px">Code Regions</h4> + tables_html += f"""<h4 style="color:#4af0c4;margin:12px 0 6px">Code Regions</h4> <table class="data-table"><tr><th>File</th><th>Symbol</th><th>Lines</th></tr> -{region_rows}</table>''' +{region_rows}</table>""" return f''' <div class="graph-summary">{summary}</div> @@ -317,7 +331,7 @@ def _render_graph_section(graph: dict) -> str: def generate() -> str: global _graph_counter _graph_counter = 0 - now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + now = datetime.now(UTC).strftime("%Y-%m-%d %H:%M UTC") sections_html = "" total_artifacts = 0 @@ -330,37 +344,37 @@ def generate() -> str: response_panels = "" for resp in responses: rendered = _render_json(resp["data"]) - response_panels += f''' + response_panels += f""" <details class="artifact-panel"> <summary>{resp["name"].replace("_", " ").title()}</summary> <pre class="json-output">{rendered}</pre> -</details>''' +</details>""" # Graph panels graph_panels = "" for graph in graphs: graph_html = _render_graph_section(graph["data"]) c = graph["data"].get("counts", {}) - graph_panels += f''' + graph_panels += f""" <div class="artifact-panel graph-panel" style="padding:14px;"> <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;"> <span style="color:var(--accent);font-weight:600;font-size:13px;">Knowledge Graph — {c.get("intents", 0)} intents, {c.get("symbols", 0)} symbols, {c.get("code_regions", 0)} regions</span> <div 
class="cy-legend"><span class="lg-intent">intent</span><span class="lg-symbol">symbol</span><span class="lg-region">code_region</span></div> </div> {graph_html} -</div>''' +</div>""" has_content = responses or graphs - sections_html += f''' + sections_html += f""" <div class="sdlc-section" style="border-left-color:{section["color"]}"> <div class="sdlc-badge" style="color:{section["color"]}">{section["sdlc"]}</div> <h3>{section["title"]}</h3> <p class="sdlc-desc">{section["description"]}</p> <div class="tools-used">Tools: <span class="mono">{section["tools"]}</span></div> {"<div class='artifacts'>" + response_panels + graph_panels + "</div>" if has_content else '<p class="no-artifacts">No artifacts generated — test may not have run.</p>'} -</div>''' +</div>""" - return f'''<!DOCTYPE html> + return f"""<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> @@ -445,7 +459,7 @@ def generate() -> str: </div> </body> -</html>''' +</html>""" def main(): diff --git a/tests/regen_extraction_fixtures.py b/tests/regen_extraction_fixtures.py index eccebbca..60585f49 100644 --- a/tests/regen_extraction_fixtures.py +++ b/tests/regen_extraction_fixtures.py @@ -34,18 +34,18 @@ After running, `git diff tests/fixtures/extraction/` should show the new/changed fixtures. Review, hand-edit if needed, commit. 
""" + from __future__ import annotations import argparse import json import sys -from datetime import datetime, timezone +from datetime import UTC, datetime, timezone from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parents[1])) sys.path.insert(0, str(Path(__file__).resolve().parent)) -from fixtures.expected.decisions import TRANSCRIPT_SOURCES # noqa: E402 from _extract_headless import ( # noqa: E402 (sibling module) DEFAULT_MODEL, SKILL_MD_PATH, @@ -53,6 +53,7 @@ _sha, extract_from_current_skill, ) +from fixtures.expected.decisions import TRANSCRIPT_SOURCES # noqa: E402 MCP_ROOT = Path(__file__).resolve().parents[1] FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures" / "extraction" @@ -104,7 +105,7 @@ def _regenerate_one( "transcript_path": src["transcript"], "repo_key": src["repo_key"], "generated_by": model, - "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "generated_at": datetime.now(UTC).isoformat(timespec="seconds"), "skill_md_sha": _sha(skill_md)[:12], "decisions": extracted.get("decisions", []), "action_items": extracted.get("action_items", []), diff --git a/tests/test_alpha_contract.py b/tests/test_alpha_contract.py index 62873847..481064d7 100644 --- a/tests/test_alpha_contract.py +++ b/tests/test_alpha_contract.py @@ -26,6 +26,7 @@ real commits — labeled under one suite so the v0.7.0 refactor can be gated on it. 
""" + from __future__ import annotations import subprocess @@ -44,7 +45,6 @@ from handlers.search_decisions import handle_search_decisions from handlers.sync_middleware import ensure_ledger_synced, get_session_start_banner - # ── Git + ingest helpers ───────────────────────────────────────────── @@ -104,14 +104,16 @@ def _ingest_payload(description: str, *, with_region: bool, signoff: bool) -> di "code_regions": [], } if with_region: - mapping["code_regions"] = [{ - "file_path": "impl.py", - "symbol": "fetch_user", - "type": "function", - "start_line": 1, - "end_line": 3, - "purpose": description, - }] + mapping["code_regions"] = [ + { + "file_path": "impl.py", + "symbol": "fetch_user", + "type": "function", + "start_line": 1, + "end_line": 3, + "purpose": description, + } + ] if signoff: mapping["signoff"] = { "state": "ratified", @@ -210,21 +212,28 @@ async def test_ingest_bind_commit_marks_reflected(alpha_env): # Decision is searchable by description tokens (invariant 1 — "searchable # by feature area"). Uses BM25 via handle_search_decisions. search_resp = await handle_search_decisions( - ctx, query="JWT session authentication", max_results=5, + ctx, + query="JWT session authentication", + max_results=5, ) assert any(m.decision_id == decision_id for m in search_resp.matches), ( "ingested decision must be retrievable via BM25 search" ) # 2. Caller-LLM bind (invariant 2, author-attested via provenance=caller_llm). 
- bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - "purpose": "JWT validation entrypoint", - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + "purpose": "JWT validation entrypoint", + } + ], + ) assert len(bind_resp.bindings) == 1 b = bind_resp.bindings[0] assert b.error is None, f"bind failed: {b.error}" @@ -235,14 +244,16 @@ async def test_ingest_bind_commit_marks_reflected(alpha_env): rc_resp = await handle_resolve_compliance( ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "fetch_user performs JWT lookup as decided.", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "fetch_user performs JWT lookup as decided.", + } + ], ) assert len(rc_resp.accepted) == 1 assert not rc_resp.rejected @@ -274,26 +285,34 @@ async def test_code_edit_without_rebind_marks_drifted(alpha_env): ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) b = bind_resp.bindings[0] assert b.error is None await handle_resolve_compliance( - ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": 
"compliant", - "confidence": "high", - "explanation": "baseline verified", - }], + ctx, + phase="ingest", + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "baseline verified", + } + ], ) assert await _decision_status(ctx, decision_id) == "reflected" @@ -392,13 +411,18 @@ async def test_preflight_surfaces_bound_decisions(monkeypatch, alpha_env): ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) assert bind_resp.bindings[0].error is None pf_resp = await handle_preflight( @@ -410,8 +434,7 @@ async def test_preflight_surfaces_bound_decisions(monkeypatch, alpha_env): assert "region" in pf_resp.sources_chained decision_ids = [d.decision_id for d in pf_resp.decisions] assert decision_id in decision_ids, ( - f"bound decision {decision_id} missing from preflight response " - f"(got: {decision_ids})" + f"bound decision {decision_id} missing from preflight response (got: {decision_ids})" ) @@ -441,26 +464,34 @@ async def test_hook_no_fire_still_syncs(alpha_env): ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) b = bind_resp.bindings[0] assert b.error is None await 
handle_resolve_compliance( - ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "baseline", - }], + ctx, + phase="ingest", + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "baseline", + } + ], ) assert await _decision_status(ctx, decision_id) == "reflected" diff --git a/tests/test_alpha_flow.py b/tests/test_alpha_flow.py index aeace213..d8c15230 100644 --- a/tests/test_alpha_flow.py +++ b/tests/test_alpha_flow.py @@ -18,6 +18,7 @@ Plus one v0.7-specific invariant: 6. Proposal state — new ingests enter as 'proposal'; drift-exempt until ratified. """ + from __future__ import annotations import os @@ -38,7 +39,6 @@ from handlers.sync_middleware import ensure_ledger_synced, get_session_start_banner from ledger.queries import project_decision_status - # ── Shared helpers ─────────────────────────────────────────────────── @@ -96,14 +96,16 @@ def _ratified_payload(description: str, *, with_region: bool = False) -> dict: }, } if with_region: - mapping["code_regions"] = [{ - "file_path": "impl.py", - "symbol": "fetch_user", - "type": "function", - "start_line": 1, - "end_line": 3, - "purpose": description, - }] + mapping["code_regions"] = [ + { + "file_path": "impl.py", + "symbol": "fetch_user", + "type": "function", + "start_line": 1, + "end_line": 3, + "purpose": description, + } + ] return {"query": description, "repo": "jacob-repo", "mappings": [mapping]} @@ -139,9 +141,13 @@ async def test_ingest_bind_commit_marks_reflected(alpha_env): ctx, _ = alpha_env # Invariant 1: ingest lands in ledger, searchable. 
- ingest_resp = await handle_ingest(ctx, _ratified_payload( - "JWT is the session-auth primitive, not cookies.", with_region=False, - )) + ingest_resp = await handle_ingest( + ctx, + _ratified_payload( + "JWT is the session-auth primitive, not cookies.", + with_region=False, + ), + ) assert ingest_resp.ingested assert len(ingest_resp.pending_grounding_decisions) == 1 decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] @@ -152,26 +158,37 @@ async def test_ingest_bind_commit_marks_reflected(alpha_env): ) # Invariant 2: bind is author-attested. - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) b = bind_resp.bindings[0] assert b.error is None, f"Invariant 2 FAIL: bind error: {b.error}" assert b.region_id and b.content_hash # Invariant 3: compliant verdict + ratified signoff → reflected. 
- rc = await handle_resolve_compliance(ctx, phase="ingest", verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "fetch_user performs JWT lookup as decided.", - }]) + rc = await handle_resolve_compliance( + ctx, + phase="ingest", + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "fetch_user performs JWT lookup as decided.", + } + ], + ) assert len(rc.accepted) == 1 status = await _decision_status(ctx, decision_id) assert status == "reflected", f"Invariant 3 FAIL: expected reflected, got {status}" @@ -187,36 +204,55 @@ async def test_code_edit_without_rebind_marks_drifted(alpha_env): """Invariant 3 drift arm — file edit after bind, no rebind → drifted.""" ctx, repo_root = alpha_env - ingest_resp = await handle_ingest(ctx, _ratified_payload( - "Fetch user returns JWT-validated identity.", with_region=False, - )) + ingest_resp = await handle_ingest( + ctx, + _ratified_payload( + "Fetch user returns JWT-validated identity.", + with_region=False, + ), + ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) b = bind_resp.bindings[0] assert b.error is None - await handle_resolve_compliance(ctx, phase="ingest", verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "baseline verified", - }]) + await handle_resolve_compliance( + ctx, + 
phase="ingest", + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "baseline verified", + } + ], + ) assert await _decision_status(ctx, decision_id) == "reflected" - _commit_edit(repo_root, """ + _commit_edit( + repo_root, + """ def fetch_user(user_id: int): # Cookie-based (violates JWT decision). return {"id": user_id, "session_cookie": "opaque"} - """, msg="drift-impl") + """, + msg="drift-impl", + ) invalidate_sync_cache(ctx) lc = await handle_link_commit(ctx, "HEAD") @@ -236,22 +272,29 @@ async def test_session_start_banner_surfaces_drifts(alpha_env): """Invariant 4 — cold MCP session with drifted decision → banner fires.""" ctx, _ = alpha_env - ingest_resp = await handle_ingest(ctx, _ratified_payload( - "Billing webhook uses exponential backoff with jitter.", with_region=True, - )) + ingest_resp = await handle_ingest( + ctx, + _ratified_payload( + "Billing webhook uses exponential backoff with jitter.", + with_region=True, + ), + ) assert ingest_resp.ingested decision_id = ( ingest_resp.pending_grounding_decisions[0]["decision_id"] if ingest_resp.pending_grounding_decisions - else (ingest_resp.sync_status.pending_compliance_checks[0].decision_id - if (ingest_resp.sync_status and ingest_resp.sync_status.pending_compliance_checks) - else None) + else ( + ingest_resp.sync_status.pending_compliance_checks[0].decision_id + if (ingest_resp.sync_status and ingest_resp.sync_status.pending_compliance_checks) + else None + ) ) assert decision_id, "Could not extract decision_id from ingest" # Force drift by writing a drifted verdict directly. inner = getattr(ctx.ledger, "_inner", ctx.ledger) from ledger.queries import update_decision_status + await update_decision_status(inner._client, decision_id, "drifted") # Fresh session — clear banner cache. 
@@ -283,22 +326,32 @@ async def test_preflight_surfaces_bound_decisions(monkeypatch, alpha_env): ctx = BicameralContext.from_env() assert ctx.guided_mode is True - ingest_resp = await handle_ingest(ctx, _ratified_payload( - "User fetch enforces per-tenant rate limits in middleware.", with_region=False, - )) + ingest_resp = await handle_ingest( + ctx, + _ratified_payload( + "User fetch enforces per-tenant rate limits in middleware.", + with_region=False, + ), + ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - }]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) assert bind_resp.bindings[0].error is None - pf = await handle_preflight(ctx, topic="user fetch rate limit middleware", - file_paths=["impl.py"]) + pf = await handle_preflight( + ctx, topic="user fetch rate limit middleware", file_paths=["impl.py"] + ) assert pf.fired, f"Invariant 5 FAIL: preflight did not fire; reason={pf.reason}" decision_ids_returned = [d.decision_id for d in pf.decisions] assert decision_id in decision_ids_returned, ( @@ -319,37 +372,56 @@ async def test_hook_no_fire_still_syncs(alpha_env): """ ctx, repo_root = alpha_env - ingest_resp = await handle_ingest(ctx, _ratified_payload( - "Audit log retention 30 days, enforced at write path.", with_region=False, - )) + ingest_resp = await handle_ingest( + ctx, + _ratified_payload( + "Audit log retention 30 days, enforced at write path.", + with_region=False, + ), + ) decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] - bind_resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "impl.py", - "symbol_name": "fetch_user", - "start_line": 1, - "end_line": 3, - 
}]) + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "impl.py", + "symbol_name": "fetch_user", + "start_line": 1, + "end_line": 3, + } + ], + ) b = bind_resp.bindings[0] assert b.error is None - await handle_resolve_compliance(ctx, phase="ingest", verdicts=[{ - "decision_id": decision_id, - "region_id": b.region_id, - "content_hash": b.content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "baseline", - }]) + await handle_resolve_compliance( + ctx, + phase="ingest", + verdicts=[ + { + "decision_id": decision_id, + "region_id": b.region_id, + "content_hash": b.content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "baseline", + } + ], + ) assert await _decision_status(ctx, decision_id) == "reflected" # Commit drift — no explicit link_commit call (simulates hook silence). - _commit_edit(repo_root, """ + _commit_edit( + repo_root, + """ def fetch_user(user_id: int): # Audit log bypassed. raise NotImplementedError - """, msg="bypass-audit-log") + """, + msg="bypass-audit-log", + ) # ensure_ledger_synced must detect the new commit and sync. 
invalidate_sync_cache(ctx) @@ -379,16 +451,18 @@ async def test_new_ingest_enters_as_proposal(alpha_env): payload = { "query": "Pagination defaults to 25 items per page.", "repo": "jacob-repo", - "mappings": [{ - "intent": "Pagination defaults to 25 items per page.", - "span": { - "source_type": "transcript", - "text": "Pagination defaults to 25 items per page.", - "source_ref": "jacob-v0.7-test", - }, - "symbols": [], - "code_regions": [], - }], + "mappings": [ + { + "intent": "Pagination defaults to 25 items per page.", + "span": { + "source_type": "transcript", + "text": "Pagination defaults to 25 items per page.", + "source_ref": "jacob-v0.7-test", + }, + "symbols": [], + "code_regions": [], + } + ], } ingest_resp = await handle_ingest(ctx, payload) assert ingest_resp.ingested @@ -397,14 +471,12 @@ async def test_new_ingest_enters_as_proposal(alpha_env): # Code-compliance status is 'ungrounded' (no regions bound yet). # Human-approval axis lives on signoff.state = 'proposed'. status = await _decision_status(ctx, decision_id) - assert status == "ungrounded", ( - f"v0.9+ invariant FAIL: expected 'ungrounded', got '{status}'" - ) + assert status == "ungrounded", f"v0.9+ invariant FAIL: expected 'ungrounded', got '{status}'" # After ratification, it remains ungrounded (no code regions bound). from handlers.ratify import handle_ratify - ratify_resp = await handle_ratify(ctx, decision_id=decision_id, - signer="jacob@example.com") + + ratify_resp = await handle_ratify(ctx, decision_id=decision_id, signer="jacob@example.com") assert ratify_resp.was_new is True assert ratify_resp.signoff["state"] == "ratified" @@ -428,22 +500,28 @@ async def test_ratify_idempotent(alpha_env): original signer and ratified_at timestamp must be preserved. 
""" from handlers.ratify import handle_ratify + ctx, _ = alpha_env - ingest_resp = await handle_ingest(ctx, { - "query": "Cache TTL is 5 minutes.", - "repo": "jacob-repo", - "mappings": [{ - "intent": "Cache TTL is 5 minutes.", - "span": { - "source_type": "transcript", - "text": "Cache TTL is 5 minutes.", - "source_ref": "arch-review", - }, - "symbols": [], - "code_regions": [], - }], - }) + ingest_resp = await handle_ingest( + ctx, + { + "query": "Cache TTL is 5 minutes.", + "repo": "jacob-repo", + "mappings": [ + { + "intent": "Cache TTL is 5 minutes.", + "span": { + "source_type": "transcript", + "text": "Cache TTL is 5 minutes.", + "source_ref": "arch-review", + }, + "symbols": [], + "code_regions": [], + } + ], + }, + ) assert ingest_resp.ingested decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] @@ -457,4 +535,4 @@ async def test_ratify_idempotent(alpha_env): assert resp2.was_new is False assert resp2.signoff["state"] == "ratified" assert resp2.signoff["signer"] == "jin@example.com" # original signer preserved - assert resp2.signoff["ratified_at"] == ratified_at # timestamp unchanged + assert resp2.signoff["ratified_at"] == ratified_at # timestamp unchanged diff --git a/tests/test_ast_diff.py b/tests/test_ast_diff.py index 1c2ddbec..89a57882 100644 --- a/tests/test_ast_diff.py +++ b/tests/test_ast_diff.py @@ -7,13 +7,13 @@ bias the V2 caller-LLM verdict prompt toward "looks fine" on behaviorally-different code. """ + from __future__ import annotations import pytest from ledger.ast_diff import is_cosmetic_change - # ── Whitelist: must return True ───────────────────────────────────── diff --git a/tests/test_b2_cosmetic_hint.py b/tests/test_b2_cosmetic_hint.py index 41953ec9..3132cb6a 100644 --- a/tests/test_b2_cosmetic_hint.py +++ b/tests/test_b2_cosmetic_hint.py @@ -9,6 +9,7 @@ - cosmetic_hint stays False for renames / docstring edits / etc. 
- cosmetic_hint=True only for whitespace-only diffs """ + from __future__ import annotations from pathlib import Path @@ -44,6 +45,7 @@ def repo_with_baseline(tmp_path): the working-tree file to whatever they need to compare against HEAD. """ import subprocess + repo = tmp_path / "repo" repo.mkdir() subprocess.run(["git", "init", "-q"], cwd=repo, check=True) @@ -80,6 +82,7 @@ def test_docstring_edit_keeps_cosmetic_hint_false(repo_with_baseline, tmp_path): _write_file(repo, rel, "def f(x):\n return x + 1\n") # Now overwrite baseline by committing a docstring-only version, then edit working tree. import subprocess + _write_file(repo, rel, 'def f(x):\n """Old."""\n return x + 1\n') subprocess.run(["git", "add", "-A"], cwd=repo, check=True) subprocess.run(["git", "commit", "-q", "-m", "add docstring"], cwd=repo, check=True) @@ -111,6 +114,7 @@ def test_no_diff_keeps_cosmetic_hint_false(repo_with_baseline): def test_unsupported_extension_keeps_cosmetic_hint_false(tmp_path): """Files outside EXTENSION_LANGUAGE never get a hint.""" import subprocess + repo = tmp_path / "repo2" repo.mkdir() subprocess.run(["git", "init", "-q"], cwd=repo, check=True) diff --git a/tests/test_bind.py b/tests/test_bind.py index bec9e988..b264ed44 100644 --- a/tests/test_bind.py +++ b/tests/test_bind.py @@ -8,6 +8,7 @@ 5. test_bind_idempotent — calling bind twice for same (decision, region) is a no-op 6. 
test_bind_status_transition — after bind, decision status transitions to "pending" """ + from __future__ import annotations from unittest.mock import AsyncMock, patch @@ -18,7 +19,6 @@ from ledger.client import LedgerClient from ledger.schema import init_schema, migrate - # ── Fixtures ────────────────────────────────────────────────────────────────── @@ -57,6 +57,7 @@ async def test_bind_success_with_explicit_lines(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -64,14 +65,19 @@ async def test_bind_success_with_explicit_lines(): decision_id = await _seed_decision(client, "Use BM25 for search") ctx = _StubCtx(adapter) - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "server.py", - "symbol_name": "handle_search", - "start_line": 10, - "end_line": 30, - "purpose": "search handler", - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "server.py", + "symbol_name": "handle_search", + "start_line": 10, + "end_line": 30, + "purpose": "search handler", + } + ], + ) assert len(resp.bindings) == 1 b = resp.bindings[0] @@ -94,6 +100,7 @@ async def test_bind_symbol_resolution(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -102,11 +109,16 @@ async def test_bind_symbol_resolution(): ctx = _StubCtx(adapter) with patch("ledger.status.resolve_symbol_lines", return_value=(5, 25)): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "middleware.py", - "symbol_name": "rate_limit", - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "middleware.py", + "symbol_name": "rate_limit", + } + ], + ) assert len(resp.bindings) == 
1 b = resp.bindings[0] @@ -126,6 +138,7 @@ async def test_bind_unknown_decision_id(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -133,13 +146,18 @@ async def test_bind_unknown_decision_id(): ctx = _StubCtx(adapter) fake_id = "decision:fake_does_not_exist_xyz" - resp = await handle_bind(ctx, bindings=[{ - "decision_id": fake_id, - "file_path": "server.py", - "symbol_name": "some_func", - "start_line": 1, - "end_line": 10, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": fake_id, + "file_path": "server.py", + "symbol_name": "some_func", + "start_line": 1, + "end_line": 10, + } + ], + ) assert len(resp.bindings) == 1 b = resp.bindings[0] @@ -159,6 +177,7 @@ async def test_bind_symbol_not_found(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -167,11 +186,16 @@ async def test_bind_symbol_not_found(): ctx = _StubCtx(adapter) with patch("ledger.status.resolve_symbol_lines", return_value=None): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "cache.py", - "symbol_name": "evict_stale", - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "cache.py", + "symbol_name": "evict_stale", + } + ], + ) assert len(resp.bindings) == 1 b = resp.bindings[0] @@ -191,6 +215,7 @@ async def test_bind_idempotent(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -227,6 +252,7 @@ async def test_bind_status_transition(): client = await _fresh_client() try: from ledger.adapter import SurrealDBLedgerAdapter + adapter = 
SurrealDBLedgerAdapter(url="memory://") adapter._client = client adapter._connected = True @@ -235,25 +261,26 @@ async def test_bind_status_transition(): ctx = _StubCtx(adapter) # Verify starting status is ungrounded - rows = await client.query( - f"SELECT status FROM {decision_id} LIMIT 1" - ) + rows = await client.query(f"SELECT status FROM {decision_id} LIMIT 1") assert rows and rows[0].get("status") == "ungrounded" - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "pagination.py", - "symbol_name": "paginate", - "start_line": 1, - "end_line": 15, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "pagination.py", + "symbol_name": "paginate", + "start_line": 1, + "end_line": 15, + } + ], + ) assert resp.bindings[0].error is None # Status should now be "pending" - rows = await client.query( - f"SELECT status FROM {decision_id} LIMIT 1" - ) + rows = await client.query(f"SELECT status FROM {decision_id} LIMIT 1") assert rows and rows[0].get("status") == "pending" finally: await client.close() diff --git a/tests/test_codegenome_adapter.py b/tests/test_codegenome_adapter.py index ce01128c..3df5bb42 100644 --- a/tests/test_codegenome_adapter.py +++ b/tests/test_codegenome_adapter.py @@ -22,7 +22,6 @@ DeterministicCodeGenomeAdapter, ) - # ── Phase 1: ABC + dataclasses ────────────────────────────────────────────── diff --git a/tests/test_codegenome_bind_integration.py b/tests/test_codegenome_bind_integration.py index 6bdaae26..0854b407 100644 --- a/tests/test_codegenome_bind_integration.py +++ b/tests/test_codegenome_bind_integration.py @@ -56,7 +56,9 @@ def __init__(self, ledger, *, write_identity_records): def _stub_bind_dependencies(content_hash="abc123"): stack = ExitStack() stack.enter_context(patch("ledger.adapter.compute_content_hash", return_value=content_hash)) - stack.enter_context(patch("ledger.status.get_git_content", return_value="def foo():\n return 1\n")) + 
stack.enter_context( + patch("ledger.status.get_git_content", return_value="def foo():\n return 1\n") + ) stack.enter_context(patch("ledger.status.hash_lines", return_value=content_hash)) return stack @@ -75,13 +77,18 @@ async def test_bind_with_flag_off_writes_no_identity(): ctx = _CtxWithCodegenome(adapter, write_identity_records=False) with _stub_bind_dependencies(content_hash="hash_off"): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "server.py", - "symbol_name": "handle_search", - "start_line": 10, - "end_line": 30, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "server.py", + "symbol_name": "handle_search", + "start_line": 10, + "end_line": 30, + } + ], + ) assert len(resp.bindings) == 1 assert resp.bindings[0].error is None @@ -111,13 +118,18 @@ async def test_bind_with_flag_on_writes_identity_and_links_decision(): fixed_hash = "deadbeefcafe1234" with _stub_bind_dependencies(content_hash=fixed_hash): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "checkout/rate_limit.py", - "symbol_name": "enforce_checkout_rate_limit", - "start_line": 24, - "end_line": 67, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "checkout/rate_limit.py", + "symbol_name": "enforce_checkout_rate_limit", + "start_line": 24, + "end_line": 67, + } + ], + ) assert len(resp.bindings) == 1 bind_result = resp.bindings[0] @@ -189,17 +201,26 @@ async def test_codegenome_failure_does_not_change_bind_response(): decision_id = await _seed_decision(client, "x") ctx = _CtxWithCodegenome(adapter, write_identity_records=True) - with patch.object( - ctx.codegenome, "compute_identity", - side_effect=RuntimeError("simulated codegenome failure"), - ), _stub_bind_dependencies(content_hash="h2"): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "a.py", - "symbol_name": "f", - 
"start_line": 1, - "end_line": 5, - }]) + with ( + patch.object( + ctx.codegenome, + "compute_identity", + side_effect=RuntimeError("simulated codegenome failure"), + ), + _stub_bind_dependencies(content_hash="h2"), + ): + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "a.py", + "symbol_name": "f", + "start_line": 1, + "end_line": 5, + } + ], + ) assert len(resp.bindings) == 1 assert resp.bindings[0].error is None diff --git a/tests/test_codegenome_confidence.py b/tests/test_codegenome_confidence.py index cad66c28..80d3a835 100644 --- a/tests/test_codegenome_confidence.py +++ b/tests/test_codegenome_confidence.py @@ -8,7 +8,6 @@ from codegenome.confidence import noisy_or, weighted_average - # ── noisy_or ──────────────────────────────────────────────────────────────── diff --git a/tests/test_codegenome_config.py b/tests/test_codegenome_config.py index 3a25e9b9..6fc8699d 100644 --- a/tests/test_codegenome_config.py +++ b/tests/test_codegenome_config.py @@ -6,7 +6,6 @@ from codegenome.config import CodeGenomeConfig - _ALL_FLAGS = ( "BICAMERAL_CODEGENOME_ENABLED", "BICAMERAL_CODEGENOME_WRITE_IDENTITY_RECORDS", @@ -67,6 +66,10 @@ def test_identity_writes_active_requires_both_flags(): assert CodeGenomeConfig().identity_writes_active() is False assert CodeGenomeConfig(enabled=True).identity_writes_active() is False assert CodeGenomeConfig(write_identity_records=True).identity_writes_active() is False - assert CodeGenomeConfig( - enabled=True, write_identity_records=True, - ).identity_writes_active() is True + assert ( + CodeGenomeConfig( + enabled=True, + write_identity_records=True, + ).identity_writes_active() + is True + ) diff --git a/tests/test_codegenome_l1_exemption.py b/tests/test_codegenome_l1_exemption.py index 9baf8796..0605b741 100644 --- a/tests/test_codegenome_l1_exemption.py +++ b/tests/test_codegenome_l1_exemption.py @@ -24,7 +24,6 @@ from ledger.client import LedgerClient from ledger.schema import 
init_schema, migrate - # ── Fixtures ──────────────────────────────────────────────────────────────── @@ -60,7 +59,8 @@ def __init__(self, ledger): self.codegenome = DeterministicCodeGenomeAdapter(repo_path=self.repo_path) # Both flags ON — L1 guard is the only thing that should suppress writes. self.codegenome_config = CodeGenomeConfig( - enabled=True, write_identity_records=True, + enabled=True, + write_identity_records=True, ) @@ -97,12 +97,18 @@ async def test_bind_l2_writes_identity(): ctx = _CtxWithCodegenome(adapter) with _stub_bind_dependencies("h_l2"): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "ledger/client.py", - "symbol_name": "WALWriter", - "start_line": 10, "end_line": 30, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "ledger/client.py", + "symbol_name": "WALWriter", + "start_line": 10, + "end_line": 30, + } + ], + ) assert resp.bindings[0].error is None cs, si, ab = await _count_codegenome_rows(client) @@ -131,17 +137,25 @@ async def test_bind_l1_skips_codegenome_writes(): adapter._client = client adapter._connected = True decision_id = await _seed_decision( - client, description="Users can pause subscription for 90 days", level="L1", + client, + description="Users can pause subscription for 90 days", + level="L1", ) ctx = _CtxWithCodegenome(adapter) with _stub_bind_dependencies("h_l1"): - resp = await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "subscriptions/pause.py", - "symbol_name": "pause_subscription", - "start_line": 1, "end_line": 20, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "subscriptions/pause.py", + "symbol_name": "pause_subscription", + "start_line": 1, + "end_line": 20, + } + ], + ) # Bind itself succeeds (binds_to + code_region still written — # the bind contract is unchanged). Only the codegenome # side-effect is suppressed. 
@@ -168,16 +182,25 @@ async def test_bind_l3_skips_codegenome_writes(): adapter._client = client adapter._connected = True decision_id = await _seed_decision( - client, description="Loop unroll factor 4 in hot path", level="L3", + client, + description="Loop unroll factor 4 in hot path", + level="L3", ) ctx = _CtxWithCodegenome(adapter) with _stub_bind_dependencies("h_l3"): - await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "vm/eval.py", "symbol_name": "eval_loop", - "start_line": 100, "end_line": 200, - }]) + await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "vm/eval.py", + "symbol_name": "eval_loop", + "start_line": 100, + "end_line": 200, + } + ], + ) cs, si, ab = await _count_codegenome_rows(client) assert (cs, si, ab) == (0, 0, 0) @@ -202,16 +225,25 @@ async def test_bind_unclassified_decision_level_skips_codegenome_writes(): adapter._client = client adapter._connected = True decision_id = await _seed_decision( - client, description="legacy ungrouped decision", level=None, + client, + description="legacy ungrouped decision", + level=None, ) ctx = _CtxWithCodegenome(adapter) with _stub_bind_dependencies("h_null"): - await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "x.py", "symbol_name": "x", - "start_line": 1, "end_line": 5, - }]) + await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "x.py", + "symbol_name": "x", + "start_line": 1, + "end_line": 5, + } + ], + ) cs, si, ab = await _count_codegenome_rows(client) assert (cs, si, ab) == (0, 0, 0) @@ -234,16 +266,25 @@ async def test_bind_response_shape_unchanged_for_l1(): adapter._client = client adapter._connected = True decision_id = await _seed_decision( - client, description="Members can pause subscription", level="L1", + client, + description="Members can pause subscription", + level="L1", ) ctx = _CtxWithCodegenome(adapter) with _stub_bind_dependencies("h_shape"): - resp = 
await handle_bind(ctx, bindings=[{ - "decision_id": decision_id, - "file_path": "src/x.py", "symbol_name": "x", - "start_line": 1, "end_line": 5, - }]) + resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "src/x.py", + "symbol_name": "x", + "start_line": 1, + "end_line": 5, + } + ], + ) bind = resp.bindings[0] assert bind.error is None diff --git a/tests/test_compliance_cache_semantics.py b/tests/test_compliance_cache_semantics.py index d900af0c..9606ad49 100644 --- a/tests/test_compliance_cache_semantics.py +++ b/tests/test_compliance_cache_semantics.py @@ -6,6 +6,7 @@ - Seeding a compliance_check row via resolve_compliance (simulated here by direct write) promotes the decision out of PENDING """ + from __future__ import annotations import pytest @@ -15,7 +16,6 @@ from ledger.schema import init_schema, migrate from ledger.status import derive_status - # ── Pure unit tests: derive_status decision table ──────────────────── diff --git a/tests/test_compliance_check_schema.py b/tests/test_compliance_check_schema.py index b1409af4..7c55e92c 100644 --- a/tests/test_compliance_check_schema.py +++ b/tests/test_compliance_check_schema.py @@ -11,6 +11,7 @@ These tests pin the fields, the enum constraints, the defaults, and the UNIQUE cache-key index. They run against memory:// for hermetic isolation. 
""" + from __future__ import annotations import pytest @@ -165,9 +166,7 @@ async def test_phase_accepts_all_five_reserved_values(): """ c = await _fresh_client() try: - for i, phase in enumerate( - ("ingest", "drift", "regrounding", "supersession", "divergence") - ): + for i, phase in enumerate(("ingest", "drift", "regrounding", "supersession", "divergence")): await c.execute( "CREATE compliance_check SET decision_id = $i, region_id = $r, " "content_hash = $h, verdict = 'compliant', confidence = 'high', " @@ -298,10 +297,7 @@ async def test_init_schema_is_idempotent_against_existing_db(): await init_schema(c) # Sanity: schema still works after repeated inits. - await c.execute( - "CREATE intent SET description = 'init-idem test', " - "source_type = 'manual'" - ) + await c.execute("CREATE intent SET description = 'init-idem test', source_type = 'manual'") rows = await c.query("SELECT description FROM intent") assert len(rows) == 1 assert rows[0]["description"] == "init-idem test" diff --git a/tests/test_consent_notice.py b/tests/test_consent_notice.py index caced0e9..1682173d 100644 --- a/tests/test_consent_notice.py +++ b/tests/test_consent_notice.py @@ -13,7 +13,9 @@ def _reload_consent(): import importlib + import consent + importlib.reload(consent) return consent @@ -21,7 +23,9 @@ def _reload_consent(): # ── telemetry_allowed() — gating behavior ────────────────────────────── -def test_telemetry_allowed_no_marker_default_on(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_telemetry_allowed_no_marker_default_on( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: """No marker: default-on (preserves upgrade-path behavior).""" monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) @@ -30,7 +34,9 @@ def test_telemetry_allowed_no_marker_default_on(tmp_path: Path, monkeypatch: pyt assert consent.telemetry_allowed() is True -def test_telemetry_allowed_env_off_overrides_marker(tmp_path: Path, monkeypatch: 
pytest.MonkeyPatch) -> None: +def test_telemetry_allowed_env_off_overrides_marker( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: """Env BICAMERAL_TELEMETRY=0 wins even when marker says enabled.""" monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) @@ -137,7 +143,14 @@ def test_notice_re_emitted_on_policy_version_bump( # Simulate a stale marker (older policy version). (tmp_path / ".bicameral").mkdir(parents=True, exist_ok=True) (tmp_path / ".bicameral" / "consent.json").write_text( - json.dumps({"telemetry": "enabled", "policy_version": 0, "acknowledged_at": "x", "acknowledged_via": "wizard"}), + json.dumps( + { + "telemetry": "enabled", + "policy_version": 0, + "acknowledged_at": "x", + "acknowledged_via": "wizard", + } + ), encoding="utf-8", ) @@ -170,7 +183,9 @@ def test_notice_swallows_marker_write_failure( monkeypatch.setenv("USERPROFILE", str(tmp_path)) monkeypatch.delenv("BICAMERAL_SKIP_CONSENT_NOTICE", raising=False) consent = _reload_consent() - monkeypatch.setattr(consent, "write_consent", lambda *a, **kw: (_ for _ in ()).throw(OSError("disk full"))) + monkeypatch.setattr( + consent, "write_consent", lambda *a, **kw: (_ for _ in ()).throw(OSError("disk full")) + ) # Must not raise. consent.notify_if_first_run() @@ -186,7 +201,9 @@ def test_telemetry_send_event_blocked_when_consent_disabled( consent.write_consent(telemetry=False, via="wizard") import importlib + import telemetry + importlib.reload(telemetry) # Patch the network path; if relay was attempted, this would be called. @@ -195,6 +212,7 @@ def test_telemetry_send_event_blocked_when_consent_disabled( telemetry.send_event("0.13.3", skill="bicameral-ingest", duration_ms=100) # Counter should still increment locally. import local_counters + importlib.reload(local_counters) # Relay was NOT called (consent denied). 
assert sent == [] diff --git a/tests/test_desync_scenarios.py b/tests/test_desync_scenarios.py index c70a88c3..2b04ef03 100644 --- a/tests/test_desync_scenarios.py +++ b/tests/test_desync_scenarios.py @@ -30,6 +30,7 @@ than via server-side magic. Scenarios depending on V2-only tools (``bicameral_rebind``, ``record_compliance_verdict``) are marked xfail. """ + from __future__ import annotations import subprocess @@ -45,7 +46,6 @@ from handlers.ingest import handle_ingest from handlers.link_commit import handle_link_commit, invalidate_sync_cache - # ── Helpers ────────────────────────────────────────────────────────── @@ -109,16 +109,19 @@ def _scenario_repo(monkeypatch, tmp_path): monkeypatch.setenv("USE_REAL_LEDGER", "1") monkeypatch.setenv("SURREAL_URL", "memory://") repo = tmp_path / "repo" - _seed_repo(repo, { - "src/payments.py": """ + _seed_repo( + repo, + { + "src/payments.py": """ def calculate_discount(order_total: float) -> float: return order_total * 0.1 """, - "src/auth.py": """ + "src/auth.py": """ def verify_token(token: str) -> bool: return token.startswith("valid:") """, - }) + }, + ) monkeypatch.setenv("REPO_PATH", str(repo)) monkeypatch.setenv("BICAMERAL_AUTHORITATIVE_REF", "main") monkeypatch.chdir(repo) @@ -156,11 +159,16 @@ async def test_scenario_01_new_decision_with_existing_code(_scenario_repo): assert ungrounded, f"Expected ungrounded grounding check, got: {lc.pending_grounding_checks}" decision_id = ungrounded[0]["decision_id"] - bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - "file_path": "src/payments.py", - "symbol_name": "calculate_discount", - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": "src/payments.py", + "symbol_name": "calculate_discount", + } + ], + ) assert bind_resp.bindings assert not bind_resp.bindings[0].error, bind_resp.bindings[0].error @@ -184,14 +192,16 @@ async def test_scenario_02_code_changed_after_grounded_pending_until_verdict(_sc 
_scenario_repo, text="Apply discount", intent="Apply 10% discount", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, - "end_line": 2, - "type": "function", - "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], ) await handle_ingest(ctx, payload) @@ -230,12 +240,16 @@ async def test_scenario_03_code_deleted_after_grounded_pending(_scenario_repo): _scenario_repo, text="Apply discount", intent="Apply 10% discount", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], ) await handle_ingest(ctx, payload) @@ -245,7 +259,9 @@ async def test_scenario_03_code_deleted_after_grounded_pending(_scenario_repo): lc = await handle_link_commit(ctx, "HEAD") # Symbol disappeared on authoritative ref. 
- disappeared = [c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared"] + disappeared = [ + c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared" + ] assert disappeared, f"Expected symbol_disappeared check, got: {lc.pending_grounding_checks}" @@ -258,12 +274,16 @@ async def test_scenario_04_symbol_renamed_in_file(_scenario_repo): _scenario_repo, text="Apply discount", intent="Apply 10% discount", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], ) await handle_ingest(ctx, payload) @@ -274,7 +294,9 @@ async def test_scenario_04_symbol_renamed_in_file(_scenario_repo): invalidate_sync_cache(ctx) lc = await handle_link_commit(ctx, "HEAD") - disappeared = [c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared"] + disappeared = [ + c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared" + ] assert disappeared, f"Expected symbol_disappeared, got: {lc.pending_grounding_checks}" assert disappeared[0]["symbol"] == "calculate_discount" # V1 D1: original_lines is part of the payload. 
@@ -290,12 +312,16 @@ async def test_scenario_05_symbol_moved_to_different_file(_scenario_repo): _scenario_repo, text="Apply discount", intent="Apply 10% discount", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], ) await handle_ingest(ctx, payload) @@ -307,8 +333,12 @@ async def test_scenario_05_symbol_moved_to_different_file(_scenario_repo): invalidate_sync_cache(ctx) lc = await handle_link_commit(ctx, "HEAD") - disappeared = [c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared"] - assert disappeared, f"Expected symbol_disappeared on cross-file move, got: {lc.pending_grounding_checks}" + disappeared = [ + c for c in lc.pending_grounding_checks if c.get("reason") == "symbol_disappeared" + ] + assert disappeared, ( + f"Expected symbol_disappeared on cross-file move, got: {lc.pending_grounding_checks}" + ) @pytest.mark.phase2 @@ -347,13 +377,18 @@ async def test_scenario_06_code_added_ungrounded_resolvable(_scenario_repo): # Pass explicit lines — ctx.authoritative_sha is captured at ctx # creation and is stale after the new commit, so resolve_symbol_lines # would look at the wrong ref. Explicit lines bypass resolution. 
- bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - "file_path": "src/cart.py", - "symbol_name": "cart_total", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": "src/cart.py", + "symbol_name": "cart_total", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings and not bind_resp.bindings[0].error, ( f"bind failed: {bind_resp.bindings[0].error if bind_resp.bindings else 'no result'}" ) @@ -413,24 +448,32 @@ async def test_scenario_09_intent_description_supersession(_scenario_repo): _scenario_repo, text="Apply discount", intent="Apply 10% discount on orders", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], source_ref="meeting-1", ) p2 = _build_payload( _scenario_repo, text="Apply discount with backoff", intent="Apply 15% discount on orders over $100", - code_regions=[{ - "file_path": "src/payments.py", - "symbol": "calculate_discount", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "discount calc", - }], + code_regions=[ + { + "file_path": "src/payments.py", + "symbol": "calculate_discount", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "discount calc", + } + ], source_ref="meeting-2", ) r1 = await handle_ingest(ctx, p1) @@ -446,17 +489,31 @@ async def test_scenario_10_multiple_intents_share_symbol(_scenario_repo): region = { "file_path": "src/auth.py", "symbol": "verify_token", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "auth check", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "auth check", } - await handle_ingest(ctx, _build_payload( - 
_scenario_repo, text="Verify JWT", intent="Use JWT verification", - code_regions=[region], source_ref="m1", - )) - await handle_ingest(ctx, _build_payload( - _scenario_repo, text="Reject invalid", intent="Reject malformed tokens", - code_regions=[region], source_ref="m2", - )) + await handle_ingest( + ctx, + _build_payload( + _scenario_repo, + text="Verify JWT", + intent="Use JWT verification", + code_regions=[region], + source_ref="m1", + ), + ) + await handle_ingest( + ctx, + _build_payload( + _scenario_repo, + text="Reject invalid", + intent="Reject malformed tokens", + code_regions=[region], + source_ref="m2", + ), + ) invalidate_sync_cache(ctx) drift = await handle_detect_drift(ctx, "src/auth.py") decision_ids = {d.decision_id for d in drift.decisions} @@ -507,18 +564,25 @@ async def test_scenario_12_line_shift_does_not_trigger_drift(_scenario_repo): region = { "file_path": "src/auth.py", "symbol": "verify_token", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "auth check", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "auth check", } - await handle_ingest(ctx, _build_payload( - _scenario_repo, text="Use JWT", intent="JWT verification", - code_regions=[region], - )) + await handle_ingest( + ctx, + _build_payload( + _scenario_repo, + text="Use JWT", + intent="JWT verification", + code_regions=[region], + ), + ) # Insert blank lines above — line numbers shift but the symbol bytes # are identical. 
(_scenario_repo / "src/auth.py").write_text( - "\n\n\ndef verify_token(token: str) -> bool:\n return token.startswith(\"valid:\")\n" + '\n\n\ndef verify_token(token: str) -> bool:\n return token.startswith("valid:")\n' ) _commit(_scenario_repo, "insert blank lines above") invalidate_sync_cache(ctx) @@ -526,7 +590,9 @@ async def test_scenario_12_line_shift_does_not_trigger_drift(_scenario_repo): drift = await handle_detect_drift(ctx, "src/auth.py") drifted = [d for d in drift.decisions if d.status == "drifted"] - assert not drifted, f"Line-shift edit must NOT trigger drift, got: {[(d.status, d.symbol, d.lines) for d in drift.decisions]}" + assert not drifted, ( + f"Line-shift edit must NOT trigger drift, got: {[(d.status, d.symbol, d.lines) for d in drift.decisions]}" + ) @pytest.mark.phase2 diff --git a/tests/test_e2e_asserters.py b/tests/test_e2e_asserters.py new file mode 100644 index 00000000..e5ec4073 --- /dev/null +++ b/tests/test_e2e_asserters.py @@ -0,0 +1,176 @@ +"""Unit tests for the e2e flow asserters. + +Run the asserter functions in isolation against synthetic tool-call lists. +Lets us pin behaviour like "Flow 1 accepts any commit-history-area file as +a legitimate anchor for the bundled reorder/squash/amend/branch-from +decision" without paying for a full claude-CLI e2e cycle. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +E2E_DIR = Path(__file__).resolve().parent.parent / "tests" / "e2e" +if str(E2E_DIR) not in sys.path: + sys.path.insert(0, str(E2E_DIR)) + +# Importing the orchestrator triggers env-var checks (DESKTOP_REPO_PATH etc.) +# and CLI presence checks that we don't want to fire in unit tests. Stub them +# before import so the module loads without bailing out. 
+import os # noqa: E402 + +os.environ.setdefault("DESKTOP_REPO_PATH", str(Path(__file__).resolve().parent)) +os.environ.setdefault("PATH", os.environ.get("PATH", "")) + +import shutil # noqa: E402 + +_orig_which = shutil.which + + +def _which_stub(name: str, *args, **kwargs): + if name in ("claude", "bicameral-mcp"): + return f"/stub/{name}" + return _orig_which(name, *args, **kwargs) + + +shutil.which = _which_stub # type: ignore[assignment] +try: + import run_e2e_flows # noqa: E402 +finally: + shutil.which = _orig_which # type: ignore[assignment] + + +def _ingest_call(decisions: list[dict]) -> dict: + return { + "name": "mcp__bicameral__bicameral_ingest", + "input": {"payload": {"decisions": decisions}}, + } + + +def _ratify_call(decision_id: str) -> dict: + return { + "name": "mcp__bicameral__bicameral_ratify", + "input": {"decision_id": decision_id}, + } + + +def _seed_calls(commit_history_anchor: str) -> list[dict]: + """Standard Flow 1 sequence: ingest the 3 seed decisions with inline + bindings, then ratify each. ``commit_history_anchor`` is the file path + chosen for the bundled commit-history decision — varied across tests + to confirm the asserter accepts any legitimate area path. 
+ """ + decisions = [ + { + "description": "High-signal notifications", + "code_regions": [{"file_path": "app/src/lib/stores/notifications-store.ts"}], + }, + { + "description": "Improved commit history", + "code_regions": [{"file_path": commit_history_anchor}], + }, + { + "description": "Cherry-pick between branches", + "code_regions": [{"file_path": "app/src/lib/git/cherry-pick.ts"}], + }, + ] + return [ + _ingest_call(decisions), + _ratify_call("decision:1"), + _ratify_call("decision:2"), + _ratify_call("decision:3"), + ] + + +# ── Flow 1: feature-area binding ──────────────────────────────────────── + + +def test_flow1_passes_with_canonical_git_layer_anchor(): + """The previously-required exact path — must still pass.""" + calls = _seed_calls(commit_history_anchor="app/src/lib/git/reorder.ts") + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert ok, f"Flow 1 should pass with canonical reorder.ts anchor; detail: {detail}" + + +def test_flow1_passes_with_ui_layer_anchor(): + """Previously failing case — agent picks UI-layer commit-list.tsx for the + bundled commit-history decision. 
Now accepted as a legitimate anchor.""" + calls = _seed_calls(commit_history_anchor="app/src/ui/history/commit-list.tsx") + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert ok, f"Flow 1 should accept commit-list.tsx as commit-history anchor; detail: {detail}" + + +def test_flow1_passes_with_dispatcher_anchor(): + """Dispatcher also backs the bundled ops (amend, branch-from).""" + calls = _seed_calls(commit_history_anchor="app/src/ui/dispatcher/dispatcher.ts") + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert ok, f"Flow 1 should accept dispatcher.ts as commit-history anchor; detail: {detail}" + + +def test_flow1_passes_with_squash_anchor(): + """Bundled decision includes drag-to-squash; squash.ts is a legitimate anchor.""" + calls = _seed_calls(commit_history_anchor="app/src/lib/git/squash.ts") + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert ok, f"Flow 1 should accept squash.ts as commit-history anchor; detail: {detail}" + + +def test_flow1_fails_when_commit_history_unbound(): + """Bind something far from the commit-history area — asserter still fails.""" + calls = _seed_calls(commit_history_anchor="app/src/lib/some-unrelated-file.ts") + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert not ok, f"Flow 1 must fail when no commit-history-area file is bound; detail: {detail}" + assert "commit-history area" in detail + + +def test_flow1_fails_when_cherry_pick_unbound(): + """Replace cherry-pick.ts with something unrelated — asserter fails.""" + decisions = [ + { + "description": "High-signal notifications", + "code_regions": [{"file_path": "app/src/lib/stores/notifications-store.ts"}], + }, + { + "description": "Improved commit history", + "code_regions": [{"file_path": "app/src/lib/git/reorder.ts"}], + }, + { + "description": "Cherry-pick between branches", + "code_regions": [{"file_path": "app/src/lib/some-other-thing.ts"}], + }, + ] + calls = [_ingest_call(decisions), _ratify_call("d1"), _ratify_call("d2"), _ratify_call("d3")] + ok, 
detail = run_e2e_flows.assert_flow_1(calls) + assert not ok + assert "cherry-pick area" in detail + + +def test_flow1_accepts_cherry_pick_tsx(): + """UI-layer cherry-pick.tsx is also a legitimate cherry-pick anchor.""" + decisions = [ + { + "description": "High-signal notifications", + "code_regions": [{"file_path": "app/src/lib/stores/notifications-store.ts"}], + }, + { + "description": "Improved commit history", + "code_regions": [{"file_path": "app/src/lib/git/reorder.ts"}], + }, + { + "description": "Cherry-pick between branches", + "code_regions": [{"file_path": "app/src/ui/multi-commit-operation/cherry-pick.tsx"}], + }, + ] + calls = [_ingest_call(decisions), _ratify_call("d1"), _ratify_call("d2"), _ratify_call("d3")] + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert ok, f"Flow 1 should accept cherry-pick.tsx; detail: {detail}" + + +def test_flow1_fails_without_ratify(): + """Even if bindings are fine, missing ratify still fails the asserter.""" + calls = _seed_calls(commit_history_anchor="app/src/lib/git/reorder.ts") + # Drop the three ratify calls. 
+ calls = [c for c in calls if "ratify" not in c["name"]] + ok, detail = run_e2e_flows.assert_flow_1(calls) + assert not ok + assert "ratify" in detail.lower() diff --git a/tests/test_ephemeral_authoritative.py b/tests/test_ephemeral_authoritative.py index fe628ffe..550fd813 100644 --- a/tests/test_ephemeral_authoritative.py +++ b/tests/test_ephemeral_authoritative.py @@ -40,6 +40,7 @@ E21 — ungrounded → feature branch bind → reflected + ephemeral=True [PASS] E22 — switch back to main: no stale ephemeral 'reflected' (→ drifted) [PASS] """ + from __future__ import annotations import subprocess @@ -55,7 +56,6 @@ from handlers.link_commit import handle_link_commit, invalidate_sync_cache from handlers.resolve_compliance import handle_resolve_compliance - # ── Helpers ─────────────────────────────────────────────────────────────────── @@ -98,7 +98,9 @@ def _merge(repo: Path, branch: str, *, squash: bool = False, no_ff: bool = False _git(repo, "merge", "--squash", branch) _git(repo, "-c", "commit.gpgsign=false", "commit", "-q", "-m", f"Squash-merge {branch}") elif no_ff: - _git(repo, "-c", "commit.gpgsign=false", "merge", "--no-ff", "-m", f"Merge {branch}", branch) + _git( + repo, "-c", "commit.gpgsign=false", "merge", "--no-ff", "-m", f"Merge {branch}", branch + ) else: _git(repo, "-c", "commit.gpgsign=false", "merge", branch) @@ -170,13 +172,18 @@ async def _ingest_and_bind( assert ingest.ingested, f"ingest failed: {ingest}" decision_id = ingest.created_decisions[0].decision_id - bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - "file_path": file_path, - "symbol_name": symbol_name, - "start_line": start_line, - "end_line": end_line, - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": file_path, + "symbol_name": symbol_name, + "start_line": start_line, + "end_line": end_line, + } + ], + ) assert bind_resp.bindings, "no bind results" assert not bind_resp.bindings[0].error, f"bind error: 
{bind_resp.bindings[0].error}" return decision_id, bind_resp.bindings[0].region_id, bind_resp.bindings[0].content_hash @@ -200,14 +207,16 @@ async def _resolve_verdict( return await handle_resolve_compliance( ctx, phase=phase, - verdicts=[{ - "decision_id": decision_id, - "region_id": p.region_id, - "content_hash": p.content_hash, - "verdict": verdict, - "confidence": "high", - "explanation": "test", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": p.region_id, + "content_hash": p.content_hash, + "verdict": verdict, + "confidence": "high", + "explanation": "test", + } + ], flow_id=lc.flow_id, ) @@ -226,12 +235,15 @@ def _eph_repo(monkeypatch, tmp_path): monkeypatch.setenv("USE_REAL_LEDGER", "1") monkeypatch.setenv("SURREAL_URL", "memory://") repo = tmp_path / "repo" - _seed_repo(repo, { - "src/calc.py": """ + _seed_repo( + repo, + { + "src/calc.py": """ def rate(order_total: float) -> float: return order_total * 0.1 """, - }) + }, + ) monkeypatch.setenv("REPO_PATH", str(repo)) monkeypatch.setenv("BICAMERAL_AUTHORITATIVE_REF", "main") monkeypatch.chdir(repo) @@ -259,13 +271,21 @@ async def test_e01_authoritative_branch_full_cycle(_eph_repo): # Ingest with code_regions so the binding exists before the internal link_commit. 
ingest = await handle_ingest( ctx, - _payload(repo, text="10% discount rule", intent="Apply 10% discount on all orders", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate calc", - }]), + _payload( + repo, + text="10% discount rule", + intent="Apply 10% discount on all orders", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate calc", + } + ], + ), ) assert ingest.ingested decision_id = ingest.created_decisions[0].decision_id @@ -314,13 +334,21 @@ async def test_e02_feature_branch_full_cycle(_eph_repo): # Ingest on the feature branch — code_regions reference the original file on main. ingest = await handle_ingest( ctx, - _payload(repo, text="Pricing rate", intent="Apply rate to order total", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate calc", - }]), + _payload( + repo, + text="Pricing rate", + intent="Apply rate to order total", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate calc", + } + ], + ), ) assert ingest.ingested decision_id = ingest.created_decisions[0].decision_id @@ -371,13 +399,21 @@ async def test_e03_ff_merge_verdict_survives(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Pricing", intent="Apply rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Pricing", + intent="Apply rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) assert ingest.ingested decision_id = 
ingest.created_decisions[0].decision_id @@ -400,9 +436,7 @@ async def test_e03_ff_merge_verdict_survives(_eph_repo): ) # No new pending compliance check for this decision (verdict already exists). new_pending = [p for p in lc_main.pending_compliance_checks if p.decision_id == decision_id] - assert not new_pending, ( - f"Should not re-pend after FF merge with same hash, got: {new_pending}" - ) + assert not new_pending, f"Should not re-pend after FF merge with same hash, got: {new_pending}" # ── E4: Squash merge → same content hash → reflected ────────────────────────── @@ -429,13 +463,21 @@ async def test_e04_squash_merge_verdict_survives(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate policy", intent="Set 18% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate policy", + intent="Set 18% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -479,13 +521,21 @@ async def test_e05_content_change_becomes_drifted(_eph_repo): ingest = await handle_ingest( ctx, - _payload(repo, text="10% discount rule", intent="Apply 10% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="10% discount rule", + intent="Apply 10% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc1 = await handle_link_commit(ctx, "HEAD") @@ -546,13 +596,21 @@ async def 
test_e06_branch_switch_stale_not_cleared(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate policy", intent="Apply 15% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate policy", + intent="Apply 15% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc_a = await handle_link_commit(ctx, "HEAD") @@ -602,13 +660,21 @@ async def test_e07_feature_to_main_ephemeral_not_promoted(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate", intent="11% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate", + intent="11% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -657,13 +723,21 @@ async def test_e08_detached_head_non_ephemeral(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate", intent="Rate policy", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate", + intent="Rate policy", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await 
handle_link_commit(ctx, "HEAD") @@ -706,13 +780,21 @@ async def test_e09_process_restart_flag_lost_status_ok(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate", intent="13% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate", + intent="13% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -731,22 +813,22 @@ async def test_e09_process_restart_flag_lost_status_ok(_eph_repo): rc = await handle_resolve_compliance( ctx2, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "post-restart", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "post-restart", + } + ], # No flow_id — simulating process restart ) assert rc.accepted, f"resolve rejected post-restart: {rc.rejected}" status = await _get_decision_status(ctx2, decision_id) - assert status == "reflected", ( - f"Status must be reflected after restart, got {status}" - ) + assert status == "reflected", f"Status must be reflected after restart, got {status}" checks = await _get_compliance_checks(ctx2, decision_id) assert checks @@ -778,29 +860,41 @@ async def test_e10_idempotent_resolve_compliance(_eph_repo): ingest = await handle_ingest( ctx, - _payload(repo, text="Discount rate", intent="Apply rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - 
"type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Discount rate", + intent="Apply rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") pending = [p for p in lc.pending_compliance_checks if p.decision_id == decision_id] assert pending - verdict_payload = [{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "first call", - }] + verdict_payload = [ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "first call", + } + ] - rc1 = await handle_resolve_compliance(ctx, phase="ingest", verdicts=verdict_payload, flow_id=lc.flow_id) + rc1 = await handle_resolve_compliance( + ctx, phase="ingest", verdicts=verdict_payload, flow_id=lc.flow_id + ) assert rc1.accepted # Second call with same payload — must succeed silently. 
@@ -842,13 +936,21 @@ async def test_e11_flow_id_mismatch_ephemeral_false_status_ok(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate 14%", intent="Apply 14% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate 14%", + intent="Apply 14% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -861,14 +963,16 @@ async def test_e11_flow_id_mismatch_ephemeral_false_status_ok(_eph_repo): rc = await handle_resolve_compliance( ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "stale flow", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "stale flow", + } + ], flow_id=stale_flow_id, ) assert rc.accepted, f"Expected accepted despite flow_id mismatch, got: {rc.rejected}" @@ -919,13 +1023,21 @@ async def test_e12_feature_branch_reflected_drift_not_detected(_eph_repo): # calc.py IS in changed_files → pending check surfaced → we can verify it. 
ingest = await handle_ingest( ctx, - _payload(repo, text="Rate 20%", intent="Rate policy", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate 20%", + intent="Rate policy", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc1 = await handle_link_commit(ctx, "HEAD") @@ -989,13 +1101,21 @@ async def test_e13_rebase_same_hash_verdict_survives(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Tax calc", intent="Compute 7% tax", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "tax", - "start_line": 4, "end_line": 5, - "type": "function", "purpose": "tax", - }]), + _payload( + repo, + text="Tax calc", + intent="Compute 7% tax", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "tax", + "start_line": 4, + "end_line": 5, + "type": "function", + "purpose": "tax", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc1 = await handle_link_commit(ctx, "HEAD") @@ -1062,13 +1182,21 @@ async def test_e14_deleted_branch_verdict_survives(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate 16%", intent="16% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate 16%", + intent="16% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -1136,13 +1264,21 @@ async def 
test_e15_custom_authoritative_ref_non_ephemeral(_eph_repo, monkeypatch ingest = await handle_ingest( ctx, - _payload(repo, text="Rate 19%", intent="19% rate on develop", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate 19%", + intent="19% rate on develop", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc = await handle_link_commit(ctx, "HEAD") @@ -1191,25 +1327,29 @@ async def test_e16_resolve_compliance_without_link_commit(_eph_repo): ctx = BicameralContext.from_env() decision_id, region_id, bind_hash = await _ingest_and_bind( - ctx, repo, + ctx, + repo, intent="Direct resolve no link_commit", file_path="src/calc.py", symbol_name="rate", - start_line=1, end_line=2, + start_line=1, + end_line=2, ) # Call resolve_compliance directly (no link_commit, no flow_id). 
rc = await handle_resolve_compliance( ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": region_id, - "content_hash": bind_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "direct resolve", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": region_id, + "content_hash": bind_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "direct resolve", + } + ], ) assert rc.accepted, f"Direct resolve rejected: {rc.rejected}" @@ -1243,13 +1383,21 @@ async def test_e17_ephemeral_first_write_wins_flag_stuck(_eph_repo): ctx = BicameralContext.from_env() ingest = await handle_ingest( ctx, - _payload(repo, text="Rate 17%", intent="17% rate", - code_regions=[{ - "file_path": "src/calc.py", - "symbol": "rate", - "start_line": 1, "end_line": 2, - "type": "function", "purpose": "rate", - }]), + _payload( + repo, + text="Rate 17%", + intent="17% rate", + code_regions=[ + { + "file_path": "src/calc.py", + "symbol": "rate", + "start_line": 1, + "end_line": 2, + "type": "function", + "purpose": "rate", + } + ], + ), ) decision_id = ingest.created_decisions[0].decision_id lc_feat = await handle_link_commit(ctx, "HEAD") @@ -1275,14 +1423,16 @@ async def test_e17_ephemeral_first_write_wins_flag_stuck(_eph_repo): rc_main = await handle_resolve_compliance( ctx, phase="drift", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": feature_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "main confirmation", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": feature_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "main confirmation", + } + ], # No flow_id — ctx is on main, no pending_ephemeral in sync_state ) assert rc_main.accepted @@ -1318,9 +1468,7 @@ async def test_e18_bind_branch_local_file(_eph_repo): repo = _eph_repo _checkout(repo, "feat/new-module", 
create=True) - (repo / "src/new_module.py").write_text( - "def compute(x: int) -> int:\n return x * 2\n" - ) + (repo / "src/new_module.py").write_text("def compute(x: int) -> int:\n return x * 2\n") _commit(repo, "add new_module.py (branch-only file)") ctx = BicameralContext.from_env() @@ -1332,19 +1480,22 @@ async def test_e18_bind_branch_local_file(_eph_repo): assert ingest.ingested decision_id = ingest.created_decisions[0].decision_id - bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - "file_path": "src/new_module.py", - "symbol_name": "compute", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": "src/new_module.py", + "symbol_name": "compute", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings, "no bind results" b = bind_resp.bindings[0] - assert not b.error, ( - f"bind must succeed for a branch-local file; got error: {b.error}" - ) + assert not b.error, f"bind must succeed for a branch-local file; got error: {b.error}" assert b.content_hash, "content_hash must be non-empty after successful bind" @@ -1395,13 +1546,18 @@ async def test_e19_bind_modified_function_uses_branch_hash(_eph_repo): assert ingest.ingested decision_id = ingest.created_decisions[0].decision_id - bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - "file_path": "src/calc.py", - "symbol_name": "rate", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": "src/calc.py", + "symbol_name": "rate", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings, "no bind results" b = bind_resp.bindings[0] @@ -1462,13 +1618,18 @@ async def test_e20_bind_link_commit_hash_consistency_no_phantom_drift(_eph_repo) assert ingest.ingested decision_id = ingest.created_decisions[0].decision_id - bind_resp = await handle_bind(ctx, [{ - "decision_id": decision_id, - 
"file_path": "src/calc.py", - "symbol_name": "rate", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx, + [ + { + "decision_id": decision_id, + "file_path": "src/calc.py", + "symbol_name": "rate", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings and not bind_resp.bindings[0].error bind_hash = bind_resp.bindings[0].content_hash assert bind_hash, "bind must return content_hash" @@ -1482,7 +1643,7 @@ async def test_e20_bind_link_commit_hash_consistency_no_phantom_drift(_eph_repo) # First link_commit: surfaces pending check at H_branch. lc1 = await handle_link_commit(ctx, "HEAD") pending = [p for p in lc1.pending_compliance_checks if p.decision_id == decision_id] - assert pending, f"link_commit must surface pending check for the bound decision" + assert pending, "link_commit must surface pending check for the bound decision" assert pending[0].content_hash == bind_hash, ( f"pending_check.content_hash ({pending[0].content_hash[:8]}) must equal " f"bind_result.content_hash ({bind_hash[:8]}) — hash consistency invariant" @@ -1492,14 +1653,16 @@ async def test_e20_bind_link_commit_hash_consistency_no_phantom_drift(_eph_repo) rc = await handle_resolve_compliance( ctx, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "branch content verified", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "branch content verified", + } + ], flow_id=lc1.flow_id, ) assert rc.accepted, f"resolve_compliance rejected: {rc.rejected}" @@ -1565,8 +1728,7 @@ async def test_e21_ungrounded_feature_bind_reflected_ephemeral(_eph_repo): # Engineer creates feature branch and writes the implementation. 
_checkout(repo, "feat/cap-discount", create=True) (repo / "src/calc.py").write_text( - "def rate(order_total: float) -> float:\n" - " return min(order_total * 0.30, order_total)\n" + "def rate(order_total: float) -> float:\n return min(order_total * 0.30, order_total)\n" ) _commit(repo, "cap discount at 30% (feat/cap-discount)") @@ -1579,13 +1741,18 @@ async def test_e21_ungrounded_feature_bind_reflected_ephemeral(_eph_repo): ) # Bind to the implementation on the feature branch. - bind_resp = await handle_bind(ctx_feat, [{ - "decision_id": decision_id, - "file_path": "src/calc.py", - "symbol_name": "rate", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx_feat, + [ + { + "decision_id": decision_id, + "file_path": "src/calc.py", + "symbol_name": "rate", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings and not bind_resp.bindings[0].error, ( f"bind must succeed on feature branch: " f"{bind_resp.bindings[0].error if bind_resp.bindings else 'no results'}" @@ -1607,14 +1774,16 @@ async def test_e21_ungrounded_feature_bind_reflected_ephemeral(_eph_repo): rc = await handle_resolve_compliance( ctx_feat, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "cap implementation verified", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "cap implementation verified", + } + ], flow_id=lc.flow_id, ) assert rc.accepted, f"resolve_compliance rejected: {rc.rejected}" @@ -1675,21 +1844,25 @@ async def test_e22_switch_to_main_no_stale_reflected(_eph_repo): # Feature branch: implement + bind + resolve → reflected, ephemeral=True. 
_checkout(repo, "feat/cap-v2", create=True) (repo / "src/calc.py").write_text( - "def rate(order_total: float) -> float:\n" - " return min(order_total * 0.30, order_total)\n" + "def rate(order_total: float) -> float:\n return min(order_total * 0.30, order_total)\n" ) _commit(repo, "cap at 30%") # Fresh ctx on the feature branch. ctx_feat = BicameralContext.from_env() - bind_resp = await handle_bind(ctx_feat, [{ - "decision_id": decision_id, - "file_path": "src/calc.py", - "symbol_name": "rate", - "start_line": 1, - "end_line": 2, - }]) + bind_resp = await handle_bind( + ctx_feat, + [ + { + "decision_id": decision_id, + "file_path": "src/calc.py", + "symbol_name": "rate", + "start_line": 1, + "end_line": 2, + } + ], + ) assert bind_resp.bindings and not bind_resp.bindings[0].error lc_feat = await handle_link_commit(ctx_feat, "HEAD") @@ -1700,14 +1873,16 @@ async def test_e22_switch_to_main_no_stale_reflected(_eph_repo): rc = await handle_resolve_compliance( ctx_feat, phase="ingest", - verdicts=[{ - "decision_id": decision_id, - "region_id": pending[0].region_id, - "content_hash": pending[0].content_hash, - "verdict": "compliant", - "confidence": "high", - "explanation": "verified on branch", - }], + verdicts=[ + { + "decision_id": decision_id, + "region_id": pending[0].region_id, + "content_hash": pending[0].content_hash, + "verdict": "compliant", + "confidence": "high", + "explanation": "verified on branch", + } + ], flow_id=lc_feat.flow_id, ) assert rc.accepted diff --git a/tests/test_extract_headless.py b/tests/test_extract_headless.py index a7d5878e..9916be67 100644 --- a/tests/test_extract_headless.py +++ b/tests/test_extract_headless.py @@ -8,6 +8,7 @@ Network-dependent end-to-end tests live in CI only, gated on ANTHROPIC_API_KEY being present. 
""" + from __future__ import annotations import json @@ -101,9 +102,7 @@ def test_cache_hit_returns_without_auth(monkeypatch): monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) try: - result = extract_from_current_skill( - transcript, source_ref="test", skill_md_path=skill_md - ) + result = extract_from_current_skill(transcript, source_ref="test", skill_md_path=skill_md) finally: cache_file.unlink(missing_ok=True) diff --git a/tests/test_extraction_metrics.py b/tests/test_extraction_metrics.py index fb9a9d9b..56028090 100644 --- a/tests/test_extraction_metrics.py +++ b/tests/test_extraction_metrics.py @@ -3,6 +3,7 @@ Exercises the fuzzy matching, 1:1 assignment, and aggregate math with synthetic extracted/fixture pairs. No network, no fixture files on disk. """ + from __future__ import annotations import sys @@ -33,14 +34,18 @@ def test_skipped_when_fixture_absent(): def test_perfect_match_is_p1_r1_f1_1(): - fixture = _f([ - "Add 12-second timeout to payment authorize calls", - "Emit payment.timeout event via EventBus", - ]) - extracted = _e([ - "Add 12-second timeout to payment authorize calls", - "Emit payment.timeout event via EventBus", - ]) + fixture = _f( + [ + "Add 12-second timeout to payment authorize calls", + "Emit payment.timeout event via EventBus", + ] + ) + extracted = _e( + [ + "Add 12-second timeout to payment authorize calls", + "Emit payment.timeout event via EventBus", + ] + ) out = compute_extraction_metrics(extracted, fixture, matcher="rapidfuzz") assert out["skipped"] is False assert out["true_positives"] == 2 @@ -74,16 +79,20 @@ def test_low_similarity_is_false_positive_and_false_negative(): def test_partial_match_mixed_precision_and_recall(): - fixture = _f([ - "Add timeout to authorize calls", - "Emit timeout event via EventBus", - "Drop garbage provider responses", - ]) - extracted = _e([ - "Add timeout to authorize calls", # TP - "Drop garbage provider responses", # TP - "Use circuit breaker for rate limiting", # FP - ]) + fixture = 
_f( + [ + "Add timeout to authorize calls", + "Emit timeout event via EventBus", + "Drop garbage provider responses", + ] + ) + extracted = _e( + [ + "Add timeout to authorize calls", # TP + "Drop garbage provider responses", # TP + "Use circuit breaker for rate limiting", # FP + ] + ) out = compute_extraction_metrics(extracted, fixture, matcher="rapidfuzz") assert out["true_positives"] == 2 assert out["false_positives"] == 1 @@ -95,10 +104,12 @@ def test_partial_match_mixed_precision_and_recall(): def test_one_to_one_matching_prevents_double_counting(): """If two extracted items both look like one fixture item, only one wins.""" fixture = _f(["Add 12-second timeout to payment authorize calls"]) - extracted = _e([ - "Add 12-second timeout to payment authorize calls", - "Add a 12-second timeout to authorize calls in payments", # very similar - ]) + extracted = _e( + [ + "Add 12-second timeout to payment authorize calls", + "Add a 12-second timeout to authorize calls in payments", # very similar + ] + ) out = compute_extraction_metrics(extracted, fixture, matcher="rapidfuzz") assert out["true_positives"] == 1 # not 2 assert out["false_positives"] == 1 # the second one doesn't match anything new @@ -109,13 +120,21 @@ def test_aggregate_sums_across_scored_and_ignores_skipped(): per_transcript = [ { "skipped": False, - "true_positives": 3, "false_positives": 1, "false_negatives": 2, - "precision": 0.75, "recall": 0.6, "f1": 0.667, + "true_positives": 3, + "false_positives": 1, + "false_negatives": 2, + "precision": 0.75, + "recall": 0.6, + "f1": 0.667, }, { "skipped": False, - "true_positives": 5, "false_positives": 0, "false_negatives": 1, - "precision": 1.0, "recall": 0.833, "f1": 0.909, + "true_positives": 5, + "false_positives": 0, + "false_negatives": 1, + "precision": 1.0, + "recall": 0.833, + "f1": 0.909, }, {"skipped": True, "reason": "no fixture"}, ] @@ -126,8 +145,8 @@ def test_aggregate_sums_across_scored_and_ignores_skipped(): assert out["false_positives"] 
== 1 assert out["false_negatives"] == 3 # precision = 8/9, recall = 8/11 - assert abs(out["precision"] - 8/9) < 1e-3 - assert abs(out["recall"] - 8/11) < 1e-3 + assert abs(out["precision"] - 8 / 9) < 1e-3 + assert abs(out["recall"] - 8 / 11) < 1e-3 def test_aggregate_all_skipped_returns_skipped(): @@ -153,18 +172,21 @@ def test_empty_extraction_and_empty_fixture_gives_zero_not_error(): def test_pick_matcher_auto_picks_llm_when_key_present(monkeypatch): from _extraction_metrics import _pick_matcher + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-fake") assert _pick_matcher("auto") == "llm" def test_pick_matcher_auto_falls_back_to_rapidfuzz(monkeypatch): from _extraction_metrics import _pick_matcher + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) assert _pick_matcher("auto") == "rapidfuzz" def test_pick_matcher_explicit_overrides_env(monkeypatch): from _extraction_metrics import _pick_matcher + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-fake") assert _pick_matcher("rapidfuzz") == "rapidfuzz" @@ -227,8 +249,8 @@ def test_llm_match_parses_valid_response_into_pairs(): def test_compute_extraction_metrics_dispatches_to_llm(monkeypatch): """When matcher='llm', compute_extraction_metrics calls llm_match instead of rapidfuzz. We stub llm_match so no network is needed.""" - import _extraction_metrics import _extraction_matcher + import _extraction_metrics actual = _e(["X", "Y", "Z"]) fixture = _f(["P", "Q"]) diff --git a/tests/test_flow4_ledger_validation.py b/tests/test_flow4_ledger_validation.py new file mode 100644 index 00000000..22d1b226 --- /dev/null +++ b/tests/test_flow4_ledger_validation.py @@ -0,0 +1,149 @@ +"""Functionality tests for Flow 4 path-X-(b) ledger validation. + +Tests the pure helper `count_agent_session_decisions` from +`tests/e2e/_ledger_helpers.py` and the merge logic that +`_validate_flow4_via_ledger` applies to a `FlowResult`. 
+""" + +from __future__ import annotations + +import sys +from dataclasses import dataclass +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "tests" / "e2e")) + +from _ledger_helpers import count_agent_session_decisions # noqa: E402 + + +@dataclass +class FlowResultStub: + flow_id: str + passed: bool + verdict_reason: str + body: str + + +def test_counts_zero_when_no_agent_session_decisions(): + snapshot = { + "decisions": [ + {"decision_id": "d1", "source_type": "manual"}, + {"decision_id": "d2", "source_type": "transcript"}, + ] + } + assert count_agent_session_decisions(snapshot) == 0 + + +def test_counts_only_agent_session_decisions(): + snapshot = { + "decisions": [ + {"decision_id": "d1", "source_type": "agent_session"}, + {"decision_id": "d2", "source_type": "manual"}, + {"decision_id": "d3", "source_type": "agent_session"}, + {"decision_id": "d4", "source_type": "transcript"}, + {"decision_id": "d5", "source_type": "manual"}, + {"decision_id": "d6", "source_type": "manual"}, + {"decision_id": "d7", "source_type": "manual"}, + {"decision_id": "d8", "source_type": "agent_session"}, + ] + } + assert count_agent_session_decisions(snapshot) == 3 + + +def test_handles_missing_source_type_field(): + snapshot = { + "decisions": [ + {"decision_id": "d1"}, # legacy row, no source_type + {"decision_id": "d2", "source_type": "agent_session"}, + {"decision_id": "d3", "source_type": None}, + ] + } + assert count_agent_session_decisions(snapshot) == 1 + + +def test_handles_error_snapshot(): + snapshot = {"error": "connection failed"} + assert count_agent_session_decisions(snapshot) is None + + +def _merge(flow: FlowResultStub, snapshot: dict) -> None: + """Mirror of `_validate_flow4_via_ledger`'s merge logic on a stub + FlowResult, so unit tests exercise the merge invariants without + importing the full harness module.""" + count = count_agent_session_decisions(snapshot) + if count is None: + flow.body += 
( + f"\n— Ledger validation —\nINCONCLUSIVE: ledger query failed: {snapshot.get('error')}\n" + ) + return + if count > 0: + if not flow.passed: + flow.passed = True + flow.verdict_reason = ( + f"in-stream asserter FAIL but SessionEnd subprocess effect " + f"observed in ledger ({count} agent_session decisions, path-X-b)" + ) + flow.body += ( + f"\n— Ledger validation —\n" + f"PASS: {count} decision(s) with source_type='agent_session' " + f"present in ledger after harness completion (path-X-b: SessionEnd " + f"subprocess and/or in-session capture-corrections wrote them).\n" + ) + else: + flow.body += ( + "\n— Ledger validation —\n" + "path-X-b absent: zero decisions with source_type='agent_session' " + "after harness completion. SessionEnd subprocess either did not " + "fire, did not detect uningested corrections, or failed silently.\n" + ) + + +def test_validate_merges_pass_into_flow4_result(): + """Asserter FAIL + ledger has agent_session → upgrade to PASS.""" + flow = FlowResultStub( + flow_id="Flow 4", + passed=False, + verdict_reason="initial", + body="initial body", + ) + snapshot = { + "decisions": [ + {"decision_id": "d1", "source_type": "agent_session"}, + {"decision_id": "d2", "source_type": "agent_session"}, + ] + } + _merge(flow, snapshot) + assert flow.passed is True + assert "SessionEnd subprocess effect observed" in flow.verdict_reason + assert "agent_session" in flow.body + + +def test_validate_preserves_existing_pass(): + """Asserter PASS + ledger has agent_session → keep PASS, append note only.""" + flow = FlowResultStub( + flow_id="Flow 4", + passed=True, + verdict_reason="initial", + body="initial body", + ) + snapshot = {"decisions": [{"decision_id": "d1", "source_type": "agent_session"}]} + _merge(flow, snapshot) + assert flow.passed is True + assert flow.verdict_reason == "initial" + assert "Ledger validation" in flow.body + + +def test_validate_handles_inconclusive_ledger(): + """Ledger query error → INCONCLUSIVE annotation, verdict 
unchanged.""" + flow = FlowResultStub( + flow_id="Flow 4", + passed=False, + verdict_reason="initial", + body="initial body", + ) + snapshot = {"error": "connection failed"} + _merge(flow, snapshot) + assert flow.passed is False + assert flow.verdict_reason == "initial" + assert "INCONCLUSIVE" in flow.body diff --git a/tests/test_link_commit_grounding.py b/tests/test_link_commit_grounding.py index f96deba4..2d2aa9a1 100644 --- a/tests/test_link_commit_grounding.py +++ b/tests/test_link_commit_grounding.py @@ -6,6 +6,7 @@ 2. test_pending_grounding_checks_symbol_not_found — ingest a decision with a binding, then simulate symbol disappearing → link_commit emits grounding check for that decision """ + from __future__ import annotations import subprocess @@ -19,7 +20,6 @@ from handlers.ingest import handle_ingest from handlers.link_commit import handle_link_commit - # ── Helpers ─────────────────────────────────────────────────────────────────── @@ -178,6 +178,7 @@ async def test_pending_grounding_checks_symbol_not_found(_isolated_ledger): # Invalidate the within-call sync cache so the handler runs a real sweep from handlers.link_commit import invalidate_sync_cache + invalidate_sync_cache(ctx) # Simulate the old symbol (fetch_user) not being found in the new commit diff --git a/tests/test_local_counters.py b/tests/test_local_counters.py index 1b804204..fc7c8d25 100644 --- a/tests/test_local_counters.py +++ b/tests/test_local_counters.py @@ -18,7 +18,9 @@ def test_increment_creates_counter_file(tmp_path: Path, monkeypatch: pytest.Monk monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) local_counters.increment("bicameral-ingest") @@ -33,7 +35,9 @@ def test_increment_appends(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> N monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + 
importlib.reload(local_counters) for _ in range(50): @@ -46,7 +50,9 @@ def test_read_counters_aggregates(tmp_path: Path, monkeypatch: pytest.MonkeyPatc monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) for _ in range(3): @@ -63,7 +69,9 @@ def test_no_network_calls(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> No monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) with patch("urllib.request.urlopen", side_effect=RuntimeError("net down")): @@ -71,11 +79,15 @@ def test_no_network_calls(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> No assert _counters_path(tmp_path).exists() -def test_concurrent_increments_no_data_loss(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_concurrent_increments_no_data_loss( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) def _worker(idx: int) -> None: @@ -97,18 +109,24 @@ def test_disabled_when_env_off(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) monkeypatch.setenv("USERPROFILE", str(tmp_path)) monkeypatch.setenv("BICAMERAL_LOCAL_COUNTERS", "0") import importlib + import local_counters + importlib.reload(local_counters) local_counters.increment("bicameral-ingest") assert not _counters_path(tmp_path).exists() -def test_read_counters_handles_missing_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_read_counters_handles_missing_file( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) assert 
local_counters.read_counters() == {} diff --git a/tests/test_phase1_code_locator.py b/tests/test_phase1_code_locator.py index e5b7e7fd..860ad283 100644 --- a/tests/test_phase1_code_locator.py +++ b/tests/test_phase1_code_locator.py @@ -16,7 +16,6 @@ from adapters.code_locator import get_code_locator - # ── Real adapter tests (Phase 1 — require indexed repo) ───────────── @@ -71,6 +70,7 @@ def test_get_neighbors_returns_valid_edges(monkeypatch, repo_path): # ── extract_symbols ────────────────────────────────────────────────── + @pytest.mark.phase1 @pytest.mark.asyncio async def test_extract_symbols_from_known_file(monkeypatch, repo_path): diff --git a/tests/test_phase1_l1_wiring.py b/tests/test_phase1_l1_wiring.py index bf15afb8..08072904 100644 --- a/tests/test_phase1_l1_wiring.py +++ b/tests/test_phase1_l1_wiring.py @@ -31,7 +31,6 @@ from handlers.decision_status import handle_decision_status from handlers.link_commit import handle_link_commit - # ── Tiny git repo fixture ───────────────────────────────────────────── @@ -178,12 +177,10 @@ async def test_ingest_of_existing_symbol_is_pending_until_verified(_isolated_led ctx = _ctx() status = await handle_decision_status(ctx, filter="all") assert status.summary.get("reflected", 0) == 0, ( - f"v3 must not auto-promote to REFLECTED without a verdict, " - f"got summary={status.summary!r}" + f"v3 must not auto-promote to REFLECTED without a verdict, got summary={status.summary!r}" ) assert status.summary.get("pending", 0) == 1, ( - f"Expected 1 pending intent (grounded but unverified), " - f"got summary={status.summary!r}" + f"Expected 1 pending intent (grounded but unverified), got summary={status.summary!r}" ) @@ -221,8 +218,7 @@ async def test_hash_change_alone_does_not_flip_status_without_verdict(_isolated_ ctx = _ctx() pre = await handle_decision_status(ctx, filter="all") assert pre.summary.get("pending", 0) == 1, ( - f"Pre-edit baseline is PENDING under v3 (grounded, unverified), " - f"got summary={pre.summary!r}" 
+ f"Pre-edit baseline is PENDING under v3 (grounded, unverified), got summary={pre.summary!r}" ) # Invert the discount threshold — real semantic change, not cosmetic @@ -335,14 +331,11 @@ async def test_backfill_restores_hash_but_stays_pending_without_verdict(_isolate f"got summary={status.summary!r}" ) assert status.summary.get("pending", 0) == 1, ( - f"Post-backfill region is hashed but unverified → PENDING, " - f"got summary={status.summary!r}" + f"Post-backfill region is hashed but unverified → PENDING, got summary={status.summary!r}" ) # Defensive: confirm backfill actually re-stamped the content_hash # (the cache-key is now populated even though the verdict isn't). post_rows = await client.query("SELECT content_hash FROM code_region") hashes = [r.get("content_hash", "") for r in post_rows] - assert any(h for h in hashes), ( - f"Backfill should have populated content_hash, got {hashes!r}" - ) + assert any(h for h in hashes), f"Backfill should have populated content_hash, got {hashes!r}" diff --git a/tests/test_phase2_ledger.py b/tests/test_phase2_ledger.py index ce66a558..7b639e28 100644 --- a/tests/test_phase2_ledger.py +++ b/tests/test_phase2_ledger.py @@ -33,6 +33,7 @@ def _ctx(): # ── Adapter availability ────────────────────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_real_ledger_adapter_instantiates(monkeypatch, surreal_url): @@ -45,6 +46,7 @@ async def test_real_ledger_adapter_instantiates(monkeypatch, surreal_url): # ── Ingestion idempotency ───────────────────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_ingest_payload_creates_intent_node(monkeypatch, surreal_url, minimal_payload): @@ -87,6 +89,7 @@ async def test_ingest_is_idempotent(monkeypatch, surreal_url, minimal_payload): # ── BM25 search ─────────────────────────────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_bm25_search_finds_ingested_intent(monkeypatch, surreal_url): @@ 
-99,24 +102,47 @@ async def test_bm25_search_finds_ingested_intent(monkeypatch, surreal_url): await ledger.connect() desc = "exponential backoff retry on webhook failure" - await ledger.ingest_payload({ - "query": desc, "repo": "test-repo", "commit_hash": "bm25test", - "analyzed_at": "2026-03-27T12:00:00Z", - "mappings": [{ - "span": {"span_id": "bm25-0", "source_type": "transcript", "text": desc, "speaker": "", "source_ref": ""}, - "intent": desc, "symbols": ["WebhookDispatcher.send"], - "code_regions": [{"file_path": "webhooks/dispatcher.py", "symbol": "WebhookDispatcher.send", - "type": "function", "start_line": 134, "end_line": 180, "purpose": "dispatch"}], - "dependency_edges": [], - }], - }) - - results = await ledger.search_by_query("retry webhook backoff", max_results=10, min_confidence=0.1) + await ledger.ingest_payload( + { + "query": desc, + "repo": "test-repo", + "commit_hash": "bm25test", + "analyzed_at": "2026-03-27T12:00:00Z", + "mappings": [ + { + "span": { + "span_id": "bm25-0", + "source_type": "transcript", + "text": desc, + "speaker": "", + "source_ref": "", + }, + "intent": desc, + "symbols": ["WebhookDispatcher.send"], + "code_regions": [ + { + "file_path": "webhooks/dispatcher.py", + "symbol": "WebhookDispatcher.send", + "type": "function", + "start_line": 134, + "end_line": 180, + "purpose": "dispatch", + } + ], + "dependency_edges": [], + } + ], + } + ) + + results = await ledger.search_by_query( + "retry webhook backoff", max_results=10, min_confidence=0.1 + ) assert len(results) > 0, "BM25 returned no results for recently ingested intent" descs = [r["description"] for r in results] - assert any("webhook" in d.lower() or "retry" in d.lower() or "backoff" in d.lower() for d in descs), ( - f"Relevant intent not surfaced by BM25. Got: {descs}" - ) + assert any( + "webhook" in d.lower() or "retry" in d.lower() or "backoff" in d.lower() for d in descs + ), f"Relevant intent not surfaced by BM25. 
Got: {descs}" @pytest.mark.phase2 @@ -136,6 +162,7 @@ async def test_bm25_min_confidence_filters_results(monkeypatch, surreal_url): # ── Reverse traversal: file → decisions ────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_file_reverse_traversal_finds_decision(monkeypatch, surreal_url): @@ -149,17 +176,38 @@ async def test_file_reverse_traversal_finds_decision(monkeypatch, surreal_url): file_path = "payments/processor.py" desc = "optimistic locking for cart updates" - await ledger.ingest_payload({ - "query": desc, "repo": "test-repo", "commit_hash": "reversetest", - "analyzed_at": "2026-03-27T12:00:00Z", - "mappings": [{ - "span": {"span_id": "rev-0", "source_type": "transcript", "text": desc, "speaker": "", "source_ref": ""}, - "intent": desc, "symbols": ["CartService.updateItem"], - "code_regions": [{"file_path": file_path, "symbol": "CartService.updateItem", - "type": "function", "start_line": 87, "end_line": 120, "purpose": "cart update"}], - "dependency_edges": [], - }], - }) + await ledger.ingest_payload( + { + "query": desc, + "repo": "test-repo", + "commit_hash": "reversetest", + "analyzed_at": "2026-03-27T12:00:00Z", + "mappings": [ + { + "span": { + "span_id": "rev-0", + "source_type": "transcript", + "text": desc, + "speaker": "", + "source_ref": "", + }, + "intent": desc, + "symbols": ["CartService.updateItem"], + "code_regions": [ + { + "file_path": file_path, + "symbol": "CartService.updateItem", + "type": "function", + "start_line": 87, + "end_line": 120, + "purpose": "cart update", + } + ], + "dependency_edges": [], + } + ], + } + ) decisions = await ledger.get_decisions_for_file(file_path) assert len(decisions) > 0, f"No decisions found for {file_path!r} via reverse traversal" @@ -182,6 +230,7 @@ async def test_unknown_file_returns_empty(monkeypatch, surreal_url): # ── link_commit idempotency ─────────────────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def 
test_link_commit_idempotent(monkeypatch, surreal_url): @@ -223,6 +272,7 @@ async def test_link_commit_updates_sync_cursor(monkeypatch, surreal_url): # ── decision_status via real graph ──────────────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_decision_status_reflects_ingested_data(monkeypatch, surreal_url, minimal_payload): @@ -262,27 +312,41 @@ async def test_ungrounded_intent_has_correct_status(monkeypatch, surreal_url): await ledger.connect() desc = "zzqx qqzzyy nonsensetoken glarbflumph deliberate-gibberish wlrdpfnz" - await ledger.ingest_payload({ - "query": desc, "repo": "test-repo", "commit_hash": "unground01", - "analyzed_at": "2026-03-27T12:00:00Z", - "mappings": [{ - "span": {"span_id": "ug-0", "source_type": "transcript", "text": desc, "speaker": "", "source_ref": ""}, - "intent": desc, "symbols": [], "code_regions": [], "dependency_edges": [], - }], - }) + await ledger.ingest_payload( + { + "query": desc, + "repo": "test-repo", + "commit_hash": "unground01", + "analyzed_at": "2026-03-27T12:00:00Z", + "mappings": [ + { + "span": { + "span_id": "ug-0", + "source_type": "transcript", + "text": desc, + "speaker": "", + "source_ref": "", + }, + "intent": desc, + "symbols": [], + "code_regions": [], + "dependency_edges": [], + } + ], + } + ) # Query the ledger directly — handle_decision_status auto-syncs via # link_commit which triggers _reground_ungrounded, potentially changing # the status before we can assert on it. ungrounded = await ledger.get_all_decisions(filter="ungrounded") descs = [d.get("description", "") for d in ungrounded] - assert any(desc in d for d in descs), ( - f"Expected {desc!r} in ungrounded filter. Got: {descs}" - ) + assert any(desc in d for d in descs), f"Expected {desc!r} in ungrounded filter. 
Got: {descs}" # ── detect_drift with real reverse traversal ────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_detect_drift_returns_decisions_for_ingested_file(monkeypatch, surreal_url): @@ -296,17 +360,38 @@ async def test_detect_drift_returns_decisions_for_ingested_file(monkeypatch, sur file_path = "services/checkout.py" desc = "rate limit checkout endpoint" - await ledger.ingest_payload({ - "query": desc, "repo": "test-repo", "commit_hash": "drift001", - "analyzed_at": "2026-03-27T12:00:00Z", - "mappings": [{ - "span": {"span_id": "d-0", "source_type": "transcript", "text": desc, "speaker": "", "source_ref": "mtg-001"}, - "intent": desc, "symbols": ["CheckoutService.process"], - "code_regions": [{"file_path": file_path, "symbol": "CheckoutService.process", - "type": "function", "start_line": 45, "end_line": 90, "purpose": "checkout"}], - "dependency_edges": [], - }], - }) + await ledger.ingest_payload( + { + "query": desc, + "repo": "test-repo", + "commit_hash": "drift001", + "analyzed_at": "2026-03-27T12:00:00Z", + "mappings": [ + { + "span": { + "span_id": "d-0", + "source_type": "transcript", + "text": desc, + "speaker": "", + "source_ref": "mtg-001", + }, + "intent": desc, + "symbols": ["CheckoutService.process"], + "code_regions": [ + { + "file_path": file_path, + "symbol": "CheckoutService.process", + "type": "function", + "start_line": 45, + "end_line": 90, + "purpose": "checkout", + } + ], + "dependency_edges": [], + } + ], + } + ) ctx = _ctx() result = await handle_detect_drift(ctx, file_path) @@ -326,7 +411,9 @@ async def test_source_cursor_upserts_after_ingest(monkeypatch, surreal_url, mini from handlers.ingest import handle_ingest ctx = _ctx() - result = await handle_ingest(ctx, minimal_payload, source_scope="slack:C123", cursor="1743210021.123") + result = await handle_ingest( + ctx, minimal_payload, source_scope="slack:C123", cursor="1743210021.123" + ) assert result.source_cursor is not None assert 
result.source_cursor.repo == "test-repo" @@ -338,6 +425,7 @@ async def test_source_cursor_upserts_after_ingest(monkeypatch, surreal_url, mini # ── M1 decision-relevance instrumentation ──────────────────────────── + @pytest.mark.phase2 @pytest.mark.asyncio async def test_ingest_stats_populates_grounded_fields( @@ -346,6 +434,7 @@ async def test_ingest_stats_populates_grounded_fields( """handle_ingest must populate stats.grounded + stats.grounded_pct and emit a [ingest] complete log line. This is the M1 instrumentation gate.""" import logging + monkeypatch.setenv("USE_REAL_LEDGER", "1") monkeypatch.setenv("SURREAL_URL", surreal_url) diff --git a/tests/test_phase3_integration.py b/tests/test_phase3_integration.py index 253404db..c3a363b0 100644 --- a/tests/test_phase3_integration.py +++ b/tests/test_phase3_integration.py @@ -107,6 +107,7 @@ def _response_dict(response) -> dict: # ── Real code locator helpers ──────────────────────────────────────── + def _locate_hits(adapter, query_str: str, limit: int = 2) -> list[dict]: """Resolve a bag-of-words query to {file_path, symbol_name, line_number} hits for test payload construction. 
@@ -134,12 +135,14 @@ def _locate_hits(adapter, query_str: str, limit: int = 2) -> list[dict]: row = db.lookup_by_id(sid) if row is None: continue - hits.append({ - "file_path": row["file_path"], - "symbol_name": row["name"], - "line_number": row["start_line"], - "score": v.get("match_score", 0) / 100.0, - }) + hits.append( + { + "file_path": row["file_path"], + "symbol_name": row["name"], + "line_number": row["start_line"], + "score": v.get("match_score", 0) / 100.0, + } + ) if len(hits) >= limit: break return hits @@ -175,30 +178,34 @@ def _build_payload_from_real_code( sym = hit.get("symbol_name", "") line = hit.get("line_number", 1) if fp: - code_regions.append({ - "file_path": fp, - "symbol": sym or fp.split("/")[-1], - "type": "function", - "start_line": line, - "end_line": line + 20, - "purpose": f"Located from search terms: {item['search']!r}", - }) + code_regions.append( + { + "file_path": fp, + "symbol": sym or fp.split("/")[-1], + "type": "function", + "start_line": line, + "end_line": line + 20, + "purpose": f"Located from search terms: {item['search']!r}", + } + ) if sym: symbols.append(sym) - mappings.append({ - "span": { - "span_id": f"e2e-{i}", - "source_type": source_type, - "text": item["text"], - "speaker": item.get("speaker", ""), - "source_ref": source_ref, - }, - "intent": item["intent"], - "symbols": symbols, - "code_regions": code_regions, - "dependency_edges": [], - }) + mappings.append( + { + "span": { + "span_id": f"e2e-{i}", + "source_type": source_type, + "text": item["text"], + "speaker": item.get("speaker", ""), + "source_ref": source_ref, + }, + "intent": item["intent"], + "symbols": symbols, + "code_regions": code_regions, + "dependency_edges": [], + } + ) return { "query": query, @@ -215,6 +222,7 @@ def _build_payload_from_real_code( # Tool: bicameral.search — pre-flight before coding # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def 
test_constraint_lost__search_surfaces_prior_decisions(ctx): @@ -278,6 +286,7 @@ async def test_constraint_lost__search_surfaces_prior_decisions(ctx): # Tool: bicameral.ingest — normalizes intent from multiple sources # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def test_context_scattered__ingest_unifies_sources(ctx): @@ -355,6 +364,7 @@ async def test_context_scattered__ingest_unifies_sources(ctx): # Tool: bicameral.status — tracks decided vs built, surfaces ungrounded # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def test_decision_undocumented__status_surfaces_ungrounded(ctx): @@ -414,6 +424,7 @@ async def test_decision_undocumented__status_surfaces_ungrounded(ctx): # Tool: search + code locator — retrieves full decision provenance # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def test_repeated_explanation__search_returns_full_provenance(ctx): @@ -471,6 +482,7 @@ async def test_repeated_explanation__search_returns_full_provenance(ctx): # Tool: bicameral.drift — surfaces institutional memory tied to code # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def test_tribal_knowledge__drift_surfaces_decisions_for_file(ctx): @@ -522,6 +534,7 @@ async def test_tribal_knowledge__drift_surfaces_decisions_for_file(ctx): # INTEGRATION: Full lifecycle + graph integrity # ══════════════════════════════════════════════════════════════════════ + @pytest.mark.phase3 @pytest.mark.asyncio async def test_full_lifecycle_graph_integrity(ctx): @@ -576,7 +589,9 @@ async def test_full_lifecycle_graph_integrity(ctx): _dump("06_lifecycle_03_status", _response_dict(r_status)) # Step 4: Search - r_search = await handle_search_decisions(ctx, query="BM25 search provenance", min_confidence=0.1) + 
r_search = await handle_search_decisions( + ctx, query="BM25 search provenance", min_confidence=0.1 + ) assert len(r_search.matches) >= 1 _dump("06_lifecycle_04_search", _response_dict(r_search)) diff --git a/tests/test_pollution_bug.py b/tests/test_pollution_bug.py index 94e3102b..b60ddb12 100644 --- a/tests/test_pollution_bug.py +++ b/tests/test_pollution_bug.py @@ -30,13 +30,16 @@ from handlers.ingest import handle_ingest from handlers.link_commit import handle_link_commit - # ── Tiny git repo fixture with main + feature branch ───────────────── def _git(cwd: Path, *args: str, check: bool = True) -> str: result = subprocess.run( - ["git", *args], cwd=cwd, capture_output=True, text=True, check=check, + ["git", *args], + cwd=cwd, + capture_output=True, + text=True, + check=check, ) return result.stdout.strip() @@ -133,7 +136,9 @@ def _payload(repo: Path) -> dict: @pytest.mark.phase2 @pytest.mark.asyncio async def test_ingest_on_branch_stamps_main_baseline( - monkeypatch, branched_repo, surreal_url, + monkeypatch, + branched_repo, + surreal_url, ): """Bug 3 (F1a) — ``handle_ingest`` from a feature branch must stamp baseline hashes against the authoritative ref (main), not the branch. 
@@ -168,20 +173,27 @@ async def test_ingest_on_branch_stamps_main_baseline( # Query the ledger directly for the stamped content_hash ledger = get_ledger() client = ledger._client - rows = await client.query( - "SELECT content_hash FROM code_region WHERE file_path = 'pricing.py'" - ) + rows = await client.query("SELECT content_hash FROM code_region WHERE file_path = 'pricing.py'") assert len(rows) >= 1, "code_region not created" stamped_hash = rows[0].get("content_hash", "") assert stamped_hash, "content_hash is empty — pollution guard failed upstream" # Compute what main's content hash SHOULD be from ledger.status import compute_content_hash + main_hash = compute_content_hash( - "pricing.py", 1, 4, str(branched_repo), ref=ctx.authoritative_sha, + "pricing.py", + 1, + 4, + str(branched_repo), + ref=ctx.authoritative_sha, ) branch_hash = compute_content_hash( - "pricing.py", 1, 4, str(branched_repo), ref="HEAD", + "pricing.py", + 1, + 4, + str(branched_repo), + ref="HEAD", ) assert main_hash != branch_hash, "test setup broken: branch and main have the same hash" @@ -196,7 +208,9 @@ async def test_ingest_on_branch_stamps_main_baseline( @pytest.mark.phase2 @pytest.mark.asyncio async def test_link_commit_on_branch_runs_read_only( - monkeypatch, branched_repo, surreal_url, + monkeypatch, + branched_repo, + surreal_url, ): """Bug 1 (F1) — ``handle_link_commit`` on a branch must not update stored baseline hashes. Drift is computed for reporting, but the diff --git a/tests/test_post_commit_sync_hook.py b/tests/test_post_commit_sync_hook.py new file mode 100644 index 00000000..bd96d44f --- /dev/null +++ b/tests/test_post_commit_sync_hook.py @@ -0,0 +1,135 @@ +"""Functionality tests for scripts/hooks/post_commit_sync_reminder.py. + +The hook is invoked as a subprocess by Claude Code on every PostToolUse +matching ``Bash``. Tests run it the same way to exercise stdin/stdout +exactly as production does. 
+ +Claude Code 2.x requires PostToolUse hook output shaped as +``{"hookSpecificOutput": {"hookEventName": "PostToolUse", +"additionalContext": "..."}}``. Plain stdout from PostToolUse hooks is +silently dropped to the debug log (per +https://code.claude.com/docs/en/hooks). These tests assert against the +envelope shape — anything else is a broken contract. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +HOOK_SCRIPT = REPO_ROOT / "scripts" / "hooks" / "post_commit_sync_reminder.py" + + +def _run_hook(stdin_text: str) -> tuple[int, str, str]: + proc = subprocess.run( + [sys.executable, str(HOOK_SCRIPT)], + input=stdin_text, + capture_output=True, + text=True, + timeout=10, + ) + return proc.returncode, proc.stdout, proc.stderr + + +def _make_stdin(*, tool_name: str = "Bash", command: str = "") -> str: + return json.dumps({"tool_name": tool_name, "tool_input": {"command": command}}) + + +def _hook_output(parsed: dict) -> dict: + """Extract hookSpecificOutput.additionalContext, asserting envelope shape.""" + assert "hookSpecificOutput" in parsed, ( + f"hook must emit hookSpecificOutput envelope (Claude Code 2.x contract); got {parsed!r}" + ) + inner = parsed["hookSpecificOutput"] + assert inner.get("hookEventName") == "PostToolUse" + return inner + + +def _assert_silent(out: str) -> None: + """No envelope written. 
Tolerate fully-empty stdout or `{}`.""" + if not out.strip(): + return + parsed = json.loads(out) + assert "hookSpecificOutput" not in parsed + + +def test_emits_reminder_on_git_commit(): + rc, out, _ = _run_hook(_make_stdin(command="git commit -m 'feat: add foo'")) + assert rc == 0 + inner = _hook_output(json.loads(out)) + ctx = inner["additionalContext"] + assert "bicameral: new commit detected" in ctx + assert "/bicameral:sync" in ctx + + +def test_emits_reminder_on_git_merge(): + rc, out, _ = _run_hook(_make_stdin(command="git merge feature/foo --no-ff")) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "bicameral: new commit detected" in inner["additionalContext"] + + +def test_emits_reminder_on_git_pull(): + rc, out, _ = _run_hook(_make_stdin(command="git pull origin main")) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "bicameral: new commit detected" in inner["additionalContext"] + + +def test_emits_reminder_on_git_rebase_continue(): + rc, out, _ = _run_hook(_make_stdin(command="git rebase --continue")) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "bicameral: new commit detected" in inner["additionalContext"] + + +def test_silent_on_read_only_git_command(): + """git status, git log, git diff, etc. 
→ silent.""" + for cmd in ["git status", "git log -10", "git diff HEAD", "git branch -a"]: + rc, out, _ = _run_hook(_make_stdin(command=cmd)) + assert rc == 0 + _assert_silent(out) + + +def test_silent_on_non_bash_tool(): + """Hook only fires for Bash; other tools → silent.""" + rc, out, _ = _run_hook(_make_stdin(tool_name="Edit", command="git commit")) + assert rc == 0 + _assert_silent(out) + + +def test_silent_on_non_git_bash_command(): + rc, out, _ = _run_hook(_make_stdin(command="ls -la")) + assert rc == 0 + _assert_silent(out) + + +def test_handles_malformed_stdin(): + rc, out, _ = _run_hook("this is not JSON at all {[}") + assert rc == 0 + _assert_silent(out) + + +def test_handles_missing_tool_input(): + payload = json.dumps({"tool_name": "Bash"}) + rc, out, _ = _run_hook(payload) + assert rc == 0 + _assert_silent(out) + + +def test_handles_non_dict_tool_input(): + payload = json.dumps({"tool_name": "Bash", "tool_input": "git commit"}) + rc, out, _ = _run_hook(payload) + assert rc == 0 + _assert_silent(out) + + +def test_idempotent_on_double_fire(): + stdin = _make_stdin(command="git commit -m 'whatever'") + rc1, out1, _ = _run_hook(stdin) + rc2, out2, _ = _run_hook(stdin) + assert rc1 == rc2 == 0 + assert out1 == out2 diff --git a/tests/test_post_preflight_capture_hook.py b/tests/test_post_preflight_capture_hook.py new file mode 100644 index 00000000..58395653 --- /dev/null +++ b/tests/test_post_preflight_capture_hook.py @@ -0,0 +1,197 @@ +"""Functionality tests for scripts/hooks/post_preflight_capture_reminder.py. + +The hook is invoked as a subprocess by Claude Code on every PostToolUse +matching ``mcp__bicameral__bicameral_preflight``. Tests run it the same +way to exercise stdin/stdout exactly as production does. + +Claude Code 2.x requires PostToolUse hook output shaped as +``{"hookSpecificOutput": {"hookEventName": "PostToolUse", +"additionalContext": "..."}}``. 
Plain stdout from PostToolUse hooks is +silently dropped to the debug log (per +https://code.claude.com/docs/en/hooks — only UserPromptSubmit / +UserPromptExpansion / SessionStart treat raw stdout as agent-visible +context). These tests assert against the envelope shape — anything else +is a broken contract regardless of whether the hook process exits +cleanly. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +HOOK_SCRIPT = REPO_ROOT / "scripts" / "hooks" / "post_preflight_capture_reminder.py" + +PREFLIGHT_TOOL_NAME = "mcp__bicameral__bicameral_preflight" + + +def _run_hook(stdin_text: str) -> tuple[int, str, str]: + """Invoke the hook with stdin_text on stdin; return (rc, stdout, stderr).""" + proc = subprocess.run( + [sys.executable, str(HOOK_SCRIPT)], + input=stdin_text, + capture_output=True, + text=True, + timeout=10, + ) + return proc.returncode, proc.stdout, proc.stderr + + +def _make_stdin(*, fired: bool, decisions: list[dict], response_as_string: bool = False) -> str: + response = {"fired": fired, "decisions": decisions} + payload = { + "tool_name": PREFLIGHT_TOOL_NAME, + "tool_input": {"topic": "reorder commits", "file_paths": ["app/src/lib/git/reorder.ts"]}, + "tool_response": json.dumps(response) if response_as_string else response, + } + return json.dumps(payload) + + +def _hook_output(parsed: dict) -> dict: + """Extract hookSpecificOutput.additionalContext, asserting envelope shape.""" + assert "hookSpecificOutput" in parsed, ( + f"hook must emit hookSpecificOutput envelope (Claude Code 2.x contract); got {parsed!r}" + ) + inner = parsed["hookSpecificOutput"] + assert inner.get("hookEventName") == "PostToolUse" + return inner + + +def test_emits_reminder_when_decisions_surfaced(): + """fired=True with ≥1 decision → envelope with reminder containing each + decision_id + the Step 5.6.1 AskUserQuestion disambiguation template + (per 
#175). The reminder no longer templates the bare ingest+ + resolve_collision sequence — it templates the user-disambiguation + question whose answer drives Step 5.6.2's mechanical capture. + """ + stdin = _make_stdin( + fired=True, + decisions=[ + {"decision_id": "decision:abc123", "description": "Drag-and-drop to reorder commits"}, + {"decision_id": "decision:def456", "description": "Cherry-pick across branches"}, + ], + ) + rc, out, _ = _run_hook(stdin) + assert rc == 0 + inner = _hook_output(json.loads(out)) + ctx = inner["additionalContext"] + assert "<system-reminder>" in ctx + # Surfaced decisions are listed verbatim so the agent can scope the + # disambiguation question. + assert "decision:abc123" in ctx + assert "decision:def456" in ctx + assert "Drag-and-drop to reorder commits" in ctx + # The Step 5.6.1 AskUserQuestion shape is templated. + assert "AskUserQuestion" in ctx + assert "supersede" in ctx and "keep_both" in ctx + assert "unrelated" in ctx + # Branch instructions for Step 5.6.2 are still present so the agent + # knows what to do with each answer. + assert "agent_session" in ctx + assert "resolve_collision" in ctx + + +def test_reminder_routes_judgment_to_user_not_agent(): + """Per #175, the agent must NOT judge contradiction itself — it asks + the user via ``AskUserQuestion`` and acts on the answer mechanically. + Lock the user-disambiguation posture in so future edits don't quietly + regress to ``"you MUST capture"`` (which the agent demonstrably + ignored on borderline prompts) or to the original ``"IF you + contradict ..."`` conditional gate. + """ + stdin = _make_stdin( + fired=True, + decisions=[{"decision_id": "decision:abc", "description": "Some prior decision"}], + ) + _, out, _ = _run_hook(stdin) + ctx = _hook_output(json.loads(out))["additionalContext"] + # Affirmative: judgment moves to the user. 
+ assert "do NOT judge contradiction yourself" in ctx + assert "ask the user" in ctx + assert "AskUserQuestion" in ctx + # Negative: must NOT contain the prior unconditional capture wording + # (which short-circuited the user-in-the-loop design) NOR the original + # conditional escape hatch (which over-deferred to agent judgment). + assert "BEFORE any code edits, you MUST capture" not in ctx + assert "If your current prompt CONTRADICTS" not in ctx + assert "If your prompt is COMPATIBLE" not in ctx + assert "ignore this and proceed normally" not in ctx + + +def _assert_silent(out: str) -> None: + """No envelope written. Tolerate fully-empty stdout or `{}`.""" + if not out.strip(): + return + parsed = json.loads(out) + assert "hookSpecificOutput" not in parsed + + +def test_silent_when_fired_false(): + """fired=False → no envelope.""" + stdin = _make_stdin(fired=False, decisions=[]) + rc, out, _ = _run_hook(stdin) + assert rc == 0 + _assert_silent(out) + + +def test_silent_when_decisions_empty(): + """fired=True but decisions=[] → no envelope (nothing to contradict).""" + stdin = _make_stdin(fired=True, decisions=[]) + rc, out, _ = _run_hook(stdin) + assert rc == 0 + _assert_silent(out) + + +def test_handles_response_as_json_string(): + """tool_response can arrive as a JSON-encoded string; reminder still fires.""" + stdin = _make_stdin( + fired=True, + decisions=[{"decision_id": "decision:xyz", "description": "Some constraint"}], + response_as_string=True, + ) + rc, out, _ = _run_hook(stdin) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "decision:xyz" in inner["additionalContext"] + + +def test_silent_when_tool_name_does_not_match(): + """Hook only fires for bicameral_preflight; other tools → silent.""" + payload = { + "tool_name": "Bash", + "tool_input": {"command": "git commit"}, + "tool_response": {"fired": True, "decisions": [{"decision_id": "x", "description": "y"}]}, + } + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + 
_assert_silent(out) + + +def test_handles_malformed_stdin(): + """Non-JSON stdin returns rc 0 with no envelope — never blocks user.""" + rc, out, _ = _run_hook("this is not JSON at all {[}") + assert rc == 0 + _assert_silent(out) + + +def test_handles_missing_tool_response(): + """Payload without tool_response → silent (no contradiction signal).""" + payload = {"tool_name": PREFLIGHT_TOOL_NAME, "tool_input": {}} + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + _assert_silent(out) + + +def test_idempotent_on_double_fire(): + """Same input twice produces identical output (no state leak).""" + stdin = _make_stdin( + fired=True, + decisions=[{"decision_id": "decision:abc", "description": "Some decision"}], + ) + rc1, out1, _ = _run_hook(stdin) + rc2, out2, _ = _run_hook(stdin) + assert rc1 == rc2 == 0 + assert out1 == out2 diff --git a/tests/test_preflight_graph_expansion.py b/tests/test_preflight_graph_expansion.py new file mode 100644 index 00000000..1e5d3fb3 --- /dev/null +++ b/tests/test_preflight_graph_expansion.py @@ -0,0 +1,434 @@ +"""Tests for the 1-hop code-graph expansion in region-anchored preflight (#173). + +Two layers: + +1. **Adapter unit** — ``RealCodeLocatorAdapter.expand_file_paths_via_graph`` + returns the union of input file paths plus 1-hop neighbor files, + bounded by ``max_neighbors_per_result``. Exercised against an + in-memory ``SymbolDB`` with hand-inserted symbols and edges so the + test doesn't depend on a real codebase index. + +2. **Handler integration** — ``_region_anchored_preflight`` in + ``handlers/preflight.py`` calls the expander, surfaces decisions + bound to expanded paths with ``confidence=0.7``, and tags + ``"graph"`` on ``sources_chained``. The structural + distance scenario: a decision is bound to ``app/src/lib/git/reorder.ts``; + the caller passes ``["app/src/ui/multi-commit-operation/reorder.tsx"]`` + (a graph neighbor); the decision still surfaces. 
+""" + +from __future__ import annotations + +import pytest + +from adapters.code_locator import RealCodeLocatorAdapter +from adapters.ledger import reset_ledger_singleton +from code_locator.config import CodeLocatorConfig +from code_locator.indexing.sqlite_store import SymbolDB, SymbolRecord +from context import BicameralContext +from handlers.bind import handle_bind +from handlers.ingest import handle_ingest +from handlers.preflight import handle_preflight + + +def _build_ingest_payload(description: str) -> dict: + """Internal-format ingest payload that produces a single ratified mapping. + + Mirrors the shape used by ``test_alpha_contract::_ingest_payload`` with + ``with_region=False`` + ``signoff=True`` so the test ingest produces an + ungrounded decision ready to bind in the next step. + """ + return { + "query": description, + "repo": "graph-expand-test-repo", + "mappings": [ + { + "intent": description, + "span": { + "source_type": "transcript", + "text": description, + "source_ref": "graph-expand-test", + "speakers": ["test@example.com"], + "meeting_date": "2026-05-04", + }, + "symbols": [], + "code_regions": [], + "signoff": { + "state": "ratified", + "signer": "test@example.com", + "ratified_at": "2026-05-04T00:00:00Z", + "session_id": None, + }, + } + ], + } + + +def _stub_adapter_with(db: SymbolDB, max_neighbors: int = 10) -> RealCodeLocatorAdapter: + """Build a RealCodeLocatorAdapter wired to a hand-built SymbolDB. + + Bypasses the ``_ensure_initialized`` index-presence check so we don't + have to point at a real codebase. Sets ``_initialized=True`` and + populates ``_db`` + ``_config`` directly — the only attributes + ``expand_file_paths_via_graph`` reads. 
+ """ + adapter = RealCodeLocatorAdapter(repo_path=".") + adapter._db = db + adapter._config = CodeLocatorConfig(max_neighbors_per_result=max_neighbors) + adapter._initialized = True + return adapter + + +def _build_synthetic_db(tmp_path) -> SymbolDB: + """Two files, one edge: ``reorder.tsx`` imports a symbol from ``reorder.ts``.""" + db = SymbolDB(str(tmp_path / "sym.db")) + db.init_db() + db.insert_symbols_batch( + [ + # symbol id 1 — git-layer (where the decision is bound) + SymbolRecord( + name="reorder", + qualified_name="reorder", + type="function", + file_path="app/src/lib/git/reorder.ts", + start_line=10, + end_line=80, + signature="export function reorder(...)", + parent_qualified_name="", + ), + # symbol id 2 — UI layer (caller's chosen file) + SymbolRecord( + name="Reorder", + qualified_name="Reorder", + type="class", + file_path="app/src/ui/multi-commit-operation/reorder.tsx", + start_line=4, + end_line=27, + signature="export class Reorder ...", + parent_qualified_name="", + ), + ] + ) + # The UI symbol invokes / imports the git-layer symbol → bidirectional edge. 
+ db.insert_edges_batch([(2, 1, "imports")]) + return db + + +# ── Adapter unit tests ────────────────────────────────────────────────── + + +def test_expander_finds_1_hop_neighbor_file(tmp_path): + """Passing the UI file alone returns it + the git-layer neighbor.""" + db = _build_synthetic_db(tmp_path) + adapter = _stub_adapter_with(db) + expanded, added = adapter.expand_file_paths_via_graph( + ["app/src/ui/multi-commit-operation/reorder.tsx"], hops=1 + ) + assert "app/src/ui/multi-commit-operation/reorder.tsx" in expanded + assert "app/src/lib/git/reorder.ts" in expanded + assert added == ["app/src/lib/git/reorder.ts"] + + +def test_expander_preserves_input_paths_when_no_neighbors(tmp_path): + """A file with indexed symbols but no edges yields no expansion.""" + db = SymbolDB(str(tmp_path / "lonely.db")) + db.init_db() + db.insert_symbols_batch( + [ + SymbolRecord( + name="standalone", + qualified_name="standalone", + type="function", + file_path="app/src/lonely.ts", + start_line=1, + end_line=10, + signature="", + parent_qualified_name="", + ) + ] + ) + adapter = _stub_adapter_with(db) + expanded, added = adapter.expand_file_paths_via_graph(["app/src/lonely.ts"], hops=1) + assert expanded == ["app/src/lonely.ts"] + assert added == [] + + +def test_expander_handles_empty_input(): + db = SymbolDB(":memory:") + db.init_db() + adapter = _stub_adapter_with(db) + expanded, added = adapter.expand_file_paths_via_graph([], hops=1) + assert expanded == [] + assert added == [] + + +def test_expander_handles_unindexed_file(tmp_path): + """A file with NO symbols in the index contributes nothing — no crash.""" + db = _build_synthetic_db(tmp_path) + adapter = _stub_adapter_with(db) + expanded, added = adapter.expand_file_paths_via_graph(["app/src/never-indexed.ts"], hops=1) + assert expanded == ["app/src/never-indexed.ts"] + assert added == [] + + +def test_expander_caps_hub_file_explosion(tmp_path): + """A hub file with many neighbors does not blow up the result set. 
+ + Per-symbol cap = ``max_neighbors_per_result``; global cap scales with + input size. With one input file and ``max_neighbors=2``, expansion + should add at most 2 paths. + """ + db = SymbolDB(str(tmp_path / "hub.db")) + db.init_db() + # 1 hub symbol + 5 neighbor symbols, each in a different file. + records = [ + SymbolRecord("hub", "hub", "function", "hub.ts", 1, 5, "", ""), + ] + for i in range(5): + records.append( + SymbolRecord( + f"neigh_{i}", + f"neigh_{i}", + "function", + f"neigh_{i}.ts", + 1, + 3, + "", + "", + ) + ) + db.insert_symbols_batch(records) + # Hub imports each of the 5 neighbors. (Use ``imports`` not ``invokes`` + # because the expander now filters to import edges only — see + # ``test_expander_filters_to_imports_only`` and #64.) + db.insert_edges_batch([(1, i + 2, "imports") for i in range(5)]) + + adapter = _stub_adapter_with(db, max_neighbors=2) + expanded, added = adapter.expand_file_paths_via_graph(["hub.ts"], hops=1) + # Per-symbol cap caps the per-symbol neighbor walk at 2, so even though 5 + # neighbors exist, expansion adds at most 2. + assert len(added) <= 2 + assert len(added) > 0, "imports-edges hub should produce some expansion" + assert "hub.ts" in expanded + + +def test_expander_filters_to_imports_only(tmp_path): + """Per #64: only ``imports`` edges expand; ``invokes`` / ``inherits`` / + ``contains`` are symbol-level edges that over-broaden the file-level + expansion. A neighbor reachable only via a non-import edge must NOT + appear in the expanded set. 
+ """ + db = SymbolDB(str(tmp_path / "edge_filter.db")) + db.init_db() + db.insert_symbols_batch( + [ + SymbolRecord("caller", "caller", "function", "caller.ts", 1, 5, "", ""), + SymbolRecord("import_target", "import_target", "function", "imp.ts", 1, 5, "", ""), + SymbolRecord("invoke_target", "invoke_target", "function", "inv.ts", 1, 5, "", ""), + SymbolRecord("inherit_target", "inherit_target", "class", "inh.ts", 1, 5, "", ""), + ] + ) + db.insert_edges_batch( + [ + (1, 2, "imports"), # caller → imp.ts (should expand) + (1, 3, "invokes"), # caller → inv.ts (should NOT expand) + (1, 4, "inherits"), # caller → inh.ts (should NOT expand) + ] + ) + adapter = _stub_adapter_with(db) + _, added = adapter.expand_file_paths_via_graph(["caller.ts"], hops=1) + assert added == ["imp.ts"], f"only imports-edged neighbors should expand; got: {added}" + + +def test_expander_falls_back_when_uninitialized(): + """If the symbol index isn't available, returns inputs unchanged.""" + adapter = RealCodeLocatorAdapter(repo_path=".") + # _initialized stays False; calling _ensure_initialized would raise + # because there's no index. The expander must catch that and fall back. + expanded, added = adapter.expand_file_paths_via_graph(["a.ts", "b.ts"], hops=1) + assert expanded == ["a.ts", "b.ts"] + assert added == [] + + +# ── Handler integration test ──────────────────────────────────────────── + + +class _FakeCodeGraph: + """Minimal code_graph wrapper for handle_preflight: overrides + ``expand_file_paths_via_graph`` with a hard-coded expansion, forwards + every other attribute to the real adapter (so ``resolve_symbols`` etc. + still work for the surrounding ingest/bind calls). Lets us prove the + handler wiring (sources_chained tag, expansion-provenance confidence) + without depending on a real symbol index in the test environment. 
+ """ + + def __init__(self, real, *, expansion_for_tsx: list[str]) -> None: + self._real = real + self._expansion = expansion_for_tsx + self.calls: list[list[str]] = [] + + def expand_file_paths_via_graph( + self, + file_paths: list[str], + hops: int = 1, + ) -> tuple[list[str], list[str]]: + self.calls.append(list(file_paths)) + added = [p for p in self._expansion if p not in file_paths] + return list(file_paths) + added, added + + def __getattr__(self, name: str): + # Forward unknown attribute access to the real adapter so other + # handlers (ingest's resolve_symbols, etc.) keep working. + return getattr(self._real, name) + + +@pytest.fixture +def integration_env(monkeypatch, tmp_path): + """In-memory ledger + git-initialized repo + repo-rooted ctx; same shape + as ``test_alpha_contract::alpha_env`` pared down to what graph-expansion + needs. Requires git init because ``ensure_ledger_synced`` walks HEAD. + """ + import subprocess + + monkeypatch.setenv("SURREAL_URL", "memory://") + repo_root = tmp_path / "graph-expand-repo" + repo_root.mkdir() + subprocess.run(["git", "init", "-q", "-b", "main"], cwd=repo_root, check=True) + subprocess.run(["git", "config", "user.email", "test@example.com"], cwd=repo_root, check=True) + subprocess.run(["git", "config", "user.name", "Test User"], cwd=repo_root, check=True) + # Seed the two files the tests bind / preflight against. handle_bind + # verifies the file exists at HEAD so we have to materialize them. 
+ git_layer = repo_root / "app" / "src" / "lib" / "git" + git_layer.mkdir(parents=True) + (git_layer / "reorder.ts").write_text( + "// stub for graph-expansion test\nexport function reorder() { return 0 }\n" + ) + ui_layer = repo_root / "app" / "src" / "ui" / "multi-commit-operation" + ui_layer.mkdir(parents=True) + (ui_layer / "reorder.tsx").write_text( + "// stub for graph-expansion test\nexport class Reorder {}\n" + ) + subprocess.run(["git", "add", "."], cwd=repo_root, check=True) + subprocess.run( + ["git", "-c", "commit.gpgsign=false", "commit", "-q", "-m", "seed"], + cwd=repo_root, + check=True, + ) + + monkeypatch.setenv("REPO_PATH", str(repo_root)) + monkeypatch.chdir(repo_root) + reset_ledger_singleton() + ctx = BicameralContext.from_env() + yield ctx + reset_ledger_singleton() + + +@pytest.mark.asyncio +async def test_preflight_surfaces_via_graph_expansion(integration_env, monkeypatch): + """Caller passes a UI-layer file; the decision is bound to a git-layer + file 1 hop away; preflight surfaces it via expansion with + ``confidence=0.7`` and tags ``sources_chained`` accordingly. + """ + import dataclasses + + monkeypatch.setenv("BICAMERAL_GUIDED_MODE", "1") + # Stub code_graph: when caller passes the UI file, expansion adds the + # git-layer file (where the bind lives). BicameralContext is a frozen + # dataclass; clone with dataclasses.replace to swap in the fake. 
+ base = BicameralContext.from_env() + ctx = dataclasses.replace( + base, + code_graph=_FakeCodeGraph( + base.code_graph, + expansion_for_tsx=["app/src/lib/git/reorder.ts"], + ), + ) + + ingest_resp = await handle_ingest( + ctx, + _build_ingest_payload("Drag-to-reorder commits via the git-layer reorder helper."), + ) + decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] + bind_resp = await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "app/src/lib/git/reorder.ts", + "symbol_name": "reorder", + "start_line": 10, + "end_line": 80, + } + ], + ) + assert bind_resp.bindings[0].error is None + + pf_resp = await handle_preflight( + ctx, + topic="refactor the reorder UI to use a text-editor flow", + file_paths=["app/src/ui/multi-commit-operation/reorder.tsx"], + ) + + # The bound decision must surface even though caller passed the UI file. + decision_ids = [d.decision_id for d in pf_resp.decisions] + assert decision_id in decision_ids, ( + f"bound decision {decision_id} must surface via 1-hop expansion; " + f"got: {decision_ids}; sources={pf_resp.sources_chained}" + ) + + # And it should be marked as expansion-provenance, not direct. + # `decisions` on PreflightResponse is BriefDecision (no confidence field); + # the confidence lives on the underlying DecisionMatch via the region + # lookup. The signal we can assert end-to-end is sources_chained. + assert "region" in pf_resp.sources_chained + assert "graph" in pf_resp.sources_chained, ( + f"expected 'graph' in sources_chained when graph " + f"expansion produced extra hits; got: {pf_resp.sources_chained}" + ) + + +@pytest.mark.asyncio +async def test_preflight_does_not_tag_expanded_when_direct_pin_alone(integration_env, monkeypatch): + """When caller passes the bound file directly, expansion may add neighbors + but the decision is reached via a direct pin — `sources_chained` should + contain `region` but NOT `graph` (the existing decision + is direct, not expanded). 
+ """ + import dataclasses + + monkeypatch.setenv("BICAMERAL_GUIDED_MODE", "1") + # Expander returns no extra paths when the caller already passed the + # bound file directly (simulates a clean discovery). + base = BicameralContext.from_env() + ctx = dataclasses.replace( + base, + code_graph=_FakeCodeGraph(base.code_graph, expansion_for_tsx=[]), + ) + + ingest_resp = await handle_ingest(ctx, _build_ingest_payload("Direct-pin baseline.")) + decision_id = ingest_resp.pending_grounding_decisions[0]["decision_id"] + await handle_bind( + ctx, + bindings=[ + { + "decision_id": decision_id, + "file_path": "app/src/lib/git/reorder.ts", + "symbol_name": "reorder", + "start_line": 10, + "end_line": 80, + } + ], + ) + + pf_resp = await handle_preflight( + ctx, + topic="edit reorder", + file_paths=["app/src/lib/git/reorder.ts"], + ) + + decision_ids = [d.decision_id for d in pf_resp.decisions] + assert decision_id in decision_ids + assert "region" in pf_resp.sources_chained + assert "graph" not in pf_resp.sources_chained, ( + f"direct pin alone must not tag 'graph'; got: {pf_resp.sources_chained}" + ) diff --git a/tests/test_preflight_hook.py b/tests/test_preflight_hook.py new file mode 100644 index 00000000..fe76eb01 --- /dev/null +++ b/tests/test_preflight_hook.py @@ -0,0 +1,122 @@ +"""Functionality tests for scripts/hooks/preflight_reminder.py. + +The hook is invoked as a subprocess by Claude Code. Tests run it the +same way to exercise stdin/stdout exactly as production does. + +Claude Code 2.x requires UserPromptSubmit hook output shaped as +``{"hookSpecificOutput": {"hookEventName": "UserPromptSubmit", +"additionalContext": "..."}}``. The legacy top-level +``{"additionalContext": ...}`` shape is silently dropped by the CLI, +so these tests assert against the nested shape — anything else is a +broken contract regardless of whether the hook process exits cleanly. 
+""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +HOOK_SCRIPT = REPO_ROOT / "scripts" / "hooks" / "preflight_reminder.py" + + +def _run_hook(stdin_text: str) -> tuple[int, str, str]: + """Invoke the hook with stdin_text on stdin; return (rc, stdout, stderr).""" + proc = subprocess.run( + [sys.executable, str(HOOK_SCRIPT)], + input=stdin_text, + capture_output=True, + text=True, + timeout=10, + ) + return proc.returncode, proc.stdout, proc.stderr + + +def _hook_output(parsed: dict) -> dict: + """Extract the hookSpecificOutput payload, asserting the envelope shape.""" + assert "hookSpecificOutput" in parsed, ( + f"hook must emit hookSpecificOutput envelope (Claude Code 2.x contract); got {parsed!r}" + ) + inner = parsed["hookSpecificOutput"] + assert inner.get("hookEventName") == "UserPromptSubmit" + return inner + + +def test_emits_additional_context_on_match(): + """Fire-worthy prompt produces additionalContext containing the directive.""" + payload = {"prompt": "Please refactor the rate limiter to sliding window."} + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "additionalContext" in inner + assert "<system-reminder>" in inner["additionalContext"] + assert "bicameral.preflight" in inner["additionalContext"] + + +def test_emits_empty_on_no_match(): + """Skip-worthy prompt produces empty response (no hookSpecificOutput).""" + payload = {"prompt": "fix the typo in README"} + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + parsed = json.loads(out) if out.strip() else {} + assert "hookSpecificOutput" not in parsed + + +def test_handles_malformed_stdin(): + """Non-JSON stdin returns rc 0 with empty/no response — never blocks user.""" + rc, out, _ = _run_hook("this is not JSON at all {[}") + assert rc == 0 + if out.strip(): + parsed = json.loads(out) + assert 
"hookSpecificOutput" not in parsed + + +def test_idempotent_on_double_fire(): + """Same prompt twice produces identical output (no state leak).""" + payload = {"prompt": "implement the OAuth callback for Google Calendar"} + rc1, out1, _ = _run_hook(json.dumps(payload)) + rc2, out2, _ = _run_hook(json.dumps(payload)) + assert rc1 == rc2 == 0 + assert out1 == out2 + + +def test_handles_natural_contradiction_prompt(): + """The literal Flow 2 prompt fires the hook (issue #146 acceptance).""" + payload = { + "prompt": ( + "I know the roadmap said drag-and-drop to reorder commits, " + "but actually we're switching to a text-editor approach. " + "Please update cherry-pick.ts and reorder.ts." + ) + } + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + inner = _hook_output(json.loads(out)) + assert "additionalContext" in inner + assert "bicameral.preflight" in inner["additionalContext"] + + +def test_reminder_gates_writes_not_discovery(): + """The reminder must allow Read/Grep/Glob discovery before preflight, + and gate preflight against WRITE ops only. An earlier shape ("call + preflight before any file-inspection tool") short-circuited the + caller-LLM discovery the rest of the contract depends on (the agent + needs to map "the X feature" → concrete file paths via Read/Grep/Glob + before calling preflight). Lock the new posture in so future edits + don't quietly regress it. + """ + payload = {"prompt": "refactor the reorder feature to a text-editor flow"} + rc, out, _ = _run_hook(json.dumps(payload)) + assert rc == 0 + ctx = _hook_output(json.loads(out))["additionalContext"] + # Affirmative: discovery comes first, write op is the gate. + assert "Read-only discovery FIRST" in ctx + assert "BEFORE any write op" in ctx + assert "Edit, Write" in ctx + # The reminder should explicitly tell the agent to populate file_paths. + assert "file_paths" in ctx + # Negative: must NOT forbid file-inspection tools (the old shape). 
+ assert "before any file-inspection tool" not in ctx + assert "Before invoking any file-inspection tool" not in ctx diff --git a/tests/test_preflight_intent.py b/tests/test_preflight_intent.py new file mode 100644 index 00000000..4cbc4443 --- /dev/null +++ b/tests/test_preflight_intent.py @@ -0,0 +1,70 @@ +"""Functionality tests for scripts.hooks.preflight_intent.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT)) + +from scripts.hooks.preflight_intent import ( # noqa: E402 + IMPLEMENTATION_VERBS, + INDIRECT_INTENT_PHRASES, + SKIP_PATTERNS, + should_fire_preflight, +) + + +def test_fires_on_implementation_verbs(): + """Every canonical verb in a natural sentence must fire the classifier.""" + for verb in IMPLEMENTATION_VERBS: + prompt = f"Please {verb} the rate limiter for me." + assert should_fire_preflight(prompt), f"verb {verb!r} did not fire" + + +def test_skips_on_doc_only_prompts(): + """Skip patterns must suppress the fire even when verbs are present.""" + skip_prompts = ( + "fix the typo in the README", + "bump lodash to 4.17.21", + "how does the rate limiter work?", + ) + for prompt in skip_prompts: + assert not should_fire_preflight(prompt), f"skip-prompt {prompt!r} fired" + + +def test_fires_on_indirect_intent(): + """Asking HOW to implement is intent — must fire.""" + indirect = ( + "how should I implement the retry logic?", + "how do I build the payment flow?", + "what's the best way to add idempotency keys?", + ) + for prompt in indirect: + assert should_fire_preflight(prompt), f"indirect prompt {prompt!r} did not fire" + + +def test_data_is_loadable(): + """The shared verb list must be importable, non-empty, and well-typed.""" + assert isinstance(IMPLEMENTATION_VERBS, frozenset) + assert len(IMPLEMENTATION_VERBS) >= 28 + assert all(isinstance(v, str) and v for v in IMPLEMENTATION_VERBS) + assert isinstance(INDIRECT_INTENT_PHRASES, 
tuple) + assert all(isinstance(p, str) and p for p in INDIRECT_INTENT_PHRASES) + assert isinstance(SKIP_PATTERNS, tuple) + + +def test_natural_contradiction_prompt(): + """The literal Flow 2 prompt from issue #146 must fire.""" + prompt = ( + "I know the roadmap said drag-and-drop to reorder commits, " + "but actually we're switching to a text-editor approach. " + "Please update cherry-pick.ts and reorder.ts." + ) + assert should_fire_preflight(prompt) + + +def test_empty_prompt_does_not_fire(): + assert not should_fire_preflight("") + assert not should_fire_preflight(" \n\t") diff --git a/tests/test_project_decision_status.py b/tests/test_project_decision_status.py index 00674f97..aaac34d0 100644 --- a/tests/test_project_decision_status.py +++ b/tests/test_project_decision_status.py @@ -9,6 +9,7 @@ Closes the gap v0.6.1's session-start banner infra couldn't close on its own. """ + from __future__ import annotations import pytest @@ -33,6 +34,7 @@ async def _seed_decision(client: LedgerClient, description: str = "test decision # canonical_id has a UNIQUE index — derive a stable unique value from the # description so multiple decisions in one test don't collide. 
import hashlib + canonical = hashlib.sha256(description.encode()).hexdigest()[:16] rows = await client.query( "CREATE decision SET description = $d, canonical_id = $c, source_type = 'manual'", diff --git a/tests/test_reset.py b/tests/test_reset.py index dbd607b8..0bef3663 100644 --- a/tests/test_reset.py +++ b/tests/test_reset.py @@ -17,7 +17,6 @@ from context import BicameralContext from handlers.reset import handle_reset - # ── Helpers ───────────────────────────────────────────────────────── @@ -53,7 +52,10 @@ def _payload_for(repo: str, source_type: str, source_ref: str) -> dict: async def _seed_repo_with_cursors( - ledger, repo: str, count: int = 3, source_type: str = "slack", + ledger, + repo: str, + count: int = 3, + source_type: str = "slack", ) -> None: """Seed N source_cursor rows for a repo by upserting them directly.""" for i in range(count): @@ -72,6 +74,7 @@ def _ctx(repo_path: str = "test-repo") -> BicameralContext: are left as whatever from_env builds — reset doesn't use them. """ import os + os.environ["REPO_PATH"] = repo_path return BicameralContext.from_env() @@ -139,9 +142,7 @@ async def test_reset_confirm_actually_wipes(monkeypatch, surreal_url): for d in post_decisions: # description-based check — the seeded decisions had distinctive # 'decision from msg_N' descriptions - assert "decision from msg_" not in d.get("description", ""), ( - f"wipe missed an intent: {d}" - ) + assert "decision from msg_" not in d.get("description", ""), f"wipe missed an intent: {d}" reset_ledger_singleton() diff --git a/tests/test_resolve_compliance.py b/tests/test_resolve_compliance.py index 5583cb48..21758dcb 100644 --- a/tests/test_resolve_compliance.py +++ b/tests/test_resolve_compliance.py @@ -11,6 +11,7 @@ link_commit + resolve flow on a tmp git repo. 
- not_relevant verdict prunes the binds_to edge + audit row kept """ + from __future__ import annotations import subprocess @@ -72,8 +73,7 @@ async def _seed_region( symbol: str = "do_thing", ) -> str: rows = await client.query( - "CREATE code_region SET file_path = $f, symbol_name = $s, " - "start_line = 1, end_line = 10", + "CREATE code_region SET file_path = $f, symbol_name = $s, start_line = 1, end_line = 10", {"f": file_path, "s": symbol}, ) return str(rows[0]["id"]) @@ -100,7 +100,9 @@ async def test_resolve_compliance_writes_compliance_check_row(): ) resp = await handle_resolve_compliance( - ctx, phase="ingest", verdicts=[verdict], + ctx, + phase="ingest", + verdicts=[verdict], ) assert resp.phase == "ingest" @@ -246,7 +248,10 @@ async def test_resolve_compliance_mixed_batch_partitions_correctly(): ) resp = await handle_resolve_compliance( - ctx, phase="drift", verdicts=[good, bad], commit_hash="abc123", + ctx, + phase="drift", + verdicts=[good, bad], + commit_hash="abc123", ) assert len(resp.accepted) == 1 @@ -272,9 +277,7 @@ async def test_resolve_compliance_accepts_all_phase_values(): decision_id = await _seed_decision(client) region_id = await _seed_region(client) - for i, phase in enumerate( - ("ingest", "drift", "regrounding", "supersession", "divergence") - ): + for i, phase in enumerate(("ingest", "drift", "regrounding", "supersession", "divergence")): v = ComplianceVerdict( decision_id=decision_id, region_id=region_id, @@ -296,7 +299,9 @@ async def test_resolve_compliance_rejects_unknown_phase(): try: with pytest.raises(ValueError, match="Unknown phase"): await handle_resolve_compliance( - ctx, phase="speculation", verdicts=[], + ctx, + phase="speculation", + verdicts=[], ) finally: await _client.close() @@ -322,7 +327,9 @@ async def test_resolve_compliance_accepts_dict_verdicts(): "explanation": "from JSON", } resp = await handle_resolve_compliance( - ctx, phase="ingest", verdicts=[verdict_dict], + ctx, + phase="ingest", + verdicts=[verdict_dict], 
) assert len(resp.accepted) == 1 finally: @@ -354,7 +361,9 @@ async def test_not_relevant_verdict_prunes_binds_to_edge(): explanation="this region is unrelated", ) resp = await handle_resolve_compliance( - ctx, phase="ingest", verdicts=[verdict], + ctx, + phase="ingest", + verdicts=[verdict], ) assert len(resp.accepted) == 1 @@ -376,7 +385,11 @@ async def test_not_relevant_verdict_prunes_binds_to_edge(): def _git(cwd: Path, *args: str) -> str: result = subprocess.run( - ["git", *args], cwd=cwd, capture_output=True, text=True, check=True, + ["git", *args], + cwd=cwd, + capture_output=True, + text=True, + check=True, ) return result.stdout.strip() @@ -386,12 +399,14 @@ def _seed_repo(root: Path) -> None: _git(root, "init", "-q", "-b", "main") _git(root, "config", "user.email", "test@example.com") _git(root, "config", "user.name", "Test") - (root / "pricing.py").write_text(dedent(""" + (root / "pricing.py").write_text( + dedent(""" def calculate_discount(order_total): if order_total >= 100: return order_total * 0.10 return 0 - """).lstrip("\n")) + """).lstrip("\n") + ) _git(root, "add", "pricing.py") _git(root, "-c", "commit.gpgsign=false", "commit", "-q", "-m", "seed") @@ -433,13 +448,15 @@ async def test_e2e_pending_to_reflected_via_resolve(_repo_ctx): }, "intent": "Apply 10% discount on orders of $100 or more", "symbols": ["calculate_discount"], - "code_regions": [{ - "file_path": "pricing.py", - "symbol": "calculate_discount", - "type": "function", - "start_line": 1, - "end_line": 4, - }], + "code_regions": [ + { + "file_path": "pricing.py", + "symbol": "calculate_discount", + "type": "function", + "start_line": 1, + "end_line": 4, + } + ], # Ratified signoff required for drift detection to run (v0.7+) "signoff": { "state": "ratified", @@ -454,9 +471,7 @@ async def test_e2e_pending_to_reflected_via_resolve(_repo_ctx): assert ingest_resp.sync_status is not None, "ingest should populate sync_status" pending = ingest_resp.sync_status.pending_compliance_checks - 
assert len(pending) == 1, ( - f"Expected one pending check from drift sweep, got {len(pending)}" - ) + assert len(pending) == 1, f"Expected one pending check from drift sweep, got {len(pending)}" p = pending[0] assert p.decision_description == "Apply 10% discount on orders of $100 or more" @@ -512,13 +527,15 @@ async def test_e2e_noncompliant_verdict_yields_drifted(_repo_ctx): }, "intent": "Apply 50% discount on orders of $100 or more", "symbols": ["calculate_discount"], - "code_regions": [{ - "file_path": "pricing.py", - "symbol": "calculate_discount", - "type": "function", - "start_line": 1, - "end_line": 4, - }], + "code_regions": [ + { + "file_path": "pricing.py", + "symbol": "calculate_discount", + "type": "function", + "start_line": 1, + "end_line": 4, + } + ], # Ratified signoff required for drift detection to run (v0.7+) "signoff": { "state": "ratified", @@ -553,7 +570,10 @@ async def test_e2e_noncompliant_verdict_yields_drifted(_repo_ctx): assert len(drifted) == 1 inner = getattr(ledger, "_inner", ledger) cached = await get_compliance_verdict( - inner._client, p.decision_id, p.region_id, p.content_hash, + inner._client, + p.decision_id, + p.region_id, + p.content_hash, ) assert cached is not None assert cached["verdict"] == "drifted" diff --git a/tests/test_schema_persistence.py b/tests/test_schema_persistence.py index ec0fc854..ce81c6b4 100644 --- a/tests/test_schema_persistence.py +++ b/tests/test_schema_persistence.py @@ -80,6 +80,7 @@ async def test_destructive_migration_blocked(tmp_path): allow_destructive=False is safe when there are no destructive steps. 
""" from ledger.schema import DESTRUCTIVE_MIGRATIONS + url = f"surrealkv://{tmp_path / 'ledger.db'}" client = LedgerClient(url=url, ns="bicameral", db="ledger") await client.connect() diff --git a/tests/test_session_end_hook_drift.py b/tests/test_session_end_hook_drift.py new file mode 100644 index 00000000..a850e1fb --- /dev/null +++ b/tests/test_session_end_hook_drift.py @@ -0,0 +1,87 @@ +"""Functionality tests for SessionEnd hook drift fix per +plan-147-flow4-ledger-validation.md Phase 2. + +Verifies the canonical hook command shape lands in: + - .claude/settings.json (the deployed hook) + - setup_wizard._BICAMERAL_SESSION_END_COMMAND (the source of truth for + fresh installs) + +The canonical command per skills/bicameral-capture-corrections/SKILL.md:207: + + [ -d .bicameral ] && [ -z "$BICAMERAL_SESSION_END_RUNNING" ] && \ + BICAMERAL_SESSION_END_RUNNING=1 \ + claude -p '/bicameral:capture-corrections --auto-ingest' || true +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT)) + + +CANONICAL_COMMAND = ( + '[ -d .bicameral ] && [ -z "$BICAMERAL_SESSION_END_RUNNING" ] && ' + "BICAMERAL_SESSION_END_RUNNING=1 " + "claude -p '/bicameral:capture-corrections --auto-ingest' || true" +) + + +def _extract_session_end_command() -> str: + """Parse .claude/settings.json and return the SessionEnd hook command string.""" + settings = json.loads((REPO_ROOT / ".claude" / "settings.json").read_text(encoding="utf-8")) + session_end = settings["hooks"]["SessionEnd"] + return session_end[0]["hooks"][0]["command"] + + +def test_settings_json_session_end_has_reentrancy_guard(): + """Behavior: deployed SessionEnd hook short-circuits when env var is set.""" + cmd = _extract_session_end_command() + assert '[ -z "$BICAMERAL_SESSION_END_RUNNING" ]' in cmd + assert "BICAMERAL_SESSION_END_RUNNING=1" in cmd + + +def 
test_settings_json_session_end_passes_auto_ingest_flag(): + """Behavior: deployed SessionEnd hook invokes capture-corrections in batch (auto-ingest) mode.""" + cmd = _extract_session_end_command() + assert "--auto-ingest" in cmd + + +def test_setup_wizard_renders_canonical_session_end_hook(): + """Behavior: setup_wizard's source-of-truth constant matches the + canonical command verbatim. Drift between this constant and the + SKILL.md prescription is the failure mode this test exists to catch.""" + import setup_wizard + + assert setup_wizard._BICAMERAL_SESSION_END_COMMAND == CANONICAL_COMMAND + + +def test_build_session_end_command_no_args_matches_canonical(): + """Behavior: the parameterized helper, when called with no args, + produces the same string as the no-args constant — i.e. end-user + installs are unchanged by the helper's existence.""" + import setup_wizard + + assert setup_wizard._build_session_end_command() == CANONICAL_COMMAND + + +def test_build_session_end_command_with_mcp_config_inserts_flags(): + """Behavior: passing ``mcp_config_path`` inserts ``--mcp-config <path>`` + + ``--strict-mcp-config`` after the prompt, before the ``|| true`` + fallback. This is the test-harness path: spawned subprocess writes + to the harness's test ledger instead of the user's default + (~/.bicameral/ledger.db).""" + import setup_wizard + + cmd = setup_wizard._build_session_end_command(mcp_config_path="/tmp/x/mcp.json") + assert "--mcp-config /tmp/x/mcp.json" in cmd + assert "--strict-mcp-config" in cmd + # Re-entrancy guard and --auto-ingest preserved. + assert '[ -z "$BICAMERAL_SESSION_END_RUNNING" ]' in cmd + assert "--auto-ingest" in cmd + # Path with shell metachar still safe (shlex.quote applied). 
+ cmd2 = setup_wizard._build_session_end_command(mcp_config_path="/tmp/with space/mcp.json") + assert "'/tmp/with space/mcp.json'" in cmd2 diff --git a/tests/test_sync_middleware.py b/tests/test_sync_middleware.py index 111cd614..410bf87c 100644 --- a/tests/test_sync_middleware.py +++ b/tests/test_sync_middleware.py @@ -1,7 +1,8 @@ """Tests for sync_middleware — session-start banner and ledger catch-up (v0.6.1).""" + from __future__ import annotations -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta, timezone from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch @@ -47,9 +48,13 @@ def _ungrounded(decision_id="decision:2", description="Billing uses Stripe", sou } -def _proposal(decision_id="decision:3", description="Rate limit is 100 req/s", - source_ref="sprint-notes", days_old=15): - created_at = (datetime.now(timezone.utc) - timedelta(days=days_old)).isoformat() +def _proposal( + decision_id="decision:3", + description="Rate limit is 100 req/s", + source_ref="sprint-notes", + days_old=15, +): + created_at = (datetime.now(UTC) - timedelta(days=days_old)).isoformat() return { "decision_id": decision_id, "description": description, @@ -99,25 +104,28 @@ async def test_banner_includes_ungrounded_decisions(): async def test_banner_queries_both_drifted_and_ungrounded_statuses(): ctx = _make_ctx(open_rows=[_drifted()]) await get_session_start_banner(ctx) - ctx.ledger.get_decisions_by_status.assert_called_once_with(["drifted", "ungrounded", "context_pending"]) + ctx.ledger.get_decisions_by_status.assert_called_once_with( + ["drifted", "ungrounded", "context_pending"] + ) @pytest.mark.asyncio async def test_banner_truncates_at_10_items_with_drifted_prioritized(): # 12 open items: 3 drifted + 9 ungrounded. Truncated view should keep # all 3 drifted first, then fill with ungrounded up to the 10-item cap. 
- rows = [_drifted(decision_id=f"decision:d{i}") for i in range(3)] + \ - [_ungrounded(decision_id=f"decision:u{i}") for i in range(9)] + rows = [_drifted(decision_id=f"decision:d{i}") for i in range(3)] + [ + _ungrounded(decision_id=f"decision:u{i}") for i in range(9) + ] ctx = _make_ctx(open_rows=rows) banner = await get_session_start_banner(ctx) assert banner is not None - assert banner.drifted_count == 3 # full count, not truncated + assert banner.drifted_count == 3 # full count, not truncated assert banner.ungrounded_count == 9 - assert len(banner.items) == 10 # list is capped + assert len(banner.items) == 10 # list is capped assert banner.truncated is True # All 3 drifted must be present in the truncated view assert sum(1 for i in banner.items if i["status"] == "drifted") == 3 - assert f"top 10" in banner.message + assert "top 10" in banner.message @pytest.mark.asyncio @@ -232,6 +240,7 @@ def _reset_locks(): """Drop the per-repo lock registry before and after each test so lock identity is deterministic across tests in the same process.""" from handlers.sync_middleware import _reset_repo_locks_for_tests + _reset_repo_locks_for_tests() yield _reset_repo_locks_for_tests() @@ -251,6 +260,7 @@ async def test_repo_write_barrier_serializes_same_repo(_reset_locks): bind call cannot observe the ledger while the first is mid-write. 
""" import asyncio + from handlers.sync_middleware import repo_write_barrier events: list[str] = [] @@ -272,6 +282,7 @@ async def task(name: str, hold_ms: int): async def test_repo_write_barrier_allows_different_repos_concurrently(_reset_locks): """Different repos use different locks and MUST run in parallel.""" import asyncio + from handlers.sync_middleware import repo_write_barrier events: list[str] = [] @@ -295,6 +306,7 @@ async def task(name: str, repo: str): async def test_repo_write_barrier_releases_on_exception(_reset_locks): """If the body raises, the lock must still release so the next caller proceeds.""" import asyncio + from handlers.sync_middleware import repo_write_barrier ctx = _barrier_ctx("/repo/a") @@ -315,6 +327,7 @@ async def reacquire(): async def test_repo_write_barrier_falls_back_when_repo_path_missing(_reset_locks): """Missing ctx.repo_path falls back to a default key and still serializes.""" import asyncio + from handlers.sync_middleware import repo_write_barrier class _Bare: @@ -343,6 +356,7 @@ async def task(name: str): async def test_repo_write_barrier_reports_held_ms(_reset_locks): """BarrierTiming.held_ms is populated on exit and is non-negative.""" import asyncio + from handlers.sync_middleware import repo_write_barrier ctx = _barrier_ctx("/repo/a") diff --git a/tests/test_team_event_replay.py b/tests/test_team_event_replay.py index 78647190..ca8dbe78 100644 --- a/tests/test_team_event_replay.py +++ b/tests/test_team_event_replay.py @@ -92,9 +92,7 @@ async def test_ratify_event_roundtrip(tmp_path: Path) -> None: } await team_a.apply_ratify(decision_id_a, signoff) - rows = await inner_a._client.query( - f"SELECT signoff FROM {decision_id_a} LIMIT 1" - ) + rows = await inner_a._client.query(f"SELECT signoff FROM {decision_id_a} LIMIT 1") assert rows and rows[0]["signoff"]["state"] == "ratified" # Fresh adapter, same JSONL log, fresh watermark — replay from 0. 
@@ -104,13 +102,10 @@ async def test_ratify_event_roundtrip(tmp_path: Path) -> None: decision_id_b = await find_decision_by_canonical_id(inner_b._client, canonical) assert decision_id_b, "ingest event did not replay (no row for canonical_id)" - rows_b = await inner_b._client.query( - f"SELECT signoff FROM {decision_id_b} LIMIT 1" - ) + rows_b = await inner_b._client.query(f"SELECT signoff FROM {decision_id_b} LIMIT 1") replayed_signoff = rows_b[0].get("signoff") or {} assert replayed_signoff.get("state") == "ratified", ( - "decision_ratified.completed event did not replay; " - f"got signoff={replayed_signoff!r}" + f"decision_ratified.completed event did not replay; got signoff={replayed_signoff!r}" ) @@ -154,8 +149,7 @@ async def test_supersede_event_roundtrip(tmp_path: Path) -> None: rows_b = await inner_b._client.query(f"SELECT signoff FROM {old_id_b} LIMIT 1") replayed = rows_b[0].get("signoff") or {} assert replayed.get("state") == "superseded", ( - "decision_superseded.completed event did not replay; " - f"got signoff={replayed!r}" + f"decision_superseded.completed event did not replay; got signoff={replayed!r}" ) assert replayed.get("superseded_by") == new_id_b @@ -188,8 +182,6 @@ async def test_ingest_event_roundtrip_regression(tmp_path: Path) -> None: decision_id_b = await find_decision_by_canonical_id(inner_b._client, canonical) assert decision_id_b, "ingest.completed regression — canonical lookup failed" - rows = await inner_b._client.query( - f"SELECT description FROM {decision_id_b} LIMIT 1" - ) + rows = await inner_b._client.query(f"SELECT description FROM {decision_id_b} LIMIT 1") assert rows, "ingest.completed regression — decision row missing after replay" assert "regression-intent" in str(rows[0].get("description", "")) diff --git a/tests/test_usage_summary.py b/tests/test_usage_summary.py index 50068abf..783a6d5b 100644 --- a/tests/test_usage_summary.py +++ b/tests/test_usage_summary.py @@ -11,7 +11,9 @@ from handlers.usage_summary import 
handle_usage_summary -def _ctx_with_decisions(rows: list[dict] | None = None, cc_rows: list[dict] | None = None) -> SimpleNamespace: +def _ctx_with_decisions( + rows: list[dict] | None = None, cc_rows: list[dict] | None = None +) -> SimpleNamespace: """Build a fake ctx whose ledger.client.query returns staged rows.""" client = MagicMock() call_count = {"i": 0} @@ -102,7 +104,9 @@ async def test_tool_call_counts_from_local_counters( monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("USERPROFILE", str(tmp_path)) import importlib + import local_counters + importlib.reload(local_counters) for _ in range(3): local_counters.increment("bicameral-ingest") diff --git a/tests/test_v0410_guided_mode.py b/tests/test_v0410_guided_mode.py index a9da9dee..8b218f64 100644 --- a/tests/test_v0410_guided_mode.py +++ b/tests/test_v0410_guided_mode.py @@ -36,11 +36,10 @@ SearchDecisionsResponse, ) from handlers.action_hints import ( - generate_hints_from_findings, generate_hints_for_search, + generate_hints_from_findings, ) - # ── Helper factories ──────────────────────────────────────────────── @@ -119,9 +118,11 @@ def test_search_empty_matches_no_hints_in_either_mode(): def test_search_drifted_match_fires_in_normal_mode_as_advisory(): """v0.4.10: hints fire even in normal mode, just non-blocking.""" - response = _search_response([ - _match(intent_id="decision:1", status="drifted", file_path="src/a.ts"), - ]) + response = _search_response( + [ + _match(intent_id="decision:1", status="drifted", file_path="src/a.ts"), + ] + ) hints = generate_hints_for_search(response, guided_mode=False) assert len(hints) == 1 h = hints[0] @@ -134,10 +135,12 @@ def test_search_drifted_match_fires_in_normal_mode_as_advisory(): def test_search_drifted_match_fires_in_guided_mode_as_blocking(): - response = _search_response([ - _match(intent_id="decision:1", status="drifted", file_path="src/a.ts"), - _match(intent_id="decision:2", status="drifted", file_path="src/b.ts"), - ]) + response = 
_search_response( + [ + _match(intent_id="decision:1", status="drifted", file_path="src/a.ts"), + _match(intent_id="decision:2", status="drifted", file_path="src/b.ts"), + ] + ) hints = generate_hints_for_search(response, guided_mode=True) review = [h for h in hints if h.kind == "review_drift"] assert len(review) == 1 @@ -153,9 +156,11 @@ def test_search_drifted_match_fires_in_guided_mode_as_blocking(): def test_search_ungrounded_fires_in_both_modes(): - response = _search_response([ - _match(intent_id="decision:1", status="ungrounded"), - ]) + response = _search_response( + [ + _match(intent_id="decision:1", status="ungrounded"), + ] + ) response.matches[0].code_regions = [] advisory = generate_hints_for_search(response, guided_mode=False) @@ -180,11 +185,13 @@ def test_search_message_tone_differs_between_modes(): def test_search_fires_both_review_and_ground_when_mixed(): - response = _search_response([ - _match(intent_id="decision:1", status="drifted"), - _match(intent_id="decision:2", status="ungrounded"), - _match(intent_id="decision:3", status="reflected"), - ]) + response = _search_response( + [ + _match(intent_id="decision:1", status="drifted"), + _match(intent_id="decision:2", status="ungrounded"), + _match(intent_id="decision:3", status="reflected"), + ] + ) for guided in (False, True): hints = generate_hints_for_search(response, guided_mode=guided) kinds = {h.kind for h in hints} @@ -271,11 +278,14 @@ def test_findings_open_question_gap_fires_in_both_modes(): def test_findings_fires_all_three_kinds_when_everything_present(): drift = [_brief_decision(intent_id="a", status="drifted")] - divergences = [BriefDivergence( - symbol="X", file_path="src/x.ts", - conflicting_decisions=[_brief_decision(), _brief_decision()], - summary="conflict", - )] + divergences = [ + BriefDivergence( + symbol="X", + file_path="src/x.ts", + conflicting_decisions=[_brief_decision(), _brief_decision()], + summary="conflict", + ) + ] gaps = [BriefGap(description="open q", 
hint="open-question phrasing")] for guided in (False, True): hints = generate_hints_from_findings(divergences, drift, gaps, guided_mode=guided) @@ -296,22 +306,26 @@ def test_action_hints_default_to_empty_list(): # ── Context flag parsing ──────────────────────────────────────────── -@pytest.mark.parametrize("env_val,expected", [ - ("1", True), - ("true", True), - ("True", True), - ("TRUE", True), - ("yes", True), - ("on", True), - ("0", False), - ("false", False), - ("no", False), - ("off", False), - ("maybe", False), # unrecognized → falls through to config file → false -]) +@pytest.mark.parametrize( + "env_val,expected", + [ + ("1", True), + ("true", True), + ("True", True), + ("TRUE", True), + ("yes", True), + ("on", True), + ("0", False), + ("false", False), + ("no", False), + ("off", False), + ("maybe", False), # unrecognized → falls through to config file → false + ], +) def test_guided_mode_env_truthy_set(env_val: str, expected: bool): """Truthy/falsy env values map correctly via the helper sets.""" - from context import _GUIDED_MODE_TRUTHY, _GUIDED_MODE_FALSY + from context import _GUIDED_MODE_FALSY, _GUIDED_MODE_TRUTHY + is_truthy = env_val.strip().lower() in _GUIDED_MODE_TRUTHY if expected: assert is_truthy @@ -324,6 +338,7 @@ def test_guided_mode_env_truthy_set(env_val: str, expected: bool): def test_read_guided_mode_falls_back_to_false_when_no_config(tmp_path, monkeypatch): monkeypatch.delenv("BICAMERAL_GUIDED_MODE", raising=False) from context import _read_guided_mode + assert _read_guided_mode(str(tmp_path)) is False @@ -333,6 +348,7 @@ def test_read_guided_mode_reads_config_yaml_true(tmp_path, monkeypatch): cfg_dir.mkdir() (cfg_dir / "config.yaml").write_text("mode: solo\nguided: true\n") from context import _read_guided_mode + assert _read_guided_mode(str(tmp_path)) is True @@ -342,6 +358,7 @@ def test_read_guided_mode_reads_config_yaml_false(tmp_path, monkeypatch): cfg_dir.mkdir() (cfg_dir / "config.yaml").write_text("mode: solo\nguided: false\n") 
from context import _read_guided_mode + assert _read_guided_mode(str(tmp_path)) is False @@ -352,6 +369,7 @@ def test_env_var_overrides_config_file(tmp_path, monkeypatch): (cfg_dir / "config.yaml").write_text("mode: solo\nguided: false\n") monkeypatch.setenv("BICAMERAL_GUIDED_MODE", "1") from context import _read_guided_mode + assert _read_guided_mode(str(tmp_path)) is True @@ -362,4 +380,5 @@ def test_env_var_can_force_off_against_config_file(tmp_path, monkeypatch): (cfg_dir / "config.yaml").write_text("mode: solo\nguided: true\n") monkeypatch.setenv("BICAMERAL_GUIDED_MODE", "0") from context import _read_guided_mode + assert _read_guided_mode(str(tmp_path)) is False diff --git a/tests/test_v0411_latent_drift.py b/tests/test_v0411_latent_drift.py index 04a8ae3a..0836c3f0 100644 --- a/tests/test_v0411_latent_drift.py +++ b/tests/test_v0411_latent_drift.py @@ -30,7 +30,6 @@ from handlers.link_commit import handle_link_commit from ledger.status import get_changed_files, get_changed_files_in_range - # ── Helpers ───────────────────────────────────────────────────────── @@ -49,18 +48,24 @@ def _seed_repo(repo_root: Path) -> str: _git(repo_root, "init", "-q", "-b", "main") _git(repo_root, "config", "user.email", "t@e.com") _git(repo_root, "config", "user.name", "t") - (repo_root / "pricing.py").write_text(dedent(""" + (repo_root / "pricing.py").write_text( + dedent(""" def calculate_discount(order_total): if order_total >= 100: return order_total * 0.10 return 0 - """).strip() + "\n") - (repo_root / "auth.py").write_text(dedent(""" + """).strip() + + "\n" + ) + (repo_root / "auth.py").write_text( + dedent(""" def validate_token(token): if not token: return False return len(token) > 10 - """).strip() + "\n") + """).strip() + + "\n" + ) _git(repo_root, "add", ".") _git(repo_root, "-c", "commit.gpgsign=false", "commit", "-q", "-m", "seed") return _git(repo_root, "rev-parse", "HEAD") @@ -167,12 +172,14 @@ async def 
test_second_sync_after_gap_uses_range_diff(_isolated_ledger): # Two commits, two different files sha2 = _commit_edit( - repo_root, "pricing.py", + repo_root, + "pricing.py", "def calculate_discount(t):\n return t * 0.5", "rewrite pricing", ) sha3 = _commit_edit( - repo_root, "auth.py", + repo_root, + "auth.py", "def validate_token(t):\n return False", "rewrite auth", ) @@ -186,9 +193,7 @@ async def test_second_sync_after_gap_uses_range_diff(_isolated_ledger): ctx2 = _ctx() r2 = await handle_link_commit(ctx2, "HEAD") - assert r2.sweep_scope == "range_diff", ( - f"Expected range_diff after gap, got {r2.sweep_scope}" - ) + assert r2.sweep_scope == "range_diff", f"Expected range_diff after gap, got {r2.sweep_scope}" assert r2.range_size >= 2, ( f"Expected range sweep to cover both pricing.py + auth.py " f"(range_size>=2), got range_size={r2.range_size}" @@ -216,7 +221,8 @@ async def test_pre_v0411_head_only_would_have_missed_intermediate_drift( # Drift commit _commit_edit( - repo_root, "pricing.py", + repo_root, + "pricing.py", "def calculate_discount(t):\n return t * 999", # nonsense "drift pricing", ) @@ -260,7 +266,8 @@ async def test_sync_to_same_sha_fast_paths_with_head_only_scope(_isolated_ledger @pytest.mark.phase2 @pytest.mark.asyncio async def test_unreachable_base_sha_falls_back_to_head_only( - _isolated_ledger, monkeypatch, + _isolated_ledger, + monkeypatch, ): """If ``last_synced_commit`` is unreachable (force-push, shallow clone), the range diff returns None and we fall back to head-only. @@ -274,6 +281,7 @@ async def test_unreachable_base_sha_falls_back_to_head_only( # Inject a bogus cursor by patching get_sync_state to return a # SHA that doesn't exist in the repo. 
from ledger import adapter as adapter_mod + bogus = "deadbeef" + "0" * 32 real_get_sync_state = adapter_mod.get_sync_state @@ -297,12 +305,15 @@ async def _bogus_get_sync_state(client, repo_path): def test_link_commit_response_contract_has_new_fields(): """LinkCommitResponse v0.4.11 contract has sweep_scope + range_size.""" from contracts import LinkCommitResponse + fields = LinkCommitResponse.model_fields assert "sweep_scope" in fields assert "range_size" in fields # Defaults: head_only / 0 — backward compat for callers that don't set them inst = LinkCommitResponse( - commit_hash="abc", synced=True, reason="new_commit", + commit_hash="abc", + synced=True, + reason="new_commit", ) assert inst.sweep_scope == "head_only" assert inst.range_size == 0 @@ -338,7 +349,8 @@ async def test_multi_region_edits_emit_pending_checks_per_region( await ledger.connect() # Append a second function so we have two regions in pricing.py - (repo_root / "pricing.py").write_text(dedent(""" + (repo_root / "pricing.py").write_text( + dedent(""" def calculate_discount(order_total): if order_total >= 100: return order_total * 0.10 @@ -347,7 +359,9 @@ def calculate_discount(order_total): def calculate_tax(order_total): return order_total * 0.08 - """).strip() + "\n") + """).strip() + + "\n" + ) _git(repo_root, "add", "pricing.py") _git(repo_root, "-c", "commit.gpgsign=false", "commit", "-q", "-m", "add tax") @@ -390,14 +404,17 @@ def calculate_tax(order_total): await handle_link_commit(ctx, "HEAD") # Now drift BOTH regions in one commit - (repo_root / "pricing.py").write_text(dedent(""" + (repo_root / "pricing.py").write_text( + dedent(""" def calculate_discount(order_total): return order_total * 999 # nonsense def calculate_tax(order_total): return order_total * 999 # nonsense - """).strip() + "\n") + """).strip() + + "\n" + ) _git(repo_root, "add", "pricing.py") _git(repo_root, "-c", "commit.gpgsign=false", "commit", "-q", "-m", "drift both") @@ -420,15 +437,12 @@ def 
calculate_tax(order_total): # Same intent across both checks (proves the shared-intent case). intent_ids = {p.decision_id for p in r2.pending_compliance_checks} assert len(intent_ids) == 1, ( - f"Multi-region test: pending checks should share one decision_id, " - f"got {intent_ids}" + f"Multi-region test: pending checks should share one decision_id, got {intent_ids}" ) # Distinct region_ids — the caller needs independent verdicts per region. region_ids = {p.region_id for p in r2.pending_compliance_checks} - assert len(region_ids) == 2, ( - f"Expected 2 distinct region_ids in the batch, got {region_ids}" - ) + assert len(region_ids) == 2, f"Expected 2 distinct region_ids in the batch, got {region_ids}" # Phase is drift (hash-mismatch triggered re-emission). phases = {p.phase for p in r2.pending_compliance_checks} diff --git a/tests/test_v0412_preflight.py b/tests/test_v0412_preflight.py index a4f4eabc..a24865dd 100644 --- a/tests/test_v0412_preflight.py +++ b/tests/test_v0412_preflight.py @@ -49,7 +49,6 @@ handle_preflight, ) - # ── Pure helpers ──────────────────────────────────────────────────── @@ -88,9 +87,7 @@ def test_validate_topic_strips_implementation_verbs(): def test_dedup_key_normalizes_word_order(): """'Stripe webhook' and 'webhook stripe' should dedup as same topic.""" - assert _dedup_key_for("Stripe webhook payment") == _dedup_key_for( - "payment webhook Stripe" - ) + assert _dedup_key_for("Stripe webhook payment") == _dedup_key_for("payment webhook Stripe") def test_check_dedup_marks_then_hits(): @@ -152,7 +149,9 @@ def _empty_search_response() -> SearchDecisionsResponse: return SearchDecisionsResponse( query="test", sync_status=LinkCommitResponse( - commit_hash="abc", synced=True, reason="new_commit", + commit_hash="abc", + synced=True, + reason="new_commit", ), matches=[], ungrounded_count=0, @@ -164,7 +163,9 @@ def _search_response_with(matches: list[DecisionMatch]) -> SearchDecisionsRespon return SearchDecisionsResponse( query="test", 
sync_status=LinkCommitResponse( - commit_hash="abc", synced=True, reason="new_commit", + commit_hash="abc", + synced=True, + reason="new_commit", ), matches=matches, ungrounded_count=sum(1 for m in matches if m.status == "ungrounded"), @@ -172,7 +173,9 @@ def _search_response_with(matches: list[DecisionMatch]) -> SearchDecisionsRespon ) -def _match(intent_id: str, status: str = "reflected", file_path: str = "src/foo.ts") -> DecisionMatch: +def _match( + intent_id: str, status: str = "reflected", file_path: str = "src/foo.ts" +) -> DecisionMatch: return DecisionMatch( decision_id=intent_id, description=f"decision {intent_id}", @@ -181,13 +184,15 @@ def _match(intent_id: str, status: str = "reflected", file_path: str = "src/foo. source_ref="test-ref", code_regions=[ CodeRegionSummary( - file_path=file_path, symbol="foo", lines=(1, 10), purpose="", + file_path=file_path, + symbol="foo", + lines=(1, 10), + purpose="", ) ], ) - @pytest.mark.asyncio async def test_topic_too_generic_returns_silent_skip(): ctx = _ctx() @@ -240,10 +245,12 @@ async def test_normal_mode_silent_on_plain_matches_only(): the only matches are reflected with no drift, no divergences, no open questions.""" ctx = _ctx(guided=False) - search = _search_response_with([ - _match("intent:1", status="reflected"), - _match("intent:2", status="reflected"), - ]) + search = _search_response_with( + [ + _match("intent:1", status="reflected"), + _match("intent:2", status="reflected"), + ] + ) with patch( "handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search), @@ -288,11 +295,11 @@ async def test_search_failure_fails_open(): """Robustness: if search throws, preflight returns fired=false silently — never blocks on bicameral being unavailable.""" ctx = _ctx() + async def _boom(*a, **kw): raise RuntimeError("ledger down") + with patch("handlers.preflight.handle_search_decisions", side_effect=_boom): r = await handle_preflight(ctx, topic="Stripe webhook payment") assert r.fired is False 
assert r.reason == "no_matches" - - diff --git a/tests/test_v0413_canonical_dedup.py b/tests/test_v0413_canonical_dedup.py index 9abad717..a89cfd08 100644 --- a/tests/test_v0413_canonical_dedup.py +++ b/tests/test_v0413_canonical_dedup.py @@ -30,7 +30,6 @@ canonicalize_text, ) - # ── Source ref canonicalization ───────────────────────────────────── @@ -57,7 +56,8 @@ def test_slack_three_variants_collapse(): def test_notion_strips_title_prefix(): out = canonicalize_source_ref( - "notion", "Page-Title-abc123def456abc123def456abc123ef45", + "notion", + "Page-Title-abc123def456abc123def456abc123ef45", ) # 32-char hex extracted from the end assert out.startswith("notion:") @@ -237,7 +237,8 @@ async def test_upsert_intent_collapses_whitespace_variant(monkeypatch, surreal_u decisions = await ledger.get_all_decisions(filter="all") matching = [ - d for d in decisions + d + for d in decisions if "redis" in d["description"].lower() and "session" in d["description"].lower() ] assert len(matching) == 1, ( diff --git a/tests/test_v0414_source_excerpt.py b/tests/test_v0414_source_excerpt.py index 8bccdb0e..a95436a4 100644 --- a/tests/test_v0414_source_excerpt.py +++ b/tests/test_v0414_source_excerpt.py @@ -24,7 +24,6 @@ from adapters.ledger import get_ledger, reset_ledger_singleton from context import BicameralContext - from handlers.detect_drift import handle_detect_drift from handlers.search_decisions import handle_search_decisions @@ -67,16 +66,17 @@ async def test_search_response_includes_source_excerpt(monkeypatch, surreal_url) ctx = BicameralContext.from_env() response = await handle_search_decisions( - ctx, query="token bucket rate limit", max_results=5, min_confidence=0.3, + ctx, + query="token bucket rate limit", + max_results=5, + min_confidence=0.3, ) assert response.matches, "Expected at least one match for the ingested decision" match = response.matches[0] assert "token bucket" in match.source_excerpt.lower(), ( f"source_excerpt should contain the meeting passage; 
got {match.source_excerpt!r}" ) - assert "Alex:" in match.source_excerpt, ( - "speaker prefix should be preserved in the raw passage" - ) + assert "Alex:" in match.source_excerpt, "speaker prefix should be preserved in the raw passage" assert match.meeting_date == "2026-03-30", ( f"meeting_date should round-trip; got {match.meeting_date!r}" ) @@ -116,7 +116,10 @@ async def test_empty_source_excerpt_is_graceful(monkeypatch, surreal_url): ctx = BicameralContext.from_env() response = await handle_search_decisions( - ctx, query="empty span test", max_results=5, min_confidence=0.3, + ctx, + query="empty span test", + max_results=5, + min_confidence=0.3, ) assert response.matches assert response.matches[0].source_excerpt == "" @@ -168,7 +171,9 @@ async def test_drift_entry_carries_source_excerpt(monkeypatch, surreal_url): ctx = BicameralContext.from_env() drift = await handle_detect_drift( - ctx, file_path="src/pricing/discount.py", use_working_tree=False, + ctx, + file_path="src/pricing/discount.py", + use_working_tree=False, ) assert drift.decisions, "Expected at least one decision from detect_drift" entry = drift.decisions[0] diff --git a/tests/test_v0416_gap_judge.py b/tests/test_v0416_gap_judge.py index af835788..95e90955 100644 --- a/tests/test_v0416_gap_judge.py +++ b/tests/test_v0416_gap_judge.py @@ -36,7 +36,6 @@ ) from handlers.ingest import handle_ingest - # ── Layer 1: pure rubric shape tests ──────────────────────────────── @@ -144,7 +143,10 @@ def test_build_context_decisions_groups_related_by_symbol(): source_ref="r1", code_regions=[ CodeRegionSummary( - file_path="src/limit.py", symbol="Limiter", lines=(1, 10), purpose="", + file_path="src/limit.py", + symbol="Limiter", + lines=(1, 10), + purpose="", ) ], drift_evidence="", @@ -160,7 +162,10 @@ def test_build_context_decisions_groups_related_by_symbol(): source_ref="r2", code_regions=[ CodeRegionSummary( - file_path="src/limit.py", symbol="Limiter", lines=(1, 10), purpose="", + file_path="src/limit.py", + 
symbol="Limiter", + lines=(1, 10), + purpose="", ) ], drift_evidence="", @@ -176,7 +181,10 @@ def test_build_context_decisions_groups_related_by_symbol(): source_ref="r3", code_regions=[ CodeRegionSummary( - file_path="src/other.py", symbol="Other", lines=(1, 10), purpose="", + file_path="src/other.py", + symbol="Other", + lines=(1, 10), + purpose="", ) ], drift_evidence="", @@ -222,8 +230,12 @@ def _seed_repo(repo_root: Path, body: str) -> None: _git(repo_root, "add", ".") _git( repo_root, - "-c", "commit.gpgsign=false", - "commit", "-q", "-m", "seed", + "-c", + "commit.gpgsign=false", + "commit", + "-q", + "-m", + "seed", ) @@ -303,7 +315,8 @@ async def test_judge_gaps_honest_empty_path(_isolated_ledger): ctx = BicameralContext.from_env() payload = await handle_judge_gaps( - ctx, topic="topic-that-has-no-decisions-anywhere", + ctx, + topic="topic-that-has-no-decisions-anywhere", ) assert payload is None @@ -333,7 +346,8 @@ async def test_judge_gaps_builds_context_pack(_isolated_ledger): # Search BM25 against the decision terms directly — generic topics # like "discount pricing" don't rank above min_confidence=0.3. 
judgment = await handle_judge_gaps( - ctx, topic="apply 10% discount on orders", + ctx, + topic="apply 10% discount on orders", ) assert judgment is not None, "judge_gaps must build a pack on matches" assert judgment.topic == "apply 10% discount on orders" @@ -342,9 +356,7 @@ async def test_judge_gaps_builds_context_pack(_isolated_ledger): assert "VERBATIM" in judgment.judgment_prompt assert judgment.as_of, "as_of must be populated with ISO datetime" - assert len(judgment.decisions) >= 1, ( - "judge_gaps should see the just-ingested decision" - ) + assert len(judgment.decisions) >= 1, "judge_gaps should see the just-ingested decision" decision = judgment.decisions[0] assert "10%" in decision.description or "discount" in decision.description.lower() assert "10%" in decision.source_excerpt or "$100" in decision.source_excerpt diff --git a/tests/test_v0416_natural_format_fields.py b/tests/test_v0416_natural_format_fields.py index d5c5f674..ed563824 100644 --- a/tests/test_v0416_natural_format_fields.py +++ b/tests/test_v0416_natural_format_fields.py @@ -29,9 +29,11 @@ def test_canonical_description_survives(): """`decisions[].description` is the canonical field — must produce a mapping with the description as the intent.""" - out = _normalize_payload({ - "decisions": [{"description": "Use Redis for session cache"}], - }) + out = _normalize_payload( + { + "decisions": [{"description": "Use Redis for session cache"}], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "Use Redis for session cache" @@ -41,9 +43,11 @@ def test_canonical_description_survives(): def test_canonical_title_fallback(): """`decisions[].title` is the documented secondary field — used when `description` is absent.""" - out = _normalize_payload({ - "decisions": [{"title": "Apply 10% discount on orders over $100"}], - }) + out = _normalize_payload( + { + "decisions": [{"title": "Apply 10% discount on orders over $100"}], + } + ) mappings = 
out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "Apply 10% discount on orders over $100" @@ -53,9 +57,11 @@ def test_text_alias_for_decisions(): """v0.4.16 alias: `text` on a decision should flow through as the intent. This is the exact shape the old SKILL.md documented; keeping it working guards against a regression.""" - out = _normalize_payload({ - "decisions": [{"text": "Cache user sessions in Redis"}], - }) + out = _normalize_payload( + { + "decisions": [{"text": "Cache user sessions in Redis"}], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "Cache user sessions in Redis" @@ -65,12 +71,16 @@ def test_description_preferred_over_text_when_both_present(): """When a decision has both `description` and `text`, the canonical `description` wins. This is the documented priority order: description > title > text.""" - out = _normalize_payload({ - "decisions": [{ - "description": "canonical description wins", - "text": "alias should lose", - }], - }) + out = _normalize_payload( + { + "decisions": [ + { + "description": "canonical description wins", + "text": "alias should lose", + } + ], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "canonical description wins" @@ -79,13 +89,15 @@ def test_description_preferred_over_text_when_both_present(): def test_decision_with_all_text_fields_empty_is_dropped(): """If a decision has no text in any accepted field, it must be silently dropped rather than producing a phantom mapping.""" - out = _normalize_payload({ - "decisions": [ - {"description": "real decision"}, - {"status": "proposed"}, # no description/title/text - {"id": "abc", "participants": ["Ian"]}, # metadata only - ], - }) + out = _normalize_payload( + { + "decisions": [ + {"description": "real decision"}, + {"status": "proposed"}, # no description/title/text + {"id": "abc", "participants": ["Ian"]}, # metadata only + ], + } + ) 
mappings = out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "real decision" @@ -95,9 +107,11 @@ def test_action_items_not_written_to_ledger(): """action_items are accepted in payload for backwards compat but NOT written to the ledger (not converted to mappings). They belong in a ticket tracker, not the decision ledger.""" - out = _normalize_payload({ - "action_items": [{"action": "Write retry tests", "owner": "Ian"}], - }) + out = _normalize_payload( + { + "action_items": [{"action": "Write retry tests", "owner": "Ian"}], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 0 @@ -105,10 +119,12 @@ def test_action_items_not_written_to_ledger(): def test_action_items_mixed_with_decisions(): """When payload has both decisions and action_items, only decisions become mappings — action_items are silently ignored.""" - out = _normalize_payload({ - "decisions": [{"description": "Use Redis for session cache"}], - "action_items": [{"action": "Write retry tests", "owner": "Ian"}], - }) + out = _normalize_payload( + { + "decisions": [{"description": "Use Redis for session cache"}], + "action_items": [{"action": "Write retry tests", "owner": "Ian"}], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 1 assert mappings[0]["intent"] == "Use Redis for session cache" @@ -120,17 +136,19 @@ def test_the_exact_dogfood_payload(): 1 phantom '[Action: Ian] ' mapping, grounded to unrelated symbols. 
After the fix: only real decisions surface; action_items are accepted for backwards compat but not written to the ledger.""" - out = _normalize_payload({ - "source": "transcript", - "title": "demo-gallery", - "decisions": [ - {"text": "Cache user sessions in Redis for horizontal scaling"}, - {"text": "Apply 10% discount on orders over $100"}, - ], - "action_items": [ - {"text": "Write tests for retry policy", "owner": "Ian"}, - ], - }) + out = _normalize_payload( + { + "source": "transcript", + "title": "demo-gallery", + "decisions": [ + {"text": "Cache user sessions in Redis for horizontal scaling"}, + {"text": "Apply 10% discount on orders over $100"}, + ], + "action_items": [ + {"text": "Write tests for retry policy", "owner": "Ian"}, + ], + } + ) mappings = out.get("mappings", []) intents = [m["intent"] for m in mappings] assert "Cache user sessions in Redis for horizontal scaling" in intents @@ -143,13 +161,15 @@ def test_the_exact_dogfood_payload(): def test_mixed_canonical_and_alias_in_same_payload(): """A payload can mix canonical and alias fields across decisions — the handler normalizes each decision independently.""" - out = _normalize_payload({ - "decisions": [ - {"description": "First decision via canonical field"}, - {"title": "Second decision via title fallback"}, - {"text": "Third decision via text alias"}, - ], - }) + out = _normalize_payload( + { + "decisions": [ + {"description": "First decision via canonical field"}, + {"title": "Second decision via title fallback"}, + {"text": "Third decision via text alias"}, + ], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 3 assert mappings[0]["intent"] == "First decision via canonical field" @@ -160,11 +180,13 @@ def test_mixed_canonical_and_alias_in_same_payload(): def test_action_items_always_produce_zero_mappings(): """action_items are never written to the ledger regardless of their fields. 
This guards against the '[Action: <owner>] ' phantom-mapping regression.""" - out = _normalize_payload({ - "action_items": [ - {"action": "real action", "owner": "Ian"}, - {"action": "another action"}, - ], - }) + out = _normalize_payload( + { + "action_items": [ + {"action": "real action", "owner": "Ian"}, + {"action": "another action"}, + ], + } + ) mappings = out.get("mappings", []) assert len(mappings) == 0 diff --git a/tests/test_v0417_jargon_hygiene.py b/tests/test_v0417_jargon_hygiene.py index 231135ef..87eb169e 100644 --- a/tests/test_v0417_jargon_hygiene.py +++ b/tests/test_v0417_jargon_hygiene.py @@ -58,10 +58,12 @@ def _all_skill_files() -> list[Path]: - return sorted([ - *_MCP_ROOT.glob("skills/**/SKILL.md"), - *_MCP_ROOT.glob(".claude/skills/**/SKILL.md"), - ]) + return sorted( + [ + *_MCP_ROOT.glob("skills/**/SKILL.md"), + *_MCP_ROOT.glob(".claude/skills/**/SKILL.md"), + ] + ) def _compile_patterns() -> list[tuple[str, re.Pattern]]: @@ -97,10 +99,7 @@ def test_no_backend_jargon_in_skill_files(): for match in pattern.finditer(body): # Find the line number for a useful error message line_no = body.count("\n", 0, match.start()) + 1 - offenders.append( - f"{rel}:{line_no}: " - f"'{match.group()}' (term: '{term}')" - ) + offenders.append(f"{rel}:{line_no}: '{match.group()}' (term: '{term}')") assert not offenders, ( "Backend jargon found in user-facing skill files:\n" + "\n".join(f" - {o}" for o in offenders) @@ -129,9 +128,8 @@ def test_no_backend_jargon_in_tool_descriptions(): continue # Match Tool(...) 
— plain Name or attribute reference func = node.func - is_tool = ( - (isinstance(func, ast.Name) and func.id == "Tool") - or (isinstance(func, ast.Attribute) and func.attr == "Tool") + is_tool = (isinstance(func, ast.Name) and func.id == "Tool") or ( + isinstance(func, ast.Attribute) and func.attr == "Tool" ) if not is_tool: continue @@ -152,13 +150,10 @@ def test_no_backend_jargon_in_tool_descriptions(): for term, pattern in patterns: for match in pattern.finditer(desc_text): - offenders.append( - f"Tool '{tool_name}': '{match.group()}' (term: '{term}')" - ) + offenders.append(f"Tool '{tool_name}': '{match.group()}' (term: '{term}')") - assert not offenders, ( - "Backend jargon found in Tool descriptions:\n" - + "\n".join(f" - {o}" for o in offenders) + assert not offenders, "Backend jargon found in Tool descriptions:\n" + "\n".join( + f" - {o}" for o in offenders ) diff --git a/tests/test_v0420_history.py b/tests/test_v0420_history.py index b64403cb..d751dd6a 100644 --- a/tests/test_v0420_history.py +++ b/tests/test_v0420_history.py @@ -24,7 +24,6 @@ from context import BicameralContext from handlers.history import handle_history - # ── Fixtures ───────────────────────────────────────────────────────────────── @@ -105,20 +104,27 @@ async def test_empty_ledger(ctx): async def test_single_source_reflected(ctx): """One decision with a code region → one feature, one decision, status reflected or ungrounded.""" ledger = get_ledger() - await _ingest(ledger, _payload([ - _mapping( - description="Use tree-sitter for symbol extraction", - source_type="transcript", - source_ref="sprint-1", - code_regions=[{ - "file_path": "server.py", - "symbol": "validate_symbols", - "type": "function", - "start_line": 10, - "end_line": 30, - }], - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Use tree-sitter for symbol extraction", + source_type="transcript", + source_ref="sprint-1", + code_regions=[ + { + "file_path": "server.py", + "symbol": 
"validate_symbols", + "type": "function", + "start_line": 10, + "end_line": 30, + } + ], + ) + ] + ), + ) response = await handle_history(ctx) @@ -157,10 +163,7 @@ async def test_multi_source_same_decision(ctx): response = await handle_history(ctx) # Count matching decisions across all features - matching = [ - d for f in response.features for d in f.decisions - if "Cache sessions" in d.summary - ] + matching = [d for f in response.features for d in f.decisions if "Cache sessions" in d.summary] # With dedup, should be exactly 1 assert len(matching) == 1, ( f"Expected 1 deduped decision, got {len(matching)}: {[d.summary for d in matching]}" @@ -172,14 +175,19 @@ async def test_multi_source_same_decision(ctx): async def test_ungrounded_no_fulfillment(ctx): """Decision with no code regions → fulfillment is None, status ungrounded or discovered.""" ledger = get_ledger() - await _ingest(ledger, _payload([ - _mapping( - description="Implement SOC2 audit logging", - source_type="document", - source_ref="compliance-doc", - code_regions=[], # no grounding - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Implement SOC2 audit logging", + source_type="document", + source_ref="compliance-doc", + code_regions=[], # no grounding + ) + ] + ), + ) response = await handle_history(ctx) @@ -196,13 +204,18 @@ async def test_ungrounded_no_fulfillment(ctx): async def test_agent_session_source_type(ctx): """source_type='agent_session' round-trips through history correctly.""" ledger = get_ledger() - await _ingest(ledger, _payload([ - _mapping( - description="Use event.id for deduplication, not account_id", - source_type="agent_session", - source_ref="preflight-resolution-stripe-webhook", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Use event.id for deduplication, not account_id", + source_type="agent_session", + source_ref="preflight-resolution-stripe-webhook", + ) + ] + ), + ) response = await handle_history(ctx) @@ 
-226,28 +239,43 @@ async def test_feature_group_grouping(ctx): ledger = get_ledger() # Two separate ingests, same feature_group - await _ingest(ledger, _payload([ - _mapping( - description="Stripe webhook uses SETNX for idempotency", - source_ref="sprint-5", - feature_group="Stripe Webhooks", - ) - ])) - await _ingest(ledger, _payload([ - _mapping( - description="Stripe webhook retries use exponential backoff", - source_ref="sprint-5", - feature_group="Stripe Webhooks", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Stripe webhook uses SETNX for idempotency", + source_ref="sprint-5", + feature_group="Stripe Webhooks", + ) + ] + ), + ) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Stripe webhook retries use exponential backoff", + source_ref="sprint-5", + feature_group="Stripe Webhooks", + ) + ] + ), + ) # Different feature group - await _ingest(ledger, _payload([ - _mapping( - description="Google Calendar syncs via OAuth2", - source_ref="sprint-6", - feature_group="Google Calendar", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Google Calendar syncs via OAuth2", + source_ref="sprint-6", + feature_group="Google Calendar", + ) + ] + ), + ) response = await handle_history(ctx) @@ -280,25 +308,27 @@ async def test_feature_group_fallback_to_query(ctx): ledger = get_ledger() # Ingest without feature_group (pre-v0.5.1 style) - await ledger.ingest_payload({ - "repo": "test-repo", - "query": "auth middleware", - "mappings": [ - { - "intent": "JWT tokens expire after 24 hours", - "span": { - "text": "JWT tokens expire after 24 hours", - "source_type": "transcript", - "source_ref": "auth-sync-2026-04", - "speakers": [], - "meeting_date": "2026-04-01", - }, - "symbols": [], - "code_regions": [], - # no feature_group - } - ], - }) + await ledger.ingest_payload( + { + "repo": "test-repo", + "query": "auth middleware", + "mappings": [ + { + "intent": "JWT tokens expire after 24 hours", + 
"span": { + "text": "JWT tokens expire after 24 hours", + "source_type": "transcript", + "source_ref": "auth-sync-2026-04", + "speakers": [], + "meeting_date": "2026-04-01", + }, + "symbols": [], + "code_regions": [], + # no feature_group + } + ], + } + ) response = await handle_history(ctx) @@ -323,13 +353,18 @@ async def test_truncation_at_50_features(ctx): # Create 51 decisions with distinct feature_groups for i in range(51): - await _ingest(ledger, _payload([ - _mapping( - description=f"Decision for feature area {i}", - source_ref=f"ref-{i}", - feature_group=f"Feature Area {i:03d}", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description=f"Decision for feature area {i}", + source_ref=f"ref-{i}", + feature_group=f"Feature Area {i:03d}", + ) + ] + ), + ) response = await handle_history(ctx) @@ -347,20 +382,30 @@ async def test_feature_filter(ctx): ledger = get_ledger() # Create two distinct feature groups - await _ingest(ledger, _payload([ - _mapping( - description="Checkout uses Stripe payment intents", - source_ref="ref-checkout", - feature_group="Checkout Flow", - ) - ])) - await _ingest(ledger, _payload([ - _mapping( - description="Auth uses JWT with 24h expiry", - source_ref="ref-auth", - feature_group="Auth Middleware", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Checkout uses Stripe payment intents", + source_ref="ref-checkout", + feature_group="Checkout Flow", + ) + ] + ), + ) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Auth uses JWT with 24h expiry", + source_ref="ref-auth", + feature_group="Auth Middleware", + ) + ] + ), + ) response = await handle_history(ctx, feature_filter="checkout") @@ -380,13 +425,18 @@ async def test_feature_filter(ctx): async def test_include_superseded_false(ctx): """include_superseded=False excludes superseded decisions from response.""" ledger = get_ledger() - await _ingest(ledger, _payload([ - _mapping( - description="Use Redis for session 
caching", - source_ref="sprint-1", - feature_group="Session Management", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Use Redis for session caching", + source_ref="sprint-1", + feature_group="Session Management", + ) + ] + ), + ) # All decisions will be ungrounded (not superseded) in this test, # so we just verify the parameter is accepted and response is valid. @@ -403,13 +453,18 @@ async def test_include_superseded_false(ctx): async def test_response_structure(ctx): """HistoryResponse has the correct structure and types.""" ledger = get_ledger() - await _ingest(ledger, _payload([ - _mapping( - description="Rate limit API calls to 1000 req/min per tenant", - source_ref="sprint-3", - feature_group="Rate Limiting", - ) - ])) + await _ingest( + ledger, + _payload( + [ + _mapping( + description="Rate limit API calls to 1000 req/min per tenant", + source_ref="sprint-3", + feature_group="Rate Limiting", + ) + ] + ), + ) response = await handle_history(ctx) diff --git a/tests/test_v048_sync_dedup.py b/tests/test_v048_sync_dedup.py index 94fa358b..46d45bb4 100644 --- a/tests/test_v048_sync_dedup.py +++ b/tests/test_v048_sync_dedup.py @@ -55,8 +55,12 @@ def _seed_repo(repo_root: Path, body: str) -> None: _git(repo_root, "add", ".") _git( repo_root, - "-c", "commit.gpgsign=false", - "commit", "-q", "-m", "seed", + "-c", + "commit.gpgsign=false", + "commit", + "-q", + "-m", + "seed", ) @@ -65,8 +69,12 @@ def _commit_edit(repo_root: Path, new_body: str, message: str) -> None: _git(repo_root, "add", "pricing.py") _git( repo_root, - "-c", "commit.gpgsign=false", - "commit", "-q", "-m", message, + "-c", + "commit.gpgsign=false", + "commit", + "-q", + "-m", + message, ) @@ -114,8 +122,7 @@ async def test_dedup_second_call_normalizes_reason(_isolated_ledger): r2 = await handle_link_commit(ctx, "HEAD") assert r2.reason == "already_synced", ( - f"Dedup hit must normalize reason to 'already_synced', " - f"got {r2.reason!r}" + f"Dedup hit must 
normalize reason to 'already_synced', got {r2.reason!r}" ) # Cached fields should match the first call's real values (B23). assert r2.commit_hash == r1.commit_hash @@ -149,9 +156,7 @@ async def _counting_ingest_commit(*args, **kwargs): ctx = _ctx() await handle_link_commit(ctx, "HEAD") - assert call_count["n"] == 1, ( - f"First call should hit the ledger once, got {call_count['n']}" - ) + assert call_count["n"] == 1, f"First call should hit the ledger once, got {call_count['n']}" # Second call WITHOUT invalidate — dedup short-circuits, no ledger hit. await handle_link_commit(ctx, "HEAD") @@ -202,8 +207,7 @@ def calculate_discount(order_total): f"trusting it instead of re-reading git HEAD." ) assert r2.commit_hash != r1.commit_hash, ( - f"New HEAD SHA should differ from old. r1={r1.commit_hash!r}, " - f"r2={r2.commit_hash!r}" + f"New HEAD SHA should differ from old. r1={r1.commit_hash!r}, r2={r2.commit_hash!r}" ) @@ -224,7 +228,6 @@ async def test_explicit_sha_dedup(_isolated_ledger): r2 = await handle_link_commit(ctx, head_sha) assert r2.reason == "already_synced", ( - f"Second call with same explicit SHA should dedup — " - f"got reason={r2.reason!r}" + f"Second call with same explicit SHA should dedup — got reason={r2.reason!r}" ) assert r2.commit_hash == r1.commit_hash diff --git a/tests/test_v055_region_anchored_preflight.py b/tests/test_v055_region_anchored_preflight.py index 73b2c01e..94886243 100644 --- a/tests/test_v055_region_anchored_preflight.py +++ b/tests/test_v055_region_anchored_preflight.py @@ -30,7 +30,6 @@ handle_preflight, ) - # ── Fixtures ──────────────────────────────────────────────────────────────── @@ -82,12 +81,14 @@ def _make_ctx( queried. 
""" ledger = MagicMock() - ledger.ingest_commit = AsyncMock(return_value={ - "commit_hash": "abc123", - "new_decisions_linked": 0, - "drift_detected": [], - "symbols_indexed": 0, - }) + ledger.ingest_commit = AsyncMock( + return_value={ + "commit_hash": "abc123", + "new_decisions_linked": 0, + "drift_detected": [], + "symbols_indexed": 0, + } + ) ledger.get_decisions_for_files = AsyncMock(return_value=region_decisions or []) ledger.search_by_query = AsyncMock(return_value=[]) @@ -221,9 +222,17 @@ async def test_preflight_fires_on_region_hit_no_keyword(): ) with ( - patch("handlers.link_commit.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.search_decisions.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search_resp)), + patch( + "handlers.link_commit.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.search_decisions.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search_resp) + ), ): resp = await handle_preflight( ctx, @@ -248,9 +257,17 @@ async def test_preflight_region_in_sources_chained(): ) with ( - patch("handlers.link_commit.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.search_decisions.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search_resp)), + patch( + "handlers.link_commit.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.search_decisions.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.preflight.handle_search_decisions", 
new=AsyncMock(return_value=search_resp) + ), ): resp = await handle_preflight( ctx, @@ -287,9 +304,17 @@ async def test_preflight_topic_only_no_file_paths_still_works(): ) with ( - patch("handlers.link_commit.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.search_decisions.handle_link_commit", new=AsyncMock(return_value=_make_link_commit_response())), - patch("handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search_resp)), + patch( + "handlers.link_commit.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.search_decisions.handle_link_commit", + new=AsyncMock(return_value=_make_link_commit_response()), + ), + patch( + "handlers.preflight.handle_search_decisions", new=AsyncMock(return_value=search_resp) + ), ): resp = await handle_preflight(ctx, topic="drifted stripe webhook handler")