diff --git a/CHANGELOG.md b/CHANGELOG.md index 250a239c..1e463c4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,69 @@ All notable changes to bicameral-mcp are tracked here. Format loosely follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## 0.4.23 — 2026-04-21 — caller-LLM-driven retrieval + search_hint recall booster + +Addresses the BM25 vocab-mismatch problem that surfaced after v0.4.20 +made grounding status honest. Decisions whose natural-language +description doesn't lexically overlap with the real code identifier +vocabulary were getting bound to whatever file incidentally shared a +keyword — "email dispatch" binding to a React toast reducer's +`dispatch`, "active subscriber" binding to an unrelated `AcquisitionFunnel.tsx` +`ActiveUser` component. Under v0.4.19's silent auto-promotion nobody +saw this; under v0.4.20's honest PENDING projection users saw garbage +bindings and had nothing to do about them. + +Two changes, both within the existing deterministic-retrieval moat: + +### Changed — caller-LLM retrieval is now the default (Lever 1) + +- `skills/bicameral-ingest/SKILL.md` restructured. Step 2 is now + *"Resolve code regions via the MCP retrieval tools"* — caller LLM is + instructed to use `validate_symbols` + `search_code` + `get_neighbors` + to build explicit `code_regions` from codebase evidence *before* + ingesting. Step 3 now leads with the internal format (with explicit + regions) as the preferred shape. Natural format remains supported + as the fallback for abstract decisions with no resolvable code surface. +- No server-side code changes. The server already accepted internal- + format ingest payloads; this flips the skill's default guidance from + *"use natural format, let BM25 handle it"* to *"resolve explicitly, + fall back to BM25 only when necessary."* + +### Added — `search_hint` recall booster (Lever 2) + +- `IngestMapping.search_hint: str` and `IngestDecision.search_hint: str` + — optional caller-supplied field carrying synonyms / domain vocab / + likely identifier names that the decision's description wouldn't + contain literally. Used only when the mapping falls through to + server-side auto-grounding. +- `adapters.code_locator.ground_mappings` concatenates + `description + " " + search_hint` as the BM25 query when the hint is + non-empty. Strictly additive: omitted hint = pre-v0.4.23 behavior. +- `search_hint` is query-only metadata. It is never stored on + `intent.description` and never surfaces in briefs, status responses, + or the gap-judge context pack. Humans see the clean decision text; + BM25 sees the widened query. + +### Guarantee preserved — no server-side LLM + +Retrieval remains deterministic at runtime. The caller LLM does the +expensive lookup at ingest time (when it has your full codebase +context), writes explicit `code_regions`, and the server's BM25 fallback +is only consulted for truly abstract decisions. This keeps the tech +moat (*deterministic, provider-agnostic retrieval*) intact while +fixing the quality complaint. + +### Upgrade notes + +- **Existing bindings from pre-v0.4.23 ingests are unchanged.** If you + have false-positive bindings from BM25 auto-grounding (e.g., dispatch + intents bound to `use-toast.ts`), they persist in the graph. To clean + them up today: `bicameral.reset` → re-ingest under the new skill + defaults. A targeted edge-pruning path is tracked for a future release. +- **No schema change**, no migration, no behavior shift for running + callers — the skill update only changes the default path the caller + LLM takes when the bicameral-ingest skill is invoked. + ## 0.4.22 — 2026-04-20 — hotfix: init_schema idempotent against existing persistent DB **Hotfix for v0.4.20 regression on persistent DBs.** Phase 1b made diff --git a/RECOMMENDED_VERSION b/RECOMMENDED_VERSION index 39cf8aa8..a7475fbf 100644 --- a/RECOMMENDED_VERSION +++ b/RECOMMENDED_VERSION @@ -1 +1 @@ -0.4.22 +0.4.23 diff --git a/adapters/code_locator.py b/adapters/code_locator.py index ad7f69ec..0c61e56d 100644 --- a/adapters/code_locator.py +++ b/adapters/code_locator.py @@ -404,6 +404,17 @@ def ground_mappings(self, mappings: list[dict]) -> tuple[list[dict], int]: resolved.append(mapping) continue + # v0.4.23 (Lever 2): widen the BM25 query with the caller-LLM's + # search_hint — synonyms, domain vocab, likely identifier names. + # Strictly additive: if no hint is provided, behavior is + # identical to pre-0.4.23. Hint is NOT stored as part of the + # intent's description; it's query-only metadata. + search_hint = (mapping.get("search_hint") or "").strip() + if search_hint: + bm25_query = f"{description} {search_hint}" + else: + bm25_query = description + # FC-1 guard: refuse to ground queries that degenerate to <2 corpus # tokens. Witnessed in Accountable (2026-04-13): "GitHub Discussions # vs Slack" left only ``slack`` after stopword filtering, BM25 @@ -411,23 +422,23 @@ def ground_mappings(self, mappings: list[dict]) -> tuple[list[dict], int]: # to ``log-error-to-slack/index.ts:getFeatureName`` by tiebreak. # Under-specified queries belong as ungrounded open questions. try: - corpus_token_count = self._search_tool.bm25.count_corpus_tokens(description) + corpus_token_count = self._search_tool.bm25.count_corpus_tokens(bm25_query) except Exception as exc: - logger.debug("[ground] FC-1 token count failed for '%s': %s", description[:60], exc) + logger.debug("[ground] FC-1 token count failed for '%s': %s", bm25_query[:60], exc) corpus_token_count = 2 # fail-open: do not block grounding on detector failure if corpus_token_count < 2: logger.info( "[ground] FC-1 skip: %d corpus tokens in %r — leaving ungrounded", - corpus_token_count, description[:60], + corpus_token_count, bm25_query[:60], ) resolved.append(mapping) continue # Run BM25 search once and reuse across tiers. try: - hits = self.search_code(description) + hits = self.search_code(bm25_query) except Exception as exc: - logger.warning("[ground] BM25 search failed for '%s': %s", description[:60], exc) + logger.warning("[ground] BM25 search failed for '%s': %s", bm25_query[:60], exc) hits = [] code_regions: list[dict] = [] diff --git a/contracts.py b/contracts.py index ce57e06f..1f85c0f7 100644 --- a/contracts.py +++ b/contracts.py @@ -394,6 +394,13 @@ class IngestMapping(BaseModel): span: IngestSpan = IngestSpan() symbols: list[str] = [] code_regions: list[IngestCodeRegion] = [] + # v0.4.23 (Lever 2): optional additional BM25 search terms — synonyms, + # related domain vocab, likely code identifiers — used to widen recall + # when the server falls through to auto-grounding. Only consulted when + # ``code_regions`` is empty. The caller LLM supplies these at ingest + # time via the bicameral-ingest skill. Stored-intent.description is + # never polluted with these terms; they're query-only metadata. + search_hint: str = "" class IngestDecision(BaseModel): @@ -417,6 +424,13 @@ class IngestDecision(BaseModel): # the conclusion), and the ingest path will skip creating a # placeholder source_span row. source_excerpt: str = "" + # v0.4.23 (Lever 2): BM25 recall booster for the auto-grounding + # fallback path. Caller LLM supplies synonyms, domain vocab, likely + # identifier names that the decision's description wouldn't contain + # literally ("subscription check" → "resolveMemberStatus + # isActiveSubscriber dispatch_reminders dispatch_interventions"). Only + # consulted when no explicit code_regions are resolved. + search_hint: str = "" class IngestActionItem(BaseModel): diff --git a/handlers/ingest.py b/handlers/ingest.py index 073f81c5..f2e0c6fe 100644 --- a/handlers/ingest.py +++ b/handlers/ingest.py @@ -56,6 +56,8 @@ def _normalize_payload(payload: dict) -> dict: }, "symbols": [], "code_regions": [], + # v0.4.23: propagate search_hint from natural format → internal. + "search_hint": d.search_hint, }) for a in validated.action_items: diff --git a/pyproject.toml b/pyproject.toml index 12d3c71a..8edeabba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "bicameral-mcp" -version = "0.4.22" +version = "0.4.23" description = "Decision ledger MCP server — ingests meeting transcripts, maps decisions to code, tracks drift" readme = "README.md" requires-python = ">=3.10" diff --git a/skills/bicameral-ingest/SKILL.md b/skills/bicameral-ingest/SKILL.md index 8caff42e..b91a9277 100644 --- a/skills/bicameral-ingest/SKILL.md +++ b/skills/bicameral-ingest/SKILL.md @@ -158,21 +158,85 @@ Keep the business driver attached to each decision's description so the gap judg → **Extract: 1 decision** — "Add PII redaction to the audit log (driver: GDPR self-assessment data-minimization check, next month deadline)." The key-rotation line is security hygiene with no business driver named — reject it. A PM reviewing the ledger can act on the GDPR item; they can't act on key rotation. -### 2. Validate relevance against the codebase +### 2. Resolve code regions via the MCP retrieval tools (v0.4.23+ default) + +**This is where grounding quality is won or lost.** Server-side BM25 is a fallback +for *abstract* decisions with no identifiable code anchor. For every decision +that touches concrete code, **you** (the caller LLM) should resolve explicit +`code_regions` using the MCP retrieval tools before ingesting. You have full +codebase context; BM25 has a bag of tokens. Use your advantage. + +**Procedure per decision**: + +1. **Generate symbol hypotheses** from the decision text. If a decision says + *"all email dispatch functions filter via a single source-of-truth check,"* + your hypotheses are `dispatchReminders`, `dispatchInterventions`, + `dispatchNudge`, `resolveMemberStatus`, `isActiveSubscriber` — not just + the literal word "dispatch." +2. **Call `validate_symbols`** with the hypotheses. Keep symbols that actually + exist in the index; drop the rest. +3. **Call `search_code`** with the validated symbol_ids (not the raw decision + text — seeded graph traversal is strictly better than keyword BM25 for + finding the real regions). Take the top hits that look relevant. +4. **Call `get_neighbors`** on the top hit if you're unsure of scope — surfaces + callers/callees so you can tell whether the decision is local to one + function or spans a call tree. +5. **Build explicit `code_regions`** — `{file_path, symbol, start_line, end_line, type}` — + from the validated tool output. Prefer function-level pins over file-level; + bind to the tightest region that still covers the decision's surface area. + +**Grounding quality: filter out false positives before ingesting**. If +`search_code` returns a hit that keyword-matches but doesn't actually implement +anything related to the decision, drop it. Example: a decision about email +dispatch should NOT bind to a React `dispatch` reducer just because the word +appears. Ingesting garbage bindings means every edit to that unrelated file +triggers a drift alarm later — noise that drowns out real signal. + +**Skip decisions that don't bind to real code**. If after this procedure the +decision has zero concrete regions AND names no valid symbols, it's either +(a) strategic (drop it) or (b) a genuine "pending" decision for code that +doesn't exist yet. For the pending case, ingest it with empty `code_regions` +but include a `search_hint` (see Step 3) so the server's future re-grounding +sweeps have something to work with. -For each candidate decision, use the code locator tools to check whether it touches real code: - -- Call `search_code` with a query derived from the decision text. If results come back with relevant hits, the decision is groundable. -- If the decision mentions specific symbols (functions, classes, modules), call `validate_symbols` with those names to confirm they exist. -- If a decision returns **zero relevant code hits** and names **no valid symbols**, it is likely strategic — drop it unless it describes something that *should* be built but doesn't exist yet (a genuine "pending" decision). +### 3. Ingest the filtered set -This step is a lightweight filter, not an exhaustive audit. Spend ~1 search per candidate decision. +Call `bicameral.ingest` using the **internal format** (preferred from +v0.4.23+ onward) with the `code_regions` you resolved in step 2. Natural +format remains supported as a fallback for truly abstract decisions with +no resolvable code surface. -### 3. Ingest the filtered set +**Internal format** (preferred v0.4.23+) — use this when you resolved +`code_regions` in Step 2: -Call `bicameral.ingest` with a `payload` using the **natural format** (preferred). Only include decisions that passed the relevance filter from step 2. +``` +payload: { + query: "", + mappings: [ + { + intent: "Cache user sessions in Redis for horizontal scaling", + span: { + text: "", + source_type: "transcript", + source_ref: "sprint-14-planning", + meeting_date: "2026-04-15", + speakers: ["Ian", "Brian"] + }, + symbols: ["SessionCache", "RedisClient"], + code_regions: [ + { file_path: "src/lib/session.ts", symbol: "SessionCache", + start_line: 42, end_line: 89, type: "class" }, + { file_path: "src/lib/redis.ts", symbol: "RedisClient", + start_line: 1, end_line: 34, type: "class" } + ], + search_hint: "SessionCache RedisClient session-cache horizontal scaling" + } + ] +} +``` -**Natural format** — canonical fields (use this shape): +**Natural format** (fallback) — use when a decision is truly abstract +and has no resolvable code surface: ``` payload: { @@ -184,10 +248,12 @@ payload: { decisions: [ { description: "Cache user sessions in Redis for horizontal scaling", - id: "sprint-14-planning#session-cache" # optional stable id + id: "sprint-14-planning#session-cache", # optional stable id + search_hint: "SessionCache RedisClient session cache horizontal scaling" }, { - description: "Apply 10% discount on orders ≥ $100" + description: "Apply 10% discount on orders ≥ $100", + search_hint: "calculateDiscount order_total applyDiscount PricingService" } ], action_items: [ @@ -198,37 +264,19 @@ payload: { **Field rules** — get these right or decisions evaporate: +- **`mappings[].code_regions`** is the whole game from v0.4.23+. When you pass explicit regions, server BM25 does not run for that mapping — grounding is exactly what you resolved. No false positives from vocab mismatch. +- **`search_hint`** is the fallback recall booster. When server BM25 *does* run (you didn't resolve `code_regions`), the server concatenates `intent.description + search_hint` as the BM25 query. Put 3-5 likely identifier names or domain synonyms here — exactly the kind of vocabulary your codebase uses that the decision's natural-language description wouldn't contain literally. Example: a decision about "subscription status source-of-truth" won't mention `resolveMemberStatus` or `isActiveSubscriber` but BM25 needs those tokens to find the right dispatch functions. `search_hint` is query-only — it's never stored as part of the intent's description and never appears in briefs. - **`decisions[].description`** is the canonical text field. `title` is accepted as a synonym for back-compat; `text` is tolerated as an alias (v0.4.16+). At least one of the three must be non-empty or the decision is silently dropped. - **`action_items[].action`** is the canonical text field. `text` is tolerated as an alias (v0.4.16+). `owner` defaults to `"unassigned"`. `due` is an optional ISO date. - **`query`** is load-bearing: it's the topic the post-ingest auto-brief and gap-judge chain fire on. If you omit it, the handler falls through to the longest decision description as a topic guess — usable but less focused. **When fanning out from the boundary-detection flow (step 0), always pass each segment's title as `query`.** -- **`participants`** on the payload populates `span.speakers` for every decision. Put the meeting attendees here, not on individual decisions. +- **`participants`** (natural format) or **`span.speakers`** (internal format) records the meeting attendees. - Do NOT include `open_questions` unless they have direct implementation implications — they're accepted as `list[str]` but clutter the ledger with non-code entries. -**Internal format** — only if you already have pre-resolved code regions from `search_code` / `validate_symbols`: - -``` -payload: { - query: "...", - mappings: [ - { - intent: "Cache user sessions in Redis", - span: { - text: "", - source_type: "transcript", - source_ref: "sprint-14-planning", - meeting_date: "2026-04-15" - }, - symbols: ["SessionCache"], - code_regions: [ - { file_path: "src/lib/session.ts", symbol: "SessionCache", - start_line: 42, end_line: 89, type: "class" } - ] - } - ] -} -``` +**When to choose which format**: -Use the natural format in the common case. Fall through to internal format only when you already have verified file/line pins — otherwise you'll bypass auto-grounding and the server can't map decisions to code on its own. +- **Internal format, v0.4.23+ default.** You resolved `code_regions` via Step 2. Ingest with explicit pins. The ledger is a trustworthy drift anchor — editing those pinned files fires real drift alarms; editing unrelated files fires nothing. This is the posture we want for real branches. +- **Natural format + `search_hint`, fallback.** The decision is abstract ("ship by Q3," "SOC2-compliant session storage") or points at code that doesn't exist yet. Server BM25 tries with the widened query; if it produces zero hits the intent stays ungrounded (honest). If BM25 produces a false-positive binding, you'll catch it at the first `bicameral.doctor` or via a pending_compliance_check verdict. +- **Natural format WITHOUT `search_hint`, legacy.** Works, but this is how the 2026-04-20 Accountable dispatcher ingest ended up with "all dispatch functions" bound to `use-toast.ts:dispatch`. You almost always want at least the hint. ### 4. Report results diff --git a/tests/test_v0423_search_hint.py b/tests/test_v0423_search_hint.py new file mode 100644 index 00000000..b8d0146d --- /dev/null +++ b/tests/test_v0423_search_hint.py @@ -0,0 +1,203 @@ +"""v0.4.23 — ``search_hint`` recall booster regression tests. + +Locks in the caller-LLM → server contract for the new ``search_hint`` +field (Lever 2 of the BM25-vocab-mismatch fix). The hint lets the caller +supply synonyms / likely identifier names that the decision's natural- +language description wouldn't contain literally, so server-side BM25 +fallback grounding has a fighting chance when the caller didn't resolve +explicit ``code_regions``. + +Guards three surfaces: + +1. **Propagation** — ``search_hint`` on a natural-format ``IngestDecision`` + survives ``_normalize_payload`` and lands on the resulting mapping. +2. **Query expansion** — ``ground_mappings`` (auto-ground) uses + ``description + search_hint`` as the BM25 input when the field is set, + and falls back to bare ``description`` when it's absent. +3. **Backward compatibility** — ingesting without ``search_hint`` produces + identical behavior to pre-v0.4.23 (additive, no default change). +""" +from __future__ import annotations + +from unittest.mock import MagicMock + +from handlers.ingest import _normalize_payload + + +# ── _normalize_payload propagation ─────────────────────────────────── + + +def test_search_hint_propagates_from_natural_decision_to_mapping(): + """The natural-format ``IngestDecision.search_hint`` must end up on + the resulting mapping so the server-side grounding path can read it. + """ + out = _normalize_payload({ + "decisions": [ + { + "description": "All email dispatchers filter via single source-of-truth check", + "search_hint": "dispatchReminders dispatchInterventions resolveMemberStatus isActiveSubscriber", + } + ], + }) + mappings = out.get("mappings", []) + assert len(mappings) == 1 + assert mappings[0]["search_hint"].startswith("dispatchReminders"), ( + f"search_hint should survive normalization, got {mappings[0].get('search_hint')!r}" + ) + + +def test_search_hint_defaults_to_empty_when_omitted(): + """Omitting ``search_hint`` must produce the same mapping shape pre- and + post-v0.4.23 (backward compat).""" + out = _normalize_payload({ + "decisions": [{"description": "Cache user sessions in Redis"}], + }) + mappings = out.get("mappings", []) + assert len(mappings) == 1 + assert mappings[0].get("search_hint", "") == "" + + +def test_search_hint_never_pollutes_span_text(): + """``search_hint`` is query-only metadata. It must NOT leak into + ``span.text`` — that field surfaces verbatim in briefs and status + responses, so polluting it would show synonyms to human reviewers. + """ + out = _normalize_payload({ + "decisions": [ + { + "description": "Apply 10% discount on orders over $100", + "search_hint": "calculateDiscount PricingService applyDiscount", + } + ], + }) + mappings = out.get("mappings", []) + assert len(mappings) == 1 + assert "calculateDiscount" not in mappings[0]["span"]["text"], ( + f"search_hint leaked into span.text: {mappings[0]['span']['text']!r}" + ) + assert "calculateDiscount" not in mappings[0]["intent"], ( + f"search_hint leaked into intent: {mappings[0]['intent']!r}" + ) + + +def test_search_hint_passthrough_on_internal_format(): + """The internal format (pre-built ``mappings`` list) must pass + ``search_hint`` through unchanged. The caller may set it even when + providing explicit ``code_regions`` as a safety net for future + re-grounding sweeps. + """ + out = _normalize_payload({ + "mappings": [ + { + "intent": "Use Stripe for checkout", + "span": {"text": "...", "source_type": "transcript"}, + "symbols": ["StripeClient"], + "code_regions": [], + "search_hint": "StripeClient checkout payment_intent webhook", + } + ], + }) + mappings = out.get("mappings", []) + assert len(mappings) == 1 + assert mappings[0]["search_hint"].startswith("StripeClient") + + +# ── BM25 query construction in ground_mappings ─────────────────────── + + +def _call_ground_mappings_with_stub(search_code_spy, mapping: dict) -> None: + """Invoke ``ground_mappings`` with a stubbed search_code to capture + the BM25 query string without needing a real code index. + """ + from adapters.code_locator import RealCodeLocatorAdapter + + adapter = RealCodeLocatorAdapter.__new__(RealCodeLocatorAdapter) + # Minimal attributes ground_mappings touches + adapter._initialized = True + adapter._db = MagicMock() + # Stub the search tool so the FC-1 token count guard passes. + adapter._search_tool = MagicMock() + adapter._search_tool.bm25.count_corpus_tokens = MagicMock(return_value=10) + adapter.search_code = search_code_spy + # Stub coverage-loop tiers — set to zero matches so we exit without + # touching the graph (the spy captures the query before that). + adapter._COVERAGE_TIERS = [(5, 0.8, 3)] + adapter._ground_single = MagicMock(return_value=[]) + + adapter.ground_mappings([mapping]) + + +def test_ground_mappings_concatenates_search_hint_into_bm25_query(): + """When ``search_hint`` is present, the BM25 query is + ``description + " " + search_hint`` — wider recall on vocab- + mismatched decisions. + """ + captured: dict[str, str] = {} + + def spy(query: str) -> list: + captured["query"] = query + return [] + + _call_ground_mappings_with_stub(spy, { + "intent": "All email dispatchers filter via single source-of-truth check", + "search_hint": "dispatchReminders dispatchInterventions resolveMemberStatus", + "code_regions": [], + }) + + assert "All email dispatchers" in captured["query"] + assert "dispatchReminders" in captured["query"] + assert "resolveMemberStatus" in captured["query"] + + +def test_ground_mappings_uses_bare_description_when_no_hint(): + """No ``search_hint`` → BM25 query is the raw description (backward + compatible with pre-v0.4.23).""" + captured: dict[str, str] = {} + + def spy(query: str) -> list: + captured["query"] = query + return [] + + _call_ground_mappings_with_stub(spy, { + "intent": "Cache user sessions in Redis", + "code_regions": [], + }) + + assert captured["query"] == "Cache user sessions in Redis" + + +def test_ground_mappings_ignores_empty_string_search_hint(): + """``search_hint = ""`` is equivalent to omitting the field — + no query mutation.""" + captured: dict[str, str] = {} + + def spy(query: str) -> list: + captured["query"] = query + return [] + + _call_ground_mappings_with_stub(spy, { + "intent": "Apply 10% discount on orders over $100", + "search_hint": "", + "code_regions": [], + }) + + assert captured["query"] == "Apply 10% discount on orders over $100" + + +def test_ground_mappings_skips_grounding_when_code_regions_already_present(): + """If the caller already resolved ``code_regions`` (Lever 1 happy path), + BM25 doesn't run at all — ``search_code`` is never called, so neither + query composition nor the FC-1 guard matter. + """ + spy = MagicMock() + + _call_ground_mappings_with_stub(spy, { + "intent": "Use Stripe for checkout", + "search_hint": "StripeClient checkout", + "code_regions": [ + {"file_path": "src/checkout.ts", "symbol": "StripeClient", + "start_line": 1, "end_line": 50, "type": "class"} + ], + }) + + spy.assert_not_called()