Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@
All notable changes to bicameral-mcp are tracked here. Format loosely follows
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## 0.4.7 — 2026-04-14 — FC-3 Vocab Cache Similarity Gate

Fixes witnessed cross-contamination where the vocab cache reused an unrelated
intent's code regions — and, worse, labeled them with the original intent's
`purpose` text. Observed live on Accountable 2026-04-14: a "Stripe payment-link
fallback" decision inherited 8 bogus regions from an earlier "weekly bulletin
page" ingest because both descriptions shared incidental tokens.

### Fixed

- **FC-3a — Vocab cache BM25 cross-match.** `lookup_vocab_cache` now returns
`(symbols, matched_query_text)`. `handle_ingest` computes the Jaccard
similarity between the incoming description and the matched query text over
non-stopword tokens of 4+ characters, and discards hits scoring below 0.5,
forcing a fall-through to fresh grounding via `ground_mappings`. The gate is
deterministic and keeps LLMs out of the critical indexing path (per
`git-for-specs.md`).
- **FC-3b — Stale `purpose` field on reused regions.** `_validate_cached_regions`
now accepts `current_description` and rewrites every returned region's
`purpose` field so reused regions carry the *current* intent's text, not the
cached one's.

### Migration

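No manual action required for existing data. `v0.4.6 → v0.4.7` is a read-path
fix spanning the ingest handler and the ledger lookup: `lookup_vocab_cache` now
returns a `(symbols, matched_query_text)` tuple instead of a bare list, so any
out-of-tree caller must unpack it. Existing vocab_cache rows remain valid; the
gate rejects false positives on read.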

## 0.4.6 — 2026-04-14 — Adoption Floor (Trust + First Wow)

Five initiatives: FC-1 BM25 degeneracy guard, FC-2 multi-region grounding
Expand Down
11 changes: 9 additions & 2 deletions events/team_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,15 @@ async def upsert_source_cursor(
error=error,
)

async def lookup_vocab_cache(self, query_text: str, repo: str) -> list[dict]:
"""Vocab cache is local bookkeeping — no event emitted."""
async def lookup_vocab_cache(
self, query_text: str, repo: str,
) -> tuple[list[dict], str]:
"""Vocab cache is local bookkeeping — no event emitted.

Returns ``(symbols, matched_query_text)``. The second element is
the ``query_text`` that the top cache hit was originally stored
against — the caller uses it for FC-3 similarity gating.
"""
await self._ensure_ready()
return await self._inner.lookup_vocab_cache(query_text, repo)

Expand Down
96 changes: 90 additions & 6 deletions handlers/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,64 @@ def _normalize_payload(payload: dict) -> dict:
return result


# ── FC-3: vocab cache similarity gate ──────────────────────────────
#
# The vocab cache uses SurrealDB's ``@0@`` BM25 full-text operator to match
# incoming descriptions against stored ``query_text``. Without a similarity
# threshold, two unrelated intents sharing incidental tokens cross-match —
# witnessed live on Accountable 2026-04-14 where a "Stripe payment-link
# fallback" decision inherited 8 bogus regions from an earlier "weekly
# bulletin page" ingest.
#
# The gate below computes Jaccard similarity over lowercased, non-stopword
# ASCII-alphabetic tokens of ≥4 chars (see ``_content_tokens``). Cache hits
# below the threshold are discarded, forcing the caller to fall through to
# fresh grounding (which is already correct, per FC-2).
# Jaccard was chosen over embeddings because:
# 1. Deterministic, no model dependency (git-for-specs.md invariant:
# "no LLM in critical indexing path")
# 2. The downstream ground_mappings pipeline already handles semantic
# variation via BM25+graph fusion — an embedding gate here would
# double-count
# 3. 20 LOC vs 200+ LOC with a new dependency

# Minimum Jaccard score a vocab-cache hit must reach to be reused; hits that
# score lower are discarded so the mapping falls through to fresh grounding.
_VOCAB_SIMILARITY_THRESHOLD = 0.5

# Common English filler words excluded from the similarity comparison.
# NOTE: the three-letter entries ("the", "and", "for", "are", "use") are inert
# for ``_content_tokens``, which only extracts runs of four or more letters.
_VOCAB_STOPWORDS = frozenset(
    """
    the and for that this with are from have
    will when then been also into about should
    must need each they their there which where
    what than some more such only very just
    like make made use used using after before
    over under between through against
    """.split()
)


def _content_tokens(text: str) -> set[str]:
    """Extract the comparison vocabulary of *text* for the similarity gate.

    Every run of four or more ASCII letters is taken as a token and
    lowercased; tokens listed in ``_VOCAB_STOPWORDS`` are dropped. Digits,
    underscores, punctuation and non-ASCII characters act as separators.
    A falsy *text* (``None`` or ``""``) yields an empty set.
    """
    import re

    tokens: set[str] = set()
    for word in re.findall(r"[A-Za-z]{4,}", text or ""):
        lowered = word.lower()
        if lowered in _VOCAB_STOPWORDS:
            continue
        tokens.add(lowered)
    return tokens


def _jaccard_similarity(a: str, b: str) -> float:
    """Score how much content vocabulary two strings share.

    Both strings are reduced to token sets by ``_content_tokens`` and the
    Jaccard coefficient ``|A ∩ B| / |A ∪ B|`` is returned.

    Returns 0.0 when either token set is empty — including when *both* are
    empty — so a string with no usable tokens never counts as a match.
    Returns 1.0 when the two non-empty token sets are identical.
    """
    left, right = _content_tokens(a), _content_tokens(b)
    if not (left and right):
        return 0.0
    shared = len(left & right)
    # |A ∪ B| = |A| + |B| - |A ∩ B|; non-zero here because both sets are non-empty.
    combined = len(left) + len(right) - shared
    return shared / combined


def _validate_cached_regions(
regions: list[dict], code_graph,
regions: list[dict],
code_graph,
current_description: str = "",
) -> list[dict]:
"""Check cached code_regions against the live symbol index.

Expand All @@ -95,6 +151,13 @@ def _validate_cached_regions(

When lookup_by_name returns multiple rows, prefers the row matching
the cached region's file_path to avoid picking an unrelated symbol.

v0.4.7 (FC-3): when ``current_description`` is non-empty, the returned
region's ``purpose`` field is rewritten to it. Previously this function
preserved the cached region's stale ``purpose`` (= the ORIGINAL
intent's description), cross-wiring intents so one decision's regions
carried another decision's label. Witnessed live on Accountable
2026-04-14.
"""
try:
code_graph._ensure_initialized()
Expand All @@ -120,13 +183,16 @@ def _validate_cached_regions(
(r for r in rows if r["file_path"] == cached_file),
rows[0],
)
valid.append({
entry = {
**region,
"file_path": row["file_path"],
"start_line": row["start_line"],
"end_line": row["end_line"],
"type": row["type"],
})
}
if current_description:
entry["purpose"] = current_description # FC-3: rewrite stale purpose
valid.append(entry)
return valid


Expand Down Expand Up @@ -155,6 +221,7 @@ async def handle_ingest(
# Runs before ground_mappings — a hit skips the full BM25 pipeline.
mappings_to_ground = payload.get("mappings") or []
cache_hits = 0
cache_similarity_rejections = 0
pre_grounded: set[str] = set()
for mapping in mappings_to_ground:
if mapping.get("code_regions"):
Expand All @@ -167,19 +234,36 @@ async def handle_ingest(
if not description:
continue
try:
cached_symbols = await ledger.lookup_vocab_cache(description, repo)
cached_symbols, matched_query_text = await ledger.lookup_vocab_cache(description, repo)
if cached_symbols:
# FC-3 similarity gate: the vocab cache lookup uses SurrealDB's
# BM25 @0@ operator, which is too loose on its own. Two unrelated
# intents sharing incidental tokens can cross-match. Compute
# Jaccard similarity between the incoming description and the
# matched query_text, and reject the cache hit if it's below
# threshold. Falls through to fresh grounding via ground_mappings.
similarity = _jaccard_similarity(description, matched_query_text)
if similarity < _VOCAB_SIMILARITY_THRESHOLD:
cache_similarity_rejections += 1
logger.info(
"[ingest] vocab cache rejected (similarity %.2f < %.2f): "
"current=%r matched=%r",
similarity, _VOCAB_SIMILARITY_THRESHOLD,
description[:60], matched_query_text[:60],
)
continue
valid_regions = _validate_cached_regions(
cached_symbols, ctx.code_graph,
current_description=description, # FC-3: rewrite purpose
)
if valid_regions:
mapping["code_regions"] = valid_regions
cache_hits += 1
pre_grounded.add(description)
logger.info(
"[ingest] vocab cache hit for '%s' (%d/%d regions valid)",
"[ingest] vocab cache hit for '%s' (%d/%d regions valid, sim=%.2f)",
description[:60],
len(valid_regions), len(cached_symbols),
len(valid_regions), len(cached_symbols), similarity,
)
else:
logger.debug(
Expand Down
9 changes: 7 additions & 2 deletions ledger/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,13 @@ async def lookup_vocab_cache(
self,
query_text: str,
repo: str,
) -> list[dict]:
"""Check vocab_cache for cached grounding results."""
) -> tuple[list[dict], str]:
"""Check vocab_cache for cached grounding results.

Returns ``(symbols, matched_query_text)``. The matched query text
is needed by callers to run the FC-3 similarity gate before
deciding whether to reuse the cached symbols.
"""
await self._ensure_connected()
return await lookup_vocab_cache(self._client, query_text, repo)

Expand Down
17 changes: 12 additions & 5 deletions ledger/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,20 @@ async def lookup_vocab_cache(
query_text: str,
repo: str,
max_results: int = 3,
) -> list[dict]:
) -> tuple[list[dict], str]:
"""BM25 lookup on vocab_cache for cached grounding results.

Returns the ``symbols`` array from the top matching cache entry
(a list of code_region-shaped dicts). Empty list on miss.
Returns a 2-tuple: ``(symbols, matched_query_text)``.
- ``symbols`` is the cached code_region-shaped dict list from the
top matching cache entry, or ``[]`` on miss.
- ``matched_query_text`` is the ``query_text`` that the top hit was
originally stored against. The caller uses this to compute a
similarity gate (FC-3 fix) before deciding whether to reuse the
cached symbols — BM25's ``@0@`` operator is too loose on its
own and cross-contaminates unrelated intents.

On hit, increments hit_count and refreshes last_hit for LRU tracking.
On miss, returns ``([], "")``.
"""
rows = await client.query(
"""
Expand All @@ -241,7 +248,7 @@ async def lookup_vocab_cache(
{"query": query_text, "repo": repo, "max_results": max_results},
)
if not rows:
return []
return [], ""

top = rows[0]
top_id = top.get("id")
Expand All @@ -250,7 +257,7 @@ async def lookup_vocab_cache(
f"UPDATE {top_id} SET hit_count += 1, last_hit = time::now()",
)

return top.get("symbols") or []
return top.get("symbols") or [], str(top.get("query_text") or "")


async def upsert_vocab_cache(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "bicameral-mcp"
version = "0.4.6"
version = "0.4.7"
description = "Decision ledger MCP server — ingests meeting transcripts, maps decisions to code, tracks drift"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
Loading