diff --git a/CHANGELOG.md b/CHANGELOG.md index 6964a976..9f582cdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ All notable changes to bicameral-mcp are tracked here. Format loosely follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## v0.15.1 — hotfix: route `ledger_sync` deserialization warnings to a wipe-and-replay recovery path (#301) + +Hotfix for [#301](https://github.com/BicameralAI/bicameral-mcp/issues/301). v0.15.0 already added a row-level deserialization probe to `bicameral.diagnose` ([`cli/_diagnose_gather.py::_probe_row_deserialization`](cli/_diagnose_gather.py)), but the probe's findings stopped at the `suggestions` list — `_classify_recovery` in `handlers/diagnose.py` still inspected only `schema_meta.version`, so an agent that ran `diagnose` after a `link_commit` failure would see `recovery_path: "clean"` and `next_action: "No remediation needed."` while the ledger was actually unreadable. + +This release closes the loop end-to-end: + +- **`LedgerDeserializationError`** (subclass of `LedgerError`) is raised from `ledger.client.query` / `ledger.client.execute` whenever the SurrealDB SDK returns `Invalid revision \`N\` for type \`Value\`` or a `deserialization error` wrapper. The exception message embeds the recovery command, so the agent sees the wipe-and-replay instruction inside the MCP error envelope without needing a second `diagnose` round-trip. +- **`handlers/diagnose.py::_classify_recovery`** now consults `Diagnosis.row_probe_warnings` *before* the schema-version checks. Non-empty warnings route to `reset_rebuild` (when `.bicameral/events/*.jsonl` is present) or `reset_destructive` (no events on disk) with a `next_action` that quotes the exact `bicameral_reset(wipe_mode="ledger", replay_from_events=…, confirm=True)` call. +- **`handlers/sync_middleware.py::ensure_ledger_synced`** re-raises `LedgerDeserializationError` instead of swallowing it at DEBUG. The broad `except Exception` is still in place for transient catch-up failures (the original "best-effort" contract); only deserialization errors break out, because they're the one class of failure the agent must surface to the user. + +### Fixed + +- **#301** — `bicameral.link_commit` no longer fails with a bare `LedgerError("SurrealDB rejected query: ...")` when `ledger_sync` rows can't be deserialized; the new exception class names the recovery path. `bicameral.diagnose` now classifies the same failure mode under `reset_rebuild` / `reset_destructive` so the agent's next_action is actionable. + +### Notes for v0.14.x → v0.15.x upgraders hit by #301 + +The SurrealDB SDK pin (`surrealdb==2.0.0`) is unchanged between v0.14.x and v0.15.1; upgrading the package does NOT rewrite the persisted SurrealKV record format. An installation that hit the `Invalid revision \`3\` for type \`Value\`` error on v0.14.x will continue to see the same SDK-level deserialization failure on first `link_commit`. What changes in v0.15.1 is the *visibility*: the error now surfaces with an explicit recovery command instead of as a generic `LedgerError`, and `diagnose` correctly routes to a reset path. To recover, run `bicameral_reset(wipe_mode="ledger", replay_from_events=True, confirm=True)` (if `.bicameral/events/*.jsonl` is present) or `bicameral_reset(wipe_mode="ledger", confirm=True)` otherwise. + ## v0.15.0 — PII archive, hard-delete `remove_decision`, schema v17→v24 chain, team-mode foundations Cumulative release draining the dev → main backlog accumulated since v0.14.7. Lands the **#221 PII archive** (operator-erasable PII surface), retires the **soft-delete tombstone model** for `bicameral.remove_decision` (now hard-delete by default), brings the constant-time **`bicameral_meta.decision_revision` counter** (#87) into the preflight dedup path, ships **`bicameral.admin/query`** and **dashboard source view** (#278 Phase 1+3), wires the **`LocalDirectorySourceAdapter`** and **`sync-and-brief`** team-mode flows (#344, #279), and adds the **code-locator singleton + eager startup init** that moves index work off the MCP stdio handshake (#243, #380). diff --git a/RECOMMENDED_VERSION b/RECOMMENDED_VERSION index a5510516..e815b861 100644 --- a/RECOMMENDED_VERSION +++ b/RECOMMENDED_VERSION @@ -1 +1 @@ -0.15.0 +0.15.1 diff --git a/handlers/diagnose.py b/handlers/diagnose.py index 5099d477..3f3a7490 100644 --- a/handlers/diagnose.py +++ b/handlers/diagnose.py @@ -121,8 +121,26 @@ def _classify_recovery(diagnosis) -> tuple[RecoveryPath, str]: exp = diagnosis.schema_version_expected has_events = _events_present(diagnosis.ledger_url) - if rec is not None and rec > exp: + # #301 — row-level deserialization warnings outrank the schema-version + # check. Schema can read clean (`schema_meta` is a tiny row, never the + # one that breaks) while the operational tables hit + # ``Invalid revision `N` for type Value``. The probe in + # cli/_diagnose_gather.py::_probe_row_deserialization catches that; here + # we route the verdict so the agent sees recovery_path=reset_rebuild + # instead of "clean, no remediation needed". + row_warnings: list[str] = list(getattr(diagnosis, "row_probe_warnings", []) or []) + if row_warnings: path: RecoveryPath = "reset_rebuild" if has_events else "reset_destructive" + tables = ", ".join(sorted({w.split(":", 1)[0] for w in row_warnings})) + return path, ( + f"Row-level deserialization warnings on {tables} — likely a " + "SurrealDB embedded-SDK record-format mismatch. Run " + f"`bicameral_reset(wipe_mode='ledger', replay_from_events={has_events}, " + "confirm=True)` to wipe and replay from .bicameral/events/." + ) + + if rec is not None and rec > exp: + path = "reset_rebuild" if has_events else "reset_destructive" return path, ( f"Ledger schema v{rec} is newer than this binary (v{exp}). " f"Upgrade `bicameral-mcp` to a version that understands v{rec}, " diff --git a/handlers/sync_middleware.py b/handlers/sync_middleware.py index b9059875..9aed0d5c 100644 --- a/handlers/sync_middleware.py +++ b/handlers/sync_middleware.py @@ -20,6 +20,8 @@ from datetime import UTC, datetime from typing import TYPE_CHECKING +from ledger.client import LedgerDeserializationError + if TYPE_CHECKING: from contracts import LinkCommitResponse, SessionStartBanner @@ -288,6 +290,14 @@ async def ensure_ledger_synced(ctx) -> LinkCommitResponse | None: _LAST_SYNCED_SHA = live_head logger.debug("[sync_middleware] catch-up ran for %s", live_head[:8]) return result + except LedgerDeserializationError: + # #301 — row-format mismatch on ledger_sync / yields / etc. This is + # the *opposite* of a swallowable error: the ledger is in a state + # the user must repair before any tool will work. Re-raise so the + # MCP transport layer surfaces the exception (with embedded + # RECOVERY_HINT) to the agent instead of silently logging at DEBUG. + logger.warning("[sync_middleware] ledger row-format mismatch — surfacing to agent") + raise except Exception as exc: logger.debug("[sync_middleware] catch-up failed: %s", exc) return None diff --git a/ledger/client.py b/ledger/client.py index 9525e9c3..2bbc1e59 100644 --- a/ledger/client.py +++ b/ledger/client.py @@ -115,6 +115,55 @@ class LedgerError(RuntimeError): """ +class LedgerDeserializationError(LedgerError): + """Raised when SurrealDB's embedded SDK can't decode a row on read (#301). + + The on-disk SurrealKV record header carries a revision number that must + match the surrealdb-py deserializer. A mismatch surfaces as + ``Invalid revision `N` for type `Value```. This is *below* the schema + layer — `init_schema`/`migrate` don't help because the migration code + can't read the row either. The fix is to wipe the affected cursor and + replay from the event log via + ``bicameral_reset(wipe_mode="ledger", replay_from_events=True, confirm=True)``. + + Subclass of ``LedgerError`` so existing ``except LedgerError`` handlers + catch it; callers that surface a recovery hint to the agent match on + ``LedgerDeserializationError`` specifically. + """ + + RECOVERY_HINT = ( + "Row-level deserialization failed — likely a SurrealDB embedded SDK " + "revision mismatch on persisted rows. Run " + "`bicameral_reset(wipe_mode='ledger', replay_from_events=True, " + "confirm=True)` to wipe and replay from .bicameral/events/, or " + "`bicameral-mcp diagnose` for a full report." + ) + + def __init__(self, *, raw: str, sql_prefix: str) -> None: + self.raw = raw + self.sql_prefix = sql_prefix + super().__init__( + f"SurrealDB row deserialization failed: {raw}\n" + f"SQL: {sql_prefix}\n" + f"Recovery: {self.RECOVERY_HINT}" + ) + + +_DESERIALIZATION_SIGNATURES = ("Invalid revision", "deserialization error") + + +def _is_deserialization_error(raw: str) -> bool: + """Return True when ``raw`` looks like a SurrealKV row-format mismatch. + + Matches the two signatures observed in #301 and the related ``yields`` + incident: the SDK's ``Invalid revision `N` for type `T``` and the more + general ``A deserialization error occured`` wrapper. Both come back as + strings the caller sees as ``LedgerError`` today — this helper is the + classification seam. + """ + return any(sig in raw for sig in _DESERIALIZATION_SIGNATURES) + + class LedgerTimeoutError(LedgerError): """Raised when a ledger query exceeds its wallclock timeout budget. @@ -270,8 +319,13 @@ async def query( try: result = await self._run_with_timeout(sql, vars, timeout_class) except SurrealError as exc: + msg = str(exc) + if _is_deserialization_error(msg): + raise LedgerDeserializationError(raw=msg, sql_prefix=sql[:300]) from exc raise LedgerError(f"SurrealDB rejected query: {exc}\nSQL: {sql[:300]}") from exc if isinstance(result, str): + if _is_deserialization_error(result): + raise LedgerDeserializationError(raw=result, sql_prefix=sql[:300]) raise LedgerError(f"SurrealDB rejected query: {result}\nSQL: {sql[:300]}") return _normalize(result) if isinstance(result, list) else [] @@ -297,8 +351,13 @@ async def execute( try: result = await self._run_with_timeout(sql, vars, timeout_class) except SurrealError as exc: + msg = str(exc) + if _is_deserialization_error(msg): + raise LedgerDeserializationError(raw=msg, sql_prefix=sql[:300]) from exc raise LedgerError(f"SurrealDB rejected statement: {exc}\nSQL: {sql[:300]}") from exc if isinstance(result, str): + if _is_deserialization_error(result): + raise LedgerDeserializationError(raw=result, sql_prefix=sql[:300]) raise LedgerError(f"SurrealDB rejected statement: {result}\nSQL: {sql[:300]}") async def execute_many( diff --git a/pyproject.toml b/pyproject.toml index fe43717f..820d7428 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "bicameral-mcp" -version = "0.15.0" +version = "0.15.1" description = "Decision ledger MCP server — ingests meeting transcripts, maps decisions to code, tracks drift" readme = "README.md" requires-python = ">=3.10" diff --git a/tests/test_ledger_sync_deserialization_recovery_301.py b/tests/test_ledger_sync_deserialization_recovery_301.py new file mode 100644 index 00000000..074b46cb --- /dev/null +++ b/tests/test_ledger_sync_deserialization_recovery_301.py @@ -0,0 +1,248 @@ +"""Tests for #301 — ledger_sync row-format deserialization recovery routing. + +Three behaviours under test (sociable where it counts, narrow seam where the +failure mode is a SurrealKV record-revision mismatch that can't be triggered +naturally): + +1. ``ledger.client.query`` / ``ledger.client.execute`` raise + ``LedgerDeserializationError`` (subclass of ``LedgerError``) — not the + generic ``LedgerError`` — when SurrealDB returns a record-format + mismatch. The new exception's message embeds the recovery hint. +2. ``handlers.diagnose._classify_recovery`` returns + ``reset_rebuild`` / ``reset_destructive`` (not ``clean``) when the + ``Diagnosis.row_probe_warnings`` field is non-empty, even if + ``schema_version_recorded == schema_version_expected``. The next_action + quotes ``bicameral_reset(...)``. +3. ``handlers.sync_middleware.ensure_ledger_synced`` re-raises + ``LedgerDeserializationError`` instead of swallowing it at DEBUG, so the + agent sees the recovery hint via the MCP error envelope. +""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import AsyncMock, patch + +import pytest + +try: + from surrealdb import SurrealError +except ImportError: # pragma: no cover — pinned SDK exposes it, fallback mirrors client.py + SurrealError = Exception # type: ignore[assignment,misc] + +from cli.diagnose import Diagnosis +from handlers.diagnose import _classify_recovery +from handlers.sync_middleware import _reset_repo_locks_for_tests, ensure_ledger_synced +from ledger.client import ( + LedgerClient, + LedgerDeserializationError, + LedgerError, + _is_deserialization_error, +) +from ledger.schema import SCHEMA_VERSION, init_schema, migrate + +# ── 1. ledger.client classification ──────────────────────────────────────── + + +def test_is_deserialization_error_matches_invalid_revision(): + assert _is_deserialization_error("Invalid revision `3` for type `Value`") + + +def test_is_deserialization_error_matches_wrapper_text(): + assert _is_deserialization_error("Versioned error: A deserialization error occured: ...") + + +def test_is_deserialization_error_does_not_match_unrelated_errors(): + assert not _is_deserialization_error("UNIQUE constraint violation on idx_x") + assert not _is_deserialization_error("ASSERT failed on field text") + + +def test_ledger_deserialization_error_is_subclass_of_ledger_error(): + # Existing `except LedgerError` handlers must still catch the new class. + err = LedgerDeserializationError(raw="Invalid revision `3`", sql_prefix="SELECT 1") + assert isinstance(err, LedgerError) + + +def test_ledger_deserialization_error_message_embeds_recovery_hint(): + err = LedgerDeserializationError(raw="Invalid revision `3`", sql_prefix="SELECT 1") + msg = str(err) + assert "bicameral_reset" in msg + assert "replay_from_events=True" in msg + assert "Invalid revision `3`" in msg + + +@pytest.mark.asyncio +async def test_query_raises_deserialization_error_when_surrealdb_complains(): + """The classifier triggers on a real LedgerClient.query() path. + + Narrow seam: we patch the surrealdb-py async call so it raises + ``SurrealError("Invalid revision ...")`` — this is the documented failure + mode for SurrealKV record-format drift and cannot be triggered naturally + against ``memory://``. + """ + client = LedgerClient(url="memory://", ns="t301_q", db="ledger_test") + await client.connect() + try: + boom = SurrealError("Invalid revision `3` for type `Value`") + with patch.object(client._db, "query", AsyncMock(side_effect=boom)): + with pytest.raises(LedgerDeserializationError) as ei: + await client.query("SELECT * FROM ledger_sync WHERE repo = 'x' LIMIT 1") + assert "bicameral_reset" in str(ei.value) + assert ei.value.sql_prefix.startswith("SELECT * FROM ledger_sync") + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_query_with_non_deserialization_error_still_raises_plain_ledger_error(): + """Unrelated SurrealErrors must not be reclassified as deserialization.""" + client = LedgerClient(url="memory://", ns="t301_q2", db="ledger_test") + await client.connect() + try: + boom = SurrealError("Some other constraint failure") + with patch.object(client._db, "query", AsyncMock(side_effect=boom)): + with pytest.raises(LedgerError) as ei: + await client.query("SELECT * FROM ledger_sync") + assert not isinstance(ei.value, LedgerDeserializationError) + finally: + await client.close() + + +# ── 2. _classify_recovery routing on row_probe_warnings ──────────────────── + + +def _stub_diagnosis( + *, + row_warnings: list[str], + schema_rec: int | None = None, + schema_exp: int | None = None, + ledger_url: str = "memory://", +) -> Diagnosis: + """Minimal Diagnosis fixture — only the fields _classify_recovery reads. + + Diagnosis is a frozen dataclass; the other fields are populated with + type-correct defaults so the constructor accepts the row. + """ + return Diagnosis( + bicameral_version="0.15.1", + python_version="3.11.0", + platform_str="test", + surrealdb_running="2.0.0", + ledger_url=ledger_url, + ledger_size_bytes=None, + ledger_mtime_iso=None, + schema_version_recorded=schema_rec if schema_rec is not None else SCHEMA_VERSION, + schema_version_expected=schema_exp if schema_exp is not None else SCHEMA_VERSION, + surrealdb_first_write=None, + surrealdb_last_write=None, + last_write_at=None, + drift_status="match", + audit_log_channel="stderr", + table_counts={"decision": 0}, + row_probe_warnings=row_warnings, + recent_events=[], + suggestions=[], + ) + + +def test_classify_recovery_returns_reset_destructive_when_row_warnings_present_no_events(): + d = _stub_diagnosis( + row_warnings=["ledger_sync: SurrealError: Invalid revision `3` for type `Value`"], + ) + path, next_action = _classify_recovery(d) + assert path == "reset_destructive" + assert "bicameral_reset" in next_action + assert "replay_from_events=False" in next_action + assert "ledger_sync" in next_action + + +def test_classify_recovery_returns_reset_rebuild_when_events_present(tmp_path, monkeypatch): + """Same row_warnings, but events/*.jsonl exists next to the ledger db file. + + ``_events_present`` resolves the events dir as ``.parent / events`` + (handlers/diagnose.py:174-184), so the fixture lays the file under + ``/events/`` and points the ledger URL at + ``/ledger.db``. + """ + bicameral_dir = tmp_path / ".bicameral" + events_dir = bicameral_dir / "events" + events_dir.mkdir(parents=True) + (events_dir / "0001.jsonl").write_text("{}\n") + ledger_url = f"surrealkv://{bicameral_dir}/ledger.db" + d = _stub_diagnosis( + row_warnings=["ledger_sync: SurrealError: Invalid revision `3`"], + ledger_url=ledger_url, + ) + path, next_action = _classify_recovery(d) + assert path == "reset_rebuild" + assert "replay_from_events=True" in next_action + + +def test_classify_recovery_row_warnings_outrank_clean_schema(): + """Schema matches exactly but rows are broken → must NOT return 'clean'.""" + d = _stub_diagnosis( + row_warnings=["ledger_sync: SurrealError: Invalid revision `3`"], + schema_rec=SCHEMA_VERSION, + schema_exp=SCHEMA_VERSION, + ) + path, _ = _classify_recovery(d) + assert path != "clean" + + +def test_classify_recovery_no_row_warnings_stays_clean(): + """Regression guard — happy-path schema_rec == schema_exp still returns 'clean'.""" + d = _stub_diagnosis(row_warnings=[]) + path, next_action = _classify_recovery(d) + assert path == "clean" + assert "No remediation needed" in next_action + + +# ── 3. ensure_ledger_synced re-raises instead of swallowing ──────────────── + + +@pytest.mark.asyncio +async def test_ensure_ledger_synced_reraises_deserialization_error(): + """Previously: broad ``except Exception`` swallowed this to DEBUG, masking + the failure. After #301: LedgerDeserializationError surfaces to the + caller so the MCP transport renders the recovery hint to the agent. + """ + _reset_repo_locks_for_tests() + ctx = SimpleNamespace(repo_path=".") + boom = LedgerDeserializationError( + raw="Invalid revision `3` for type `Value`", + sql_prefix="SELECT * FROM ledger_sync", + ) + with ( + patch( + "handlers.link_commit._read_current_head_sha", + return_value="deadbeef" * 5, + ), + patch( + "handlers.link_commit.handle_link_commit", + AsyncMock(side_effect=boom), + ), + ): + with pytest.raises(LedgerDeserializationError) as ei: + await ensure_ledger_synced(ctx) + assert "bicameral_reset" in str(ei.value) + + +@pytest.mark.asyncio +async def test_ensure_ledger_synced_still_swallows_unrelated_errors(): + """Non-deserialization failures stay isolated — the prior broad-catch + contract still applies to keep the catch-up best-effort.""" + _reset_repo_locks_for_tests() + ctx = SimpleNamespace(repo_path=".") + with ( + patch( + "handlers.link_commit._read_current_head_sha", + return_value="cafebabe" * 5, + ), + patch( + "handlers.link_commit.handle_link_commit", + AsyncMock(side_effect=RuntimeError("transient io")), + ), + ): + # Should NOT raise — broad exception is still swallowed. + result = await ensure_ledger_synced(ctx) + assert result is None