From 65cf5cb059e4a046d947549a9dcd7a212832f460 Mon Sep 17 00:00:00 2001 From: jinhongkuan Date: Fri, 15 May 2026 20:45:44 -0700 Subject: [PATCH] fix(server): move code-locator init off MCP stdio handshake (#380) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix, serve_stdio awaited get_code_locator().initialize() inline before opening the MCP stdio transport. On a 150MB+ symbol-index DB the cold path took ~45s (sqlite-vec open + tree-sitter load + BM25 pickle load), blowing past Claude Code's 30s MCP initialize timeout on real-world repos — the server "started" but the JSON-RPC handshake never landed and the client gave up. Fix: - ``RealCodeLocatorAdapter.initialize_in_background()`` — schedules ``_ensure_initialized`` in the default executor via an asyncio Task, returns immediately. A done-callback prints the bare error to stderr on failure so the operator still sees the actionable "Run: python -m code_locator index " hint that #243 wrote. - ``_ensure_initialized`` now serializes its body via a threading.Lock. Sync callers from worker threads (the ``asyncio.to_thread(ctx.code_graph., ...)`` pattern every tool handler already uses) block on the lock until the background Task finishes, then see the post-init state and proceed. No callsite needs to know about the background Task. - ``_run_init_body`` extracted from ``_ensure_initialized`` so tests can monkey-patch the slow body without bypassing the lock/state machine — the lock + Task glue is what's under test. - ``wait_until_ready()`` — optional async gate for callers that want to explicitly await readiness from an async context and surface a structured error to the MCP client on failure. - ``server.py:serve_stdio`` — replaces ``await get_code_locator().initialize()`` with ``get_code_locator().initialize_in_background()`` (synchronous, no await). Stderr message rewritten to reflect the new contract. Trade-off: #243's "server refuses to boot when index is empty" becomes "first code-locator tool call fails loudly when index is empty." Operator still sees the failure on stderr at boot via the done-callback. The fail-loud contract from #243 phase-2 signoff Q3 is preserved, just relocated from boot-time to first-tool-call-time. Measured: JSON-RPC ``initialize`` reply now lands in ~16ms on this repo's own 150MB code-graph.db (was ~45s). Closes #380 Co-Authored-By: Claude Opus 4.7 (1M context) --- adapters/code_locator.py | 154 ++++++++++++++-- server.py | 51 +++--- tests/test_codelocator_background_init.py | 206 ++++++++++++++++++++++ 3 files changed, 367 insertions(+), 44 deletions(-) create mode 100644 tests/test_codelocator_background_init.py diff --git a/adapters/code_locator.py b/adapters/code_locator.py index ad8ee167..5821d051 100644 --- a/adapters/code_locator.py +++ b/adapters/code_locator.py @@ -11,6 +11,8 @@ import asyncio import logging import os +import sys +import threading from pathlib import Path from code_locator_runtime import ( @@ -77,12 +79,43 @@ def __init__(self, repo_path: str = ".") -> None: self._initialized = False self._validate_tool = None self._neighbors_tool = None + # #380 — lock makes ``_ensure_initialized`` safe to call concurrently + # from the background-init asyncio Task (running in the default + # executor thread pool) AND from worker threads spawned by tool + # handlers via ``asyncio.to_thread(ctx.code_graph.validate_symbols, ...)``. + # First holder runs init; everyone else blocks until it finishes, + # then sees ``self._initialized = True`` and returns immediately. + self._init_lock = threading.Lock() + # #380 — handle to the background init Task (set by + # ``initialize_in_background``). ``wait_until_ready`` awaits it + # without re-running init; failure is re-raised to the caller so + # the fail-loud contract from #243 is preserved (relocated from + # boot-time to first-tool-call-time). + self._init_task: asyncio.Task | None = None def _ensure_initialized(self) -> None: - """Lazy init of SymbolDB, config, and tool instances.""" + """Lazy init of SymbolDB, config, and tool instances. + + Thread-safe (#380): the body is serialized via ``self._init_lock`` + so a sync caller on a worker thread (from ``asyncio.to_thread``) + will block on the background init Task instead of racing it. + Whichever thread acquires the lock first runs the init body; the + loser sees ``self._initialized`` True and returns. + """ if self._initialized: return + with self._init_lock: + # Re-check after lock acquire — another thread may have + # finished init while we were waiting for the lock. + if self._initialized: + return + self._run_init_body() + + def _run_init_body(self) -> None: + """Actual init work — extracted so tests can monkey-patch this + without bypassing the lock/state contract in ``_ensure_initialized``. + """ ensure_runtime_env() from code_locator.config import load_config from code_locator.indexing.sqlite_store import SymbolDB @@ -106,25 +139,118 @@ def _ensure_initialized(self) -> None: self._initialized = True async def initialize(self) -> None: - """Async wrapper around ``_ensure_initialized()`` for the - ``server.py:serve_stdio`` startup hook (#243 Piece B). - - Runs the sync init in a thread-pool executor so the event loop - stays responsive while sqlite-vec opens + tree-sitter loads — - the cold-init path can take several seconds on larger repos. - - **Fail-loud contract** (per #243 phase-2 signoff Q3): caller MUST - NOT swallow the ``RuntimeError("Code locator index is empty...")`` - — the server should refuse to start when index init fails rather - than silently degrade every preflight call. Lazy init via - ``_ensure_initialized`` remains for test contexts where mock - adapters bypass this path. + """Async wrapper around ``_ensure_initialized()`` — awaits the + sync init in a thread-pool executor so the event loop stays + responsive. + + Pre-#380 this was the only path ``server.py:serve_stdio`` used, + and it was awaited inline before opening the MCP stdio + transport — every cold boot paid the index-load cost (sqlite-vec + + tree-sitter + BM25 pickle, ~45s on a 150MB graph) BEFORE the + ``initialize`` JSON-RPC reply could land, which exceeded Claude + Code's 30s MCP startup timeout on real-world repos. + + Post-#380 the startup hook calls ``initialize_in_background()`` + instead, which schedules the same work but returns immediately. + This method remains for tests + lazy-init paths that genuinely + want a synchronous wait. """ if self._initialized: return loop = asyncio.get_running_loop() await loop.run_in_executor(None, self._ensure_initialized) + def initialize_in_background(self) -> None: + """Schedule index init as a background asyncio Task and return + immediately (#380). + + Called from ``server.py:serve_stdio`` AFTER the dashboard sidecar + starts but BEFORE ``stdio_server()`` opens the MCP transport, so + the JSON-RPC ``initialize`` reply lands inside Claude Code's 30s + startup timeout regardless of how large the symbol index is. + + Concurrency model (#380): + - The Task runs ``_ensure_initialized`` in the default executor + (thread pool), so the event loop stays free to serve protocol + traffic. + - Tool handlers reach the index via ``asyncio.to_thread( + ctx.code_graph.validate_symbols, ...)`` — those worker threads + call ``_ensure_initialized`` synchronously, which lock-blocks + on the background Task. First tool call (typically preflight) + eats the latency that boot used to. + - If the background Task raises, the exception is captured by + the Task; a ``done_callback`` logs the bare error to stderr so + operators see the failure even before any tool call lands. + The next ``_ensure_initialized`` call re-runs init and raises + the same error to its caller — preserving #243's fail-loud + contract (relocated, not removed). + + Idempotent: subsequent calls before the Task completes are + no-ops; calls after a successful init are no-ops; calls after a + failed init schedule a fresh retry attempt. + """ + if self._initialized: + return + if self._init_task is not None and not self._init_task.done(): + return + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop — caller is in a sync context (e.g., tests + # that import the adapter without running an event loop). + # Fall back to a synchronous init so the caller's next + # public-method call works. + self._ensure_initialized() + return + + async def _run_in_executor() -> None: + # ``loop.run_in_executor`` returns a Future, not a coroutine, so + # we wrap it so ``loop.create_task`` (which requires a coroutine) + # accepts it. The awaited Future propagates the executor-thread + # exception back into the Task, which the done_callback then + # logs to stderr. + await loop.run_in_executor(None, self._ensure_initialized) + + self._init_task = loop.create_task(_run_in_executor()) + + def _log_failure(task: asyncio.Task) -> None: + try: + exc = task.exception() + except asyncio.CancelledError: + return + if exc is not None: + # Match the boot-time stderr format the operator was + # already trained to look for. Tool calls will surface + # the same error to the client when they hit the lock. + print( + f"[code_locator] background init FAILED (#380) — first tool " + f"call will surface this to the client: {exc}\n" + "Run: python -m code_locator index ", + file=sys.stderr, + ) + + self._init_task.add_done_callback(_log_failure) + + async def wait_until_ready(self) -> None: + """Await pending background init, raising on failure (#380). + + Optional explicit gate for callers that want to *await* readiness + from an async context rather than rely on sync ``_ensure_initialized`` + blocking inside ``asyncio.to_thread``. Re-raises the original + ``RuntimeError`` so a code-locator tool dispatcher can return a + structured error response to the MCP client. + """ + if self._initialized: + return + if self._init_task is not None: + # Awaiting a done Task re-raises its exception; awaiting a + # pending Task waits for completion first. Either way the + # caller gets fail-loud semantics. + await self._init_task + return + await self.initialize() + def validate_symbols(self, candidates: list[str]) -> list[dict]: """Fuzzy-match candidate symbol names against the codebase index.""" self._ensure_initialized() diff --git a/server.py b/server.py index 759fe9f2..94c5b97d 100644 --- a/server.py +++ b/server.py @@ -1376,37 +1376,28 @@ async def serve_stdio() -> None: dashboard_srv = get_dashboard_server() await dashboard_srv.start(ctx_factory=BicameralContext.from_env) - # #243 Piece B — eager symbol-index initialization. Pre-fix, init - # was lazy (first tool call paid the cost AND could race the - # index check). Post-fix, the server refuses to boot when the - # index is broken — silent degradation is the failure mode #243 - # is fixing. Production deployments that don't have an indexed - # repo at boot SHOULD see a clear startup error and run - # ``python -m code_locator index `` rather than have - # every preflight call silently fall back. + # #380 — symbol-index init runs in the background so the JSON-RPC + # ``initialize`` reply lands inside Claude Code's 30s MCP startup + # timeout. Pre-#380 this was an inline ``await initialize()`` per + # #243 Piece B; on a 150MB+ code-graph.db it took ~45s and every + # client disconnected before the handshake completed. # - # Fail-loud per #243 phase-2 signoff Q3 — do NOT catch the - # `RuntimeError("Code locator index is empty...")`. Let it - # propagate; the outer try/finally below still runs the - # SERVER_SHUTDOWN audit emit so the operator gets a clean event. - try: - from adapters.code_locator import get_code_locator - - await get_code_locator().initialize() - print( - "[serve_stdio] code-locator index initialized at startup (#243)", - file=sys.stderr, - ) - except RuntimeError as exc: - # Re-raise to fail boot. The audit log records SERVER_SHUTDOWN - # in the outer finally; operator sees the bare RuntimeError on - # stderr with the actionable index-missing message. - print( - f"[serve_stdio] code-locator init FAILED at startup (#243) — refusing to boot: {exc}\n" - "Run: python -m code_locator index ", - file=sys.stderr, - ) - raise + # The fail-loud contract from #243 phase-2 signoff Q3 is preserved + # but relocated: a background-init failure is logged to stderr by + # the adapter's done-callback (operator sees it immediately), and + # the first code-locator tool call surfaces the same error to the + # MCP client because ``_ensure_initialized`` re-raises through the + # lock. Trade-off: "server refuses to boot" → "first tool call + # fails loudly." The operator still gets the actionable + # `python -m code_locator index ` hint. + from adapters.code_locator import get_code_locator + + get_code_locator().initialize_in_background() + print( + "[serve_stdio] code-locator init scheduled in background (#380); " + "first tool call will block until ready", + file=sys.stderr, + ) # First-boot telemetry consent notice (non-blocking, fires once per # policy_version). Stderr-only here; MCP-channel surfacing happens diff --git a/tests/test_codelocator_background_init.py b/tests/test_codelocator_background_init.py new file mode 100644 index 00000000..d61b7b95 --- /dev/null +++ b/tests/test_codelocator_background_init.py @@ -0,0 +1,206 @@ +"""Background-init lifecycle on ``RealCodeLocatorAdapter`` (#380). + +Pre-#380: ``server.py:serve_stdio`` did ``await get_code_locator().initialize()`` +inline before opening the MCP stdio transport. On a 150MB+ symbol-index DB +the cold path took ~45s, blowing past Claude Code's 30s ``initialize`` +JSON-RPC timeout. The fix moves init off the handshake path — kicked off +as a background asyncio Task, with a threading.Lock making +``_ensure_initialized`` safe to call concurrently from the background +Task AND from worker threads spawned by ``asyncio.to_thread( +ctx.code_graph., ...)``. + +These tests pin the contract: + +1. ``initialize_in_background`` returns immediately (doesn't block on the + slow init body). +2. A concurrent sync ``_ensure_initialized`` call (e.g., from a worker + thread) blocks on the lock until the background init finishes — + honoring the "first tool call eats the latency" trade. +3. ``wait_until_ready`` re-raises a background-init failure to its + async caller (fail-loud contract from #243 phase-2 signoff Q3, + relocated from boot to first call). +4. After a failed background init, ``_ensure_initialized`` is free to + retry (the lock is released on exception, the task slot is reused). +5. Concurrent ``initialize_in_background`` calls produce exactly one + Task (idempotent against re-entry from server.py startup paths). + +Solitary by design — patching ``_ensure_initialized`` is the right +seam because the alternative (a real symbol index that takes a +controllable amount of time) is fragile and slow. The lock + Task +glue is what's under test; the init body is replaced with a +deterministic sleep/flag/raise. +""" + +from __future__ import annotations + +import asyncio +import threading +import time + +import pytest + +from adapters.code_locator import RealCodeLocatorAdapter + + +def _fresh_adapter() -> RealCodeLocatorAdapter: + """Avoid the module-level singleton cache so each test gets a clean state.""" + return RealCodeLocatorAdapter(repo_path=".") + + +@pytest.mark.asyncio +async def test_initialize_in_background_returns_immediately() -> None: + """Scheduling init must not block the event loop on the slow body.""" + adapter = _fresh_adapter() + ready = threading.Event() + release = threading.Event() + + def slow_init(self: RealCodeLocatorAdapter) -> None: + # Signal we entered, then wait until the test releases us. Mirrors a + # real cold-init that takes seconds. + ready.set() + release.wait(timeout=5) + self._initialized = True + + # Monkey-patch via direct method substitution on the instance. + adapter._run_init_body = slow_init.__get__(adapter, RealCodeLocatorAdapter) + + t0 = time.monotonic() + adapter.initialize_in_background() + elapsed = time.monotonic() - t0 + + assert elapsed < 0.2, f"initialize_in_background must return immediately; took {elapsed:.3f}s" + assert adapter._init_task is not None, "background Task must be stored on the adapter" + assert not adapter._init_task.done(), "background Task must still be running" + + # Wait for the executor thread to actually enter the slow body before + # releasing. Use ``asyncio.to_thread`` so the event loop stays free to + # actually schedule the Task we just created. + entered = await asyncio.to_thread(ready.wait, 2) + assert entered, "background Task didn't reach the init body" + release.set() + await adapter._init_task + assert adapter._initialized is True + + +@pytest.mark.asyncio +async def test_sync_caller_blocks_on_background_init_via_lock() -> None: + """First tool-call thread blocks on the lock until background init lands.""" + adapter = _fresh_adapter() + init_entered = threading.Event() + release = threading.Event() + call_log: list[str] = [] + + def slow_init(self: RealCodeLocatorAdapter) -> None: + call_log.append("init-start") + init_entered.set() + release.wait(timeout=5) + call_log.append("init-end") + self._initialized = True + + adapter._run_init_body = slow_init.__get__(adapter, RealCodeLocatorAdapter) + + # Kick off background init; wait for it to actually enter the body so + # the next call genuinely contends for the lock. ``asyncio.to_thread`` + # keeps the event loop free to schedule the Task we just created. + adapter.initialize_in_background() + entered = await asyncio.to_thread(init_entered.wait, 2) + assert entered + + # Simulate a tool-handler worker thread reaching the adapter via + # ``asyncio.to_thread(adapter._ensure_initialized)``. Without the lock + # this would race the background init body. + second_call_finished = threading.Event() + + def second_caller() -> None: + adapter._ensure_initialized() + call_log.append("second-call-return") + second_call_finished.set() + + t = threading.Thread(target=second_caller, daemon=True) + t.start() + # Give the second caller a beat to try to acquire the lock. + await asyncio.sleep(0.1) + assert not second_call_finished.is_set(), ( + "second caller returned before background init finished — lock not held" + ) + + release.set() + await adapter._init_task + finished = await asyncio.to_thread(second_call_finished.wait, 2) + assert finished + + # The slow init ran exactly once, and the second caller observed the + # post-init state without re-running the body. + assert call_log == ["init-start", "init-end", "second-call-return"] + + +@pytest.mark.asyncio +async def test_wait_until_ready_reraises_background_init_failure() -> None: + """Fail-loud contract from #243 phase-2 (relocated to first-call time).""" + adapter = _fresh_adapter() + + def boom(self: RealCodeLocatorAdapter) -> None: + raise RuntimeError( + "Code locator index is empty. Run: python -m code_locator index " + ) + + adapter._run_init_body = boom.__get__(adapter, RealCodeLocatorAdapter) + + adapter.initialize_in_background() + with pytest.raises(RuntimeError, match="Code locator index is empty"): + await adapter.wait_until_ready() + + +@pytest.mark.asyncio +async def test_failed_background_init_allows_retry() -> None: + """After a failed init, the next call may try again — the slot isn't poisoned.""" + adapter = _fresh_adapter() + attempts = {"n": 0} + + def flaky_init(self: RealCodeLocatorAdapter) -> None: + attempts["n"] += 1 + if attempts["n"] == 1: + raise RuntimeError("transient") + self._initialized = True + + adapter._run_init_body = flaky_init.__get__(adapter, RealCodeLocatorAdapter) + + adapter.initialize_in_background() + with pytest.raises(RuntimeError, match="transient"): + await adapter.wait_until_ready() + assert adapter._initialized is False + + # Second kickoff schedules a fresh Task; the previous one is done. + adapter.initialize_in_background() + await adapter.wait_until_ready() + assert adapter._initialized is True + assert attempts["n"] == 2 + + +@pytest.mark.asyncio +async def test_initialize_in_background_is_idempotent_while_running() -> None: + """Repeated kickoffs while a Task is in flight reuse the existing Task.""" + adapter = _fresh_adapter() + release = threading.Event() + + def slow_init(self: RealCodeLocatorAdapter) -> None: + release.wait(timeout=5) + self._initialized = True + + adapter._run_init_body = slow_init.__get__(adapter, RealCodeLocatorAdapter) + + adapter.initialize_in_background() + first_task = adapter._init_task + adapter.initialize_in_background() + adapter.initialize_in_background() + assert adapter._init_task is first_task, ( + "subsequent initialize_in_background calls must not replace the in-flight Task" + ) + + release.set() + await first_task + assert adapter._initialized is True + + # Post-success kickoff is a no-op — Task slot stays as-is. + adapter.initialize_in_background() + assert adapter._init_task is first_task