From ba35e4b41e97ceed9f40c45aa8a56e6d62cde776 Mon Sep 17 00:00:00 2001 From: Silong Tan Date: Fri, 17 Apr 2026 13:36:09 -0400 Subject: [PATCH] fix: extract compound tokens (snake_case, dotted.names) for grounding recall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tokenizer in _ground_single stripped underscores and dots, destroying compound identifiers like decrease_stock and transaction.atomic before they reached validate_symbols. Extract compounds first via regex, then append word tokens as fallback. Aggregate recall: 9.3% → 13.9%, MRR@3 held at 0.59. Co-Authored-By: Claude Opus 4.6 (1M context) --- adapters/code_locator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/adapters/code_locator.py b/adapters/code_locator.py index 7f69f4ff..fdbbd20a 100644 --- a/adapters/code_locator.py +++ b/adapters/code_locator.py @@ -157,6 +157,9 @@ async def extract_symbols(self, file_path: str) -> list[dict]: "like", "make", "made", "use", "used", "using", "after", "before", }) + _COMPOUND_RE = re.compile(r"[A-Za-z]\w*(?:[_.][A-Za-z]\w*)+") + + def _regions_from_symbol_ids(self, symbol_ids: list[int], db, description: str) -> list[dict]: """Resolve a list of symbol IDs to code_region dicts.""" regions = [] @@ -199,10 +202,12 @@ def _ground_single( if hits is None: hits = [] - tokens = [ + compounds = [c for c in self._COMPOUND_RE.findall(description) if len(c) >= 4] + word_tokens = [ w for w in re.findall(r"[a-zA-Z]{4,}", description) if w.lower() not in self._STOP_WORDS ] + tokens = compounds + word_tokens # Pre-compute fuzzy-validated symbol IDs once. These serve two # purposes below: