From ba35e4b41e97ceed9f40c45aa8a56e6d62cde776 Mon Sep 17 00:00:00 2001
From: Silong Tan <silongtan@outlook.com>
Date: Fri, 17 Apr 2026 13:36:09 -0400
Subject: [PATCH] fix: extract compound tokens (snake_case, dotted.names) for
 grounding recall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tokenizer in _ground_single stripped underscores and dots, destroying
compound identifiers like decrease_stock and transaction.atomic before they
reached validate_symbols. Extract compounds first via regex, then append
word tokens as fallback. Aggregate recall: 9.3% → 13.9%, MRR@3 held at 0.59.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 adapters/code_locator.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/adapters/code_locator.py b/adapters/code_locator.py
index 7f69f4ff..fdbbd20a 100644
--- a/adapters/code_locator.py
+++ b/adapters/code_locator.py
@@ -157,6 +157,9 @@ async def extract_symbols(self, file_path: str) -> list[dict]:
         "like", "make", "made", "use", "used", "using", "after", "before",
     })
 
+    # Compound identifiers: letter-led parts joined by "_" or "." (snake_case, dotted.name).
+    _COMPOUND_RE = re.compile(r"[A-Za-z]\w*(?:[_.][A-Za-z]\w*)+")
+
     def _regions_from_symbol_ids(self, symbol_ids: list[int], db, description: str) -> list[dict]:
         """Resolve a list of symbol IDs to code_region dicts."""
         regions = []
@@ -199,10 +202,12 @@ def _ground_single(
         if hits is None:
             hits = []
 
-        tokens = [
+        compounds = [c for c in self._COMPOUND_RE.findall(description) if len(c) >= 4]
+        word_tokens = [
             w for w in re.findall(r"[a-zA-Z]{4,}", description)
             if w.lower() not in self._STOP_WORDS
         ]
+        tokens = compounds + word_tokens
 
         # Pre-compute fuzzy-validated symbol IDs once. These serve two
         # purposes below: