Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions gitnexus/src/core/lbug/lbug-adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,9 @@ let vectorExtensionLoaded = false;

/**
* In-process cache of FTS indexes that have been ensured against the current
* connection. Prevents repeated `CALL CREATE_FTS_INDEX` round-trips inside a
* single CLI/MCP session — the first call to `ensureFTSIndex` for a given
* `(tableName, indexName)` pays the LadybugDB cost (~440 ms even when the
* index already exists on disk), subsequent calls are a Set lookup. Cleared
* by `closeLbug` so a re-init starts fresh.
* writable connection. Prevents repeated `CALL CREATE_FTS_INDEX` round-trips
* for callers that explicitly opt into `ensureFTSIndex`. Cleared by
* `closeLbug` so a re-init starts fresh.
*
* Key format: `${tableName}:${indexName}`.
*/
Expand Down Expand Up @@ -1252,10 +1250,9 @@ export const createFTSIndex = async (
/**
* Lazy-create an FTS index, caching the fact in-process.
*
* Used by `queryFTS` so that `analyze` doesn't pay the ~440 ms × 5 fixed
* LadybugDB cost up-front (it dominates analyze on small repos). Instead,
* the cost is moved to the first `query`/`context` call in a session,
* where it's amortised across many lookups.
* Kept for writable maintenance paths that need to lazily materialize an
* index. Read-only query paths must not call this; production analysis owns
* creating the configured search indexes before the database is served.
*
* Safe to call repeatedly — the in-process Set guarantees only the first
* call hits LadybugDB. `closeLbug` clears the cache so re-init starts fresh.
Expand Down
10 changes: 4 additions & 6 deletions gitnexus/src/core/run-analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
closeLbug,
loadCachedEmbeddings,
} from './lbug/lbug-adapter.js';
import { createSearchFTSIndexes } from './search/fts-indexes.js';
import {
getStoragePaths,
saveMeta,
Expand Down Expand Up @@ -280,12 +281,9 @@ export async function runFullAnalysis(
});

// ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
// FTS indexes are created lazily on first `query`/`context` call instead
// of eagerly here. On small repos / CI runners the LadybugDB
// CREATE_FTS_INDEX cost is ~440 ms × 5 (≈2 s) regardless of table size,
// which dominated `analyze` runtime and pushed Windows CI past its
// 30 s test budget. Lazy creation is implemented in
// `core/search/bm25-index.ts` via `ensureFTSIndex`.
progress('fts', 85, 'Creating search indexes...');
await createSearchFTSIndexes();
progress('fts', 90, 'Search indexes ready');

// ── Phase 3.5: Re-insert cached embeddings ────────────────────────
if (cachedEmbeddings.length > 0) {
Expand Down
159 changes: 9 additions & 150 deletions gitnexus/src/core/search/bm25-index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,10 @@
*
* Uses LadybugDB's built-in full-text search indexes for keyword-based search.
* Always reads from the database (no cached state to drift).
*
* FTS indexes are created lazily on first query (via `ensureFTSIndex`) — see
* `lbug-adapter.ts` for the rationale. This keeps `analyze` fast (the
* ~440 ms × 5 LadybugDB CREATE_FTS_INDEX cost dominates pipeline time on
* small repos / CI runners) at the cost of paying that overhead on the
* first `query`/`context` call in a session.
*/

import { queryFTS, ensureFTSIndex } from '../lbug/lbug-adapter.js';
import { queryFTS } from '../lbug/lbug-adapter.js';
import { FTS_INDEXES } from './fts-schema.js';

export interface BM25SearchResult {
filePath: string;
Expand All @@ -20,104 +15,6 @@ export interface BM25SearchResult {
nodeIds?: string[];
}

/**
 * The (table, index, properties) tuples that `searchFTSFromLbug` works
 * through. Declared in one place so the CLI/pipeline path and the MCP pool
 * path iterate the same list and the lazy-create logic has a single source
 * of truth.
 */
const FTS_INDEXES: ReadonlyArray<{
  table: string;
  indexName: string;
  properties: readonly string[];
}> = (
  [
    ['File', 'file_fts'],
    ['Function', 'function_fts'],
    ['Class', 'class_fts'],
    ['Method', 'method_fts'],
    ['Interface', 'interface_fts'],
  ] as const
).map(([table, indexName]) => ({
  table,
  indexName,
  // Every index covers the same two columns; each entry gets its own array.
  properties: ['name', 'content'],
}));

/**
 * Per-process cache for the MCP pool path: holds one
 * `${repoId}:${table}:${indexName}` key for every FTS index that has been
 * ensured through a pooled connection. The CLI/pipeline path gets its own
 * cache inside `lbug-adapter.ts` keyed by table/index, scoped to the
 * singleton connection.
 *
 * IMPORTANT: an entry is added ONLY when the index was confirmed to exist
 * (CREATE_FTS_INDEX succeeded, or failed with `'already exists'`). Other
 * failures (transient lock errors, missing extension, etc.) leave the key
 * unset so the next query retries instead of silently caching the failure.
 *
 * Entries for a given repoId are invalidated when its pool is closed —
 * see `invalidateEnsuredFTSForRepo` and the `addPoolCloseListener`
 * registration in `searchFTSFromLbug`.
 */
const ensuredPoolFTS = new Set<string>();

/**
 * Evict every ensured-FTS cache key that belongs to `repoId`.
 *
 * Wired to the pool-close listener: once a repo's pool has been torn down or
 * recreated, the next `searchFTSFromLbug` call has to issue
 * `CREATE_FTS_INDEX` again on the fresh connection instead of trusting
 * ensure-state recorded during the previous pool lifetime.
 *
 * Exported so tests can call it directly; the listener wiring stays internal.
 *
 * @param repoId - Repository whose cache keys (those starting with
 *   `${repoId}:`) are removed.
 */
export function invalidateEnsuredFTSForRepo(repoId: string): void {
  const repoPrefix = `${repoId}:`;
  // Collect the matching keys first, then remove them from the cache.
  const staleKeys = Array.from(ensuredPoolFTS).filter((cacheKey) =>
    cacheKey.startsWith(repoPrefix),
  );
  for (const staleKey of staleKeys) {
    ensuredPoolFTS.delete(staleKey);
  }
}

/**
 * Process-level latch that flips to `true` the first time the pool-close
 * listener is wired up. Because the pool adapter is imported dynamically,
 * the wiring is deferred until the first FTS query that goes through the
 * MCP pool.
 */
let poolCloseListenerRegistered = false;

/**
 * Register the cache-invalidating pool-close listener, at most once per
 * process.
 *
 * @param addPoolCloseListener - Registration hook supplied by the pool
 *   adapter; it receives a callback that is handed the `repoId` of a closed
 *   pool.
 */
function registerPoolCloseListenerOnce(
  addPoolCloseListener: (listener: (repoId: string) => void) => void,
): void {
  if (!poolCloseListenerRegistered) {
    // The latch is set before the hook is called, so later calls are no-ops.
    poolCloseListenerRegistered = true;
    addPoolCloseListener((closedRepoId) => {
      invalidateEnsuredFTSForRepo(closedRepoId);
    });
  }
}

/**
 * Ensure the FTS index `indexName` exists on `table` for the pool-backed
 * connection of `repoId`, issuing `CREATE_FTS_INDEX` through `executor`
 * unless the `${repoId}:${table}:${indexName}` key is already cached.
 *
 * Executor failures are caught here and never rethrown:
 * - an error whose message contains `'already exists'` is the happy path
 *   (the index persists on disk between process invocations) and is cached
 *   just like a successful create;
 * - any other failure is treated as transient: a warning is logged and the
 *   key is left unset so the NEXT query retries rather than silently reusing
 *   a cached failure.
 *
 * @param executor - Runs one Cypher statement; its result is ignored.
 * @param repoId - Repository id, used as the cache-key prefix and in the
 *   warning text.
 * @param table - Node table to index.
 * @param indexName - Name of the FTS index to create.
 * @param properties - Property names to index. They are interpolated into
 *   the statement verbatim, as are `table` and `indexName`; the caller in
 *   this file passes the constant entries of `FTS_INDEXES`.
 */
async function ensureFTSIndexViaExecutor(
  executor: (cypher: string) => Promise<any[]>,
  repoId: string,
  table: string,
  indexName: string,
  properties: readonly string[],
): Promise<void> {
  const key = `${repoId}:${table}:${indexName}`;
  if (ensuredPoolFTS.has(key)) return;

  const propList = properties.map((p) => `'${p}'`).join(', ');
  try {
    await executor(
      `CALL CREATE_FTS_INDEX('${table}', '${indexName}', [${propList}], stemmer := 'porter')`,
    );
  } catch (e: unknown) {
    // A thrown value is not guaranteed to be an Error, so read `message`
    // defensively and fall back to an empty string.
    const msg = String((e as { message?: unknown } | null | undefined)?.message ?? '');
    if (!msg.includes('already exists')) {
      console.warn(
        `[gitnexus] FTS index ensure failed for repo "${repoId}" table "${table}" ` +
          `(index "${indexName}"): ${msg || e}. Will retry on next query.`,
      );
      // Deliberately not cached: the next query must retry.
      return;
    }
  }

  // Reached when the index was just created or was reported as already
  // existing — both confirm it is present, so it is safe to cache.
  ensuredPoolFTS.add(key);
}

/**
* Execute a single FTS query via a custom executor (for MCP connection pool).
* Returns the same shape as core queryFTS (from LadybugDB adapter).
Expand Down Expand Up @@ -169,58 +66,24 @@ export const searchFTSFromLbug = async (
limit: number = 20,
repoId?: string,
): Promise<BM25SearchResult[]> => {
let fileResults: any[],
functionResults: any[],
classResults: any[],
methodResults: any[],
interfaceResults: any[];
const resultsByIndex: any[][] = [];

if (repoId) {
// Use MCP connection pool via dynamic import
// IMPORTANT: FTS queries run sequentially to avoid connection contention.
// The MCP pool supports multiple connections, but FTS is best run serially.
const poolMod = await import('../lbug/pool-adapter.js');
const { executeQuery, addPoolCloseListener } = poolMod;
// Register the pool-close listener lazily on first use so a teardown of
// the pool entry (LRU eviction, idle timeout, explicit close) drops the
// matching `ensuredPoolFTS` entries. Without this, stale ensure-state
// can outlive the pool that produced it.
registerPoolCloseListenerOnce(addPoolCloseListener);
const { executeQuery } = poolMod;
const executor = (cypher: string) => executeQuery(repoId, cypher);

// Lazy-create FTS indexes on first query for this repo (analyze no longer
// creates them up-front, so we ensure them here). Cached per-process.
for (const { table, indexName, properties } of FTS_INDEXES) {
await ensureFTSIndexViaExecutor(executor, repoId, table, indexName, properties);
for (const { table, indexName } of FTS_INDEXES) {
resultsByIndex.push(await queryFTSViaExecutor(executor, table, indexName, query, limit));
}

fileResults = await queryFTSViaExecutor(executor, 'File', 'file_fts', query, limit);
functionResults = await queryFTSViaExecutor(executor, 'Function', 'function_fts', query, limit);
classResults = await queryFTSViaExecutor(executor, 'Class', 'class_fts', query, limit);
methodResults = await queryFTSViaExecutor(executor, 'Method', 'method_fts', query, limit);
interfaceResults = await queryFTSViaExecutor(
executor,
'Interface',
'interface_fts',
query,
limit,
);
} else {
// Use core lbug adapter (CLI / pipeline context) — also sequential for safety.
// Lazy-create FTS indexes on first query (analyze no longer does it).
for (const { table, indexName, properties } of FTS_INDEXES) {
await ensureFTSIndex(table, indexName, [...properties]).catch(() => {});
for (const { table, indexName } of FTS_INDEXES) {
resultsByIndex.push(await queryFTS(table, indexName, query, limit, false).catch(() => []));
}

fileResults = await queryFTS('File', 'file_fts', query, limit, false).catch(() => []);
functionResults = await queryFTS('Function', 'function_fts', query, limit, false).catch(
() => [],
);
classResults = await queryFTS('Class', 'class_fts', query, limit, false).catch(() => []);
methodResults = await queryFTS('Method', 'method_fts', query, limit, false).catch(() => []);
interfaceResults = await queryFTS('Interface', 'interface_fts', query, limit, false).catch(
() => [],
);
}

// Collect all node scores per filePath to track which nodes actually matched
Expand All @@ -233,11 +96,7 @@ export const searchFTSFromLbug = async (
}
};

addResults(fileResults);
addResults(functionResults);
addResults(classResults);
addResults(methodResults);
addResults(interfaceResults);
for (const results of resultsByIndex) addResults(results);

// Sum the top-3 highest-scoring nodes per file and collect their nodeIds.
// Summing all nodes naively inflates scores for files with many mediocre
Expand Down
8 changes: 8 additions & 0 deletions gitnexus/src/core/search/fts-indexes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { createFTSIndex } from '../lbug/lbug-adapter.js';
import { FTS_INDEXES } from './fts-schema.js';

/**
 * Create every full-text search index listed in `FTS_INDEXES`, one at a time.
 *
 * Each definition is handed to `createFTSIndex`; its readonly `properties`
 * list is copied into a fresh array for the call. A rejection from
 * `createFTSIndex` propagates to the caller, so later definitions are not
 * attempted.
 */
export async function createSearchFTSIndexes(): Promise<void> {
  for (const definition of FTS_INDEXES) {
    const propertyNames = Array.from(definition.properties);
    await createFTSIndex(definition.table, definition.indexName, propertyNames);
  }
}
13 changes: 13 additions & 0 deletions gitnexus/src/core/search/fts-schema.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/** Describes one LadybugDB full-text search index. */
export interface FTSIndexDefinition {
  /** Node table the index is built on, e.g. `'File'`. */
  readonly table: string;
  /** Name the index is created and queried under, e.g. `'file_fts'`. */
  readonly indexName: string;
  /** Properties of `table` that the index covers. */
  readonly properties: readonly string[];
}

/** Builds one definition; the rest arguments become that entry's own `properties` array. */
const defineFTSIndex = (
  table: string,
  indexName: string,
  ...properties: string[]
): FTSIndexDefinition => ({ table, indexName, properties });

/**
 * Search index definitions shared by index creation
 * (`createSearchFTSIndexes`) and querying (`searchFTSFromLbug`), so both
 * sides use the same table and index names.
 */
export const FTS_INDEXES: readonly FTSIndexDefinition[] = [
  defineFTSIndex('File', 'file_fts', 'name', 'content'),
  defineFTSIndex('Function', 'function_fts', 'name', 'content'),
  defineFTSIndex('Class', 'class_fts', 'name', 'content'),
  defineFTSIndex('Method', 'method_fts', 'name', 'content'),
  defineFTSIndex('Interface', 'interface_fts', 'name', 'content'),
];
9 changes: 2 additions & 7 deletions gitnexus/test/integration/local-backend-calltool.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,10 @@ withTestLbugDB(
it('query tool returns results for keyword search', async () => {
const result = await backend.callTool('query', { query: 'login' });
expect(result).not.toHaveProperty('error');
// Should have some combination of processes, process_symbols, or definitions
expect(result).toHaveProperty('processes');
expect(result).toHaveProperty('definitions');
// The search should find something (FTS or graph-based)
const totalResults =
(result.processes?.length || 0) +
(result.process_symbols?.length || 0) +
(result.definitions?.length || 0);
expect(totalResults).toBeGreaterThanOrEqual(1);
expect(result.processes.map((p: any) => p.id)).toContain('proc:login-flow');
expect(result.process_symbols.map((s: any) => s.id)).toContain('func:login');

// #553: query response carries per-phase timing metadata.
expect(result.timing).toBeDefined();
Expand Down
Loading
Loading