From 41844edf883b46c10dbd81d21e00c507c33d9b09 Mon Sep 17 00:00:00 2001 From: jonasvanderhaegen-xve <> Date: Tue, 14 Apr 2026 12:32:08 +0200 Subject: [PATCH 1/6] fix(csv-generator): deduplicate all node types, not just File nodes The pipeline can produce duplicate node IDs across all symbol types (Class, Method, Function, etc.). Only File nodes were guarded by a seenFileIds Set, leaving every other type unprotected. When the CSV was COPY'd into LadybugDB, duplicate PKs caused mass "Batch execution error: Found duplicated primary key value" warnings on gitnexus serve. Replace the per-type seenFileIds with a single seenNodeIds Set checked at the top of the iteration loop, before the switch, so every label is covered by the same O(1) deduplication guard. Fixes: #822 --- gitnexus/src/core/lbug/csv-generator.ts | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gitnexus/src/core/lbug/csv-generator.ts b/gitnexus/src/core/lbug/csv-generator.ts index b3a53146ea..63a1bb947c 100644 --- a/gitnexus/src/core/lbug/csv-generator.ts +++ b/gitnexus/src/core/lbug/csv-generator.ts @@ -315,14 +315,18 @@ export const streamAllCSVsToDisk = async ( CodeElement: codeElemWriter, }; - const seenFileIds = new Set(); + // Deduplicate all node types — the pipeline can produce duplicate IDs across + // all symbol types (Class, Method, Function, etc.), not just File nodes. + // A single Set covering every label prevents PK violations on COPY. + const seenNodeIds = new Set(); // --- SINGLE PASS over all nodes --- for (const node of graph.iterNodes()) { + if (seenNodeIds.has(node.id)) continue; + seenNodeIds.add(node.id); + switch (node.label) { case 'File': { - if (seenFileIds.has(node.id)) break; - seenFileIds.add(node.id); const content = await extractContent(node, contentCache); await fileWriter.addRow( [ From 8d38cc99fa6c237c925fdf052735cb13470ac856 Mon Sep 17 00:00:00 2001 From: jonasvanderhaegen-xve <> Date: Tue, 14 Apr 2026 12:59:14 +0200 Subject: [PATCH 2/6] fix(embeddings): use MERGE instead of CREATE for CodeEmbedding inserts CREATE fails with duplicate PK when a CodeEmbedding node already exists, which happens when: - A PostToolUse hook triggers a concurrent gitnexus analyze during an active analyze run (git commits fire the hook) - A partial prior run left some embeddings in the DB before a crash Switching to MERGE makes the insert idempotent: existing embeddings are updated in place, new ones are created, no PK violations. Fixes: #822 --- gitnexus/src/core/embeddings/embedding-pipeline.ts | 4 ++-- gitnexus/src/core/run-analyze.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gitnexus/src/core/embeddings/embedding-pipeline.ts b/gitnexus/src/core/embeddings/embedding-pipeline.ts index d3dc0854ec..cb1949144f 100644 --- a/gitnexus/src/core/embeddings/embedding-pipeline.ts +++ b/gitnexus/src/core/embeddings/embedding-pipeline.ts @@ -100,8 +100,8 @@ const batchInsertEmbeddings = async ( ) => Promise, updates: Array<{ id: string; embedding: number[] }>, ): Promise => { - // INSERT into separate embedding table - much more memory efficient! - const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`; + // MERGE instead of CREATE — idempotent, handles concurrent analyzes and partial prior runs + const cypher = `MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`; const paramsList = updates.map((u) => ({ nodeId: u.id, embedding: u.embedding })); await executeWithReusedStatement(cypher, paramsList); }; diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index f7b6627057..07fb8ab695 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -222,7 +222,7 @@ export async function runFullAnalysis( const paramsList = batch.map((e) => ({ nodeId: e.nodeId, embedding: e.embedding })); try { await executeWithReusedStatement( - `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`, + `MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`, paramsList, ); } catch { From 80a6fde2ba7571523bb1581c11270c6904862ee0 Mon Sep 17 00:00:00 2001 From: jonasvanderhaegen-xve <> Date: Tue, 14 Apr 2026 13:33:48 +0200 Subject: [PATCH 3/6] fix(server): skip already-embedded nodes in POST /api/embed to avoid vector-index SET error Kuzu/LadybugDB forbids SET on a property that is part of a vector index. The /api/embed endpoint was calling runEmbeddingPipeline without skipNodeIds, causing it to attempt MERGE+SET on every node including those already embedded. Fix: query existing CodeEmbedding nodeIds before running the pipeline and pass them as skipNodeIds so only new (unembedded) nodes are processed. --- gitnexus/src/server/api.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/gitnexus/src/server/api.ts b/gitnexus/src/server/api.ts index 3d4cf9a6a8..cddc42ab04 100644 --- a/gitnexus/src/server/api.ts +++ b/gitnexus/src/server/api.ts @@ -1449,6 +1449,20 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => await withLbugDb(lbugPath, async () => { const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js'); + // Skip nodes that already have embeddings — Kuzu forbids SET on vector-indexed properties. + let skipNodeIds: Set | undefined; + try { + const rows = await executeQuery( + 'MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId', + ); + if (rows && rows.length > 0) { + skipNodeIds = new Set( + rows.map((r: any) => r.nodeId ?? r[0]).filter(Boolean), + ); + } + } catch { + /* CodeEmbedding table may not exist yet */ + } await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => { embedJobManager.updateJob(job.id, { progress: { @@ -1467,7 +1481,7 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => : `${p.phase} (${p.percent}%)`, }, }); - }); + }, {}, skipNodeIds); }); clearTimeout(embedTimeout); From dd194d56b10062d0e18d927941655cbeb1596df6 Mon Sep 17 00:00:00 2001 From: jonasvanderhaegen-xve <> Date: Tue, 14 Apr 2026 14:27:03 +0200 Subject: [PATCH 4/6] fix(server): narrow catch to table-not-exist errors only in POST /api/embed Bare catch{} would silently swallow connection errors and proceed to re-embed all nodes, hiding infrastructure issues. Now only swallows errors where the CodeEmbedding table does not yet exist. --- gitnexus/src/server/api.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gitnexus/src/server/api.ts b/gitnexus/src/server/api.ts index cddc42ab04..65817e0f79 100644 --- a/gitnexus/src/server/api.ts +++ b/gitnexus/src/server/api.ts @@ -1460,8 +1460,11 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => rows.map((r: any) => r.nodeId ?? r[0]).filter(Boolean), ); } - } catch { - /* CodeEmbedding table may not exist yet */ + } catch (err: any) { + // Swallow only "table does not exist" — let real connection errors propagate + if (!err?.message?.includes('does not exist') && !err?.message?.includes('not found')) { + throw err; + } } await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => { embedJobManager.updateJob(job.id, { From 3384575ac6d0e99f1189c3690de328ec51439e5a Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 15 Apr 2026 07:38:16 +0100 Subject: [PATCH 5/6] style: prettier format gitnexus/src/server/api.ts --- gitnexus/src/server/api.ts | 57 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/gitnexus/src/server/api.ts b/gitnexus/src/server/api.ts index 65817e0f79..24dc17d1f2 100644 --- a/gitnexus/src/server/api.ts +++ b/gitnexus/src/server/api.ts @@ -1452,39 +1452,44 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => // Skip nodes that already have embeddings — Kuzu forbids SET on vector-indexed properties. let skipNodeIds: Set | undefined; try { - const rows = await executeQuery( - 'MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId', - ); + const rows = await executeQuery('MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId'); if (rows && rows.length > 0) { - skipNodeIds = new Set( - rows.map((r: any) => r.nodeId ?? r[0]).filter(Boolean), - ); + skipNodeIds = new Set(rows.map((r: any) => r.nodeId ?? r[0]).filter(Boolean)); } } catch (err: any) { // Swallow only "table does not exist" — let real connection errors propagate - if (!err?.message?.includes('does not exist') && !err?.message?.includes('not found')) { + if ( + !err?.message?.includes('does not exist') && + !err?.message?.includes('not found') + ) { throw err; } } - await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => { - embedJobManager.updateJob(job.id, { - progress: { - phase: - p.phase === 'ready' ? 'complete' : p.phase === 'error' ? 'failed' : p.phase, - percent: p.percent, - message: - p.phase === 'loading-model' - ? 'Loading embedding model...' - : p.phase === 'embedding' - ? `Embedding nodes (${p.percent}%)...` - : p.phase === 'indexing' - ? 'Creating vector index...' - : p.phase === 'ready' - ? 'Embeddings complete' - : `${p.phase} (${p.percent}%)`, - }, - }); - }, {}, skipNodeIds); + await runEmbeddingPipeline( + executeQuery, + executeWithReusedStatement, + (p) => { + embedJobManager.updateJob(job.id, { + progress: { + phase: + p.phase === 'ready' ? 'complete' : p.phase === 'error' ? 'failed' : p.phase, + percent: p.percent, + message: + p.phase === 'loading-model' + ? 'Loading embedding model...' + : p.phase === 'embedding' + ? `Embedding nodes (${p.percent}%)...` + : p.phase === 'indexing' + ? 'Creating vector index...' + : p.phase === 'ready' + ? 'Embeddings complete' + : `${p.phase} (${p.percent}%)`, + }, + }); + }, + {}, + skipNodeIds, + ); }); clearTimeout(embedTimeout); From 3768e3bfd0c2e4b6aa19f927d9d5a049a0729069 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 15 Apr 2026 07:40:58 +0100 Subject: [PATCH 6/6] fix(server): log skip-embedding count and table-not-found swallow path Addresses review feedback on PR #823: - Log count of already-embedded nodes when skipNodeIds is populated (aids debugging if Kuzu driver row shape changes). - Log when the 'table does not exist' swallow path fires so ops can catch it if Kuzu ever changes error wording. - Document the {} config positional argument with an inline comment referencing the runEmbeddingPipeline signature. --- gitnexus/src/server/api.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/gitnexus/src/server/api.ts b/gitnexus/src/server/api.ts index 24dc17d1f2..9afdbfe0e6 100644 --- a/gitnexus/src/server/api.ts +++ b/gitnexus/src/server/api.ts @@ -1455,13 +1455,19 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => const rows = await executeQuery('MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId'); if (rows && rows.length > 0) { skipNodeIds = new Set(rows.map((r: any) => r.nodeId ?? r[0]).filter(Boolean)); + console.log( + `[embed] ${skipNodeIds.size} nodes already embedded — skipping in incremental run`, + ); } } catch (err: any) { - // Swallow only "table does not exist" — let real connection errors propagate - if ( - !err?.message?.includes('does not exist') && - !err?.message?.includes('not found') - ) { + // Swallow only "table does not exist" — let real connection errors propagate. + // Log so ops can see this path fire if Kuzu ever changes error wording. + const msg = err?.message ?? ''; + if (msg.includes('does not exist') || msg.includes('not found')) { + console.log( + `[embed] CodeEmbedding table not yet present — full embedding run (${msg})`, + ); + } else { throw err; } } @@ -1487,7 +1493,7 @@ export const createServer = async (port: number, host: string = '127.0.0.1') => }, }); }, - {}, + {}, // config: use defaults (runEmbeddingPipeline signature: executeQuery, executeWithReusedStatement, onProgress, config, skipNodeIds) skipNodeIds, ); });