Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions gitnexus/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ── Stage 1 (builder): install dependencies, build the project, and pack it
#    into an npm tarball that the runtime stage installs.
FROM node:22-bookworm AS builder
WORKDIR /build
# python3, make and g++ are installed before any npm step — presumably the
# toolchain needed to compile native addons during `npm rebuild`; TODO confirm.
# The apt package lists are removed in the same layer.
RUN apt-get update && apt-get install -y python3 make g++ && rm -rf /var/lib/apt/lists/*
# The entire build context is copied into /build.
COPY . .
# Order matters: dependencies are installed with lifecycle scripts disabled,
# the tree-sitter-swift patch script runs next, then `npm rebuild` (a failure
# there is tolerated via `|| true`), then the project build, and finally
# `npm pack` writes a .tgz into /build.
RUN npm ci --ignore-scripts \
&& node scripts/patch-tree-sitter-swift.cjs \
&& (npm rebuild 2>&1 || true) \
&& npm run build \
&& npm pack

# ── Stage 2 (runtime): slim base image with git and the packed tarball
#    installed as a global npm package.
FROM node:22-bookworm-slim
# git is the only extra OS package added to the runtime image.
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
# The tarball produced by the builder stage is copied to a fixed name.
COPY --from=builder /build/*.tgz /tmp/gitnexus.tgz
# Install globally, then remove the tarball in the same RUN step.
RUN npm install -g /tmp/gitnexus.tgz && rm /tmp/gitnexus.tgz

# The entrypoint script is copied from the build context and made executable.
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# The container starts in /data and runs the entrypoint script.
WORKDIR /data
ENTRYPOINT ["/entrypoint.sh"]
24 changes: 24 additions & 0 deletions gitnexus/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
# Container entrypoint: index every git repository found under /data/repos,
# then replace this shell with the GitNexus eval-server.
set -e

# Read the global registry once. A missing or unreadable file leaves the
# variable empty, which is treated the same as an empty registry below.
registry_path="$HOME/.gitnexus/registry.json"
registry_body=""
if [ -f "$registry_path" ]; then
  registry_body="$(cat "$registry_path" 2>/dev/null || true)"
fi

# Choose at most one flag for `gitnexus analyze`:
#   --force        registry missing, empty, or exactly "[]"
#                  (indexes may exist but are not registered)
#   --incremental  registry has content and GITNEXUS_INCREMENTAL=1
analyze_flag=""
case "$registry_body" in
  "" | "[]")
    analyze_flag="--force"
    echo "Global registry empty — forcing re-index to register repos"
    ;;
  *)
    if [ "${GITNEXUS_INCREMENTAL:-0}" = "1" ]; then
      analyze_flag="--incremental"
    fi
    ;;
esac

# Index all mounted repositories
for repo_dir in /data/repos/*/; do
  # Only directories that contain a .git directory are indexed
  [ -d "$repo_dir/.git" ] || continue

  echo "Indexing repository: $repo_dir"
  # $analyze_flag is intentionally unquoted so an empty value expands to nothing.
  # A failed index is reported as a warning and the loop carries on.
  if ! gitnexus analyze $analyze_flag "$repo_dir"; then
    echo "Warning: Failed to index $repo_dir"
  fi
done

echo "Starting GitNexus eval-server on port 3456..."
cd /data/repos
exec gitnexus eval-server --port 3456
2 changes: 2 additions & 0 deletions gitnexus/src/cli/analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ function ensureHeap(): boolean {

export interface AnalyzeOptions {
force?: boolean;
incremental?: boolean;
embeddings?: boolean;
skills?: boolean;
verbose?: boolean;
Expand Down Expand Up @@ -174,6 +175,7 @@ export const analyzeCommand = async (inputPath?: string, options?: AnalyzeOption
repoPath,
{
force: options?.force || options?.skills,
incremental: options?.incremental,
embeddings: options?.embeddings,
skipGit: options?.skipGit,
skipAgentsMd: options?.skipAgentsMd,
Expand Down
3 changes: 2 additions & 1 deletion gitnexus/src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ program

program
.command('analyze [path]')
.description('Index a repository (full analysis)')
.description('Index a repository (full or incremental analysis)')
.option('-f, --force', 'Force full re-index even if up to date')
.option('-i, --incremental', 'Only re-parse files changed since last index (uses SHA-256 hashes)')
.option('--embeddings', 'Enable embedding generation for semantic search (off by default)')
.option('--skills', 'Generate repo-specific skill files from detected communities')
.option('--skip-agents-md', 'Skip updating the gitnexus section in AGENTS.md and CLAUDE.md')
Expand Down
31 changes: 29 additions & 2 deletions gitnexus/src/core/ingestion/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ import { createWorkerPool, WorkerPool } from './workers/worker-pool.js';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { computeFileHashes, diffFileHashes } from '../../storage/file-hasher.js';

const isDev = process.env.NODE_ENV === 'development';

Expand Down Expand Up @@ -1322,6 +1323,8 @@ export const runPipelineFromRepo = async (
repoPath: string,
onProgress: (progress: PipelineProgress) => void,
options?: PipelineOptions,
/** Previous file hashes for incremental mode. When provided, only files with changed hashes are re-parsed. */
previousFileHashes?: Record<string, string>,
): Promise<PipelineResult> => {
const graph = createKnowledgeGraph();
const ctx = createResolutionContext();
Expand All @@ -1335,6 +1338,23 @@ export const runPipelineFromRepo = async (
onProgress,
);

// ── Incremental: compute hashes and filter to changed files ────────
const allFilePaths = scannedFiles.map((f) => f.path);
const currentFileHashes = await computeFileHashes(repoPath, allFilePaths);
const hashDiff = diffFileHashes(currentFileHashes, previousFileHashes);
const changedFileSet = previousFileHashes ? new Set(hashDiff.changed) : undefined;

if (changedFileSet) {
console.log(
` Incremental: ${hashDiff.changed.length} changed, ${hashDiff.removed.length} removed, ${hashDiff.unchanged} unchanged`,
);
}

// In incremental mode, only parse files with changed hashes
const parseableScannedFiles = changedFileSet
? scannedFiles.filter((f) => changedFileSet.has(f.path))
: scannedFiles;

// Phase 3+4: Chunked parse + resolve (imports, calls, heritage, routes)
const {
exportedTypeMap,
Expand All @@ -1346,7 +1366,7 @@ export const runPipelineFromRepo = async (
} = await runChunkedParseAndResolve(
graph,
ctx,
scannedFiles,
parseableScannedFiles,
allPaths,
totalFiles,
repoPath,
Expand Down Expand Up @@ -1706,7 +1726,14 @@ export const runPipelineFromRepo = async (
},
});

return { graph, repoPath, totalFileCount: totalFiles, communityResult, processResult };
return {
graph,
repoPath,
totalFileCount: totalFiles,
communityResult,
processResult,
fileHashes: currentFileHashes,
};
} catch (error) {
ctx.clear();
throw error;
Expand Down
76 changes: 63 additions & 13 deletions gitnexus/src/core/run-analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
closeLbug,
createFTSIndex,
loadCachedEmbeddings,
deleteNodesForFile,
} from './lbug/lbug-adapter.js';
import {
getStoragePaths,
Expand All @@ -32,6 +33,7 @@ import {
} from '../storage/repo-manager.js';
import { getCurrentCommit, hasGitDir } from '../storage/git.js';
import { generateAIContextFiles } from '../cli/ai-context.js';
import { diffFileHashes } from '../storage/file-hasher.js';

// ---------------------------------------------------------------------------
// Public types
Expand All @@ -44,6 +46,7 @@ export interface AnalyzeCallbacks {

export interface AnalyzeOptions {
force?: boolean;
incremental?: boolean;
embeddings?: boolean;
skipGit?: boolean;
/** Skip AGENTS.md and CLAUDE.md gitnexus block updates. */
Expand Down Expand Up @@ -155,27 +158,67 @@ export async function runFullAnalysis(
}
}

// ── Incremental: pass previous hashes to pipeline if applicable ───
const isIncremental = !!(options.incremental && existingMeta?.fileHashes && !options.force);
const previousFileHashes = isIncremental ? existingMeta!.fileHashes : undefined;

// ── Phase 1: Full Pipeline (0–60%) ────────────────────────────────
const pipelineResult = await runPipelineFromRepo(repoPath, (p) => {
const phaseLabel = PHASE_LABELS[p.phase] || p.phase;
const scaled = Math.round(p.percent * 0.6);
progress(p.phase, scaled, phaseLabel);
});
const pipelineResult = await runPipelineFromRepo(
repoPath,
(p) => {
const phaseLabel = PHASE_LABELS[p.phase] || p.phase;
const scaled = Math.round(p.percent * 0.6);
progress(p.phase, scaled, phaseLabel);
},
undefined,
previousFileHashes,
);

// ── Phase 2: LadybugDB (60–85%) ──────────────────────────────────
progress('lbug', 60, 'Loading into LadybugDB...');

await closeLbug();
const lbugFiles = [lbugPath, `${lbugPath}.wal`, `${lbugPath}.lock`];
for (const f of lbugFiles) {
try {
await fs.rm(f, { recursive: true, force: true });
} catch {
/* swallow */
let incrementalDeletedNodes = 0;
let incrementalDeletedFiles = 0;

if (isIncremental) {
// Incremental path: open existing DB, delete stale nodes, append new ones
const hashDiff = diffFileHashes(pipelineResult.fileHashes ?? {}, existingMeta!.fileHashes);
const filesToDelete = [...hashDiff.changed, ...hashDiff.removed];

await initLbug(lbugPath);

for (const filePath of filesToDelete) {
try {
const { deletedNodes } = await deleteNodesForFile(filePath);
incrementalDeletedNodes += deletedNodes;
incrementalDeletedFiles++;
} catch {
/* file may not have been indexed — skip */
}
}

// Merge file hashes: keep previous for unchanged, update for changed, drop removed
const mergedHashes: Record<string, string> = { ...existingMeta!.fileHashes };
for (const f of hashDiff.removed) delete mergedHashes[f];
for (const [file, hash] of Object.entries(pipelineResult.fileHashes ?? {})) {
mergedHashes[file] = hash;
}
// Overwrite pipeline fileHashes with merged so saveMeta stores the full set
(pipelineResult as any).fileHashes = mergedHashes;
} else {
// Full rebuild path: wipe and recreate LadybugDB
await closeLbug();
const lbugFiles = [lbugPath, `${lbugPath}.wal`, `${lbugPath}.lock`];
for (const f of lbugFiles) {
try {
await fs.rm(f, { recursive: true, force: true });
} catch {
/* swallow */
}
}
await initLbug(lbugPath);
}

await initLbug(lbugPath);
try {
// All work after initLbug is wrapped in try/finally to ensure closeLbug()
// is called even if an error occurs — the module-level singleton DB handle
Expand Down Expand Up @@ -279,6 +322,12 @@ export async function runFullAnalysis(
/* table may not exist if embeddings never ran */
}

if (isIncremental && incrementalDeletedFiles > 0) {
log(
` Incremental: deleted ${incrementalDeletedNodes} nodes from ${incrementalDeletedFiles} files, re-parsed ${incrementalDeletedFiles} files`,
);
}

const meta = {
repoPath,
lastCommit: currentCommit,
Expand All @@ -291,6 +340,7 @@ export async function runFullAnalysis(
processes: pipelineResult.processResult?.stats.totalProcesses,
embeddings: embeddingCount,
},
fileHashes: pipelineResult.fileHashes,
};
await saveMeta(storagePath, meta);
await registerRepo(repoPath, meta);
Expand Down
87 changes: 87 additions & 0 deletions gitnexus/src/storage/file-hasher.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/**
* File Hasher — per-file SHA-256 hashing for incremental scanning.
*
* Computes hashes for all source files in a repository, then diffs against
* stored hashes to identify changed, added, and removed files.
*/

import { createHash } from 'crypto';
import fs from 'fs/promises';
import path from 'path';

/**
 * Hash the contents of the given repository files with SHA-256.
 *
 * Files are read in fixed-size groups; the reads inside a group run
 * concurrently, and the next group starts only after the previous one settles.
 *
 * @param repoPath - Repository root that each entry of `filePaths` is joined onto
 * @param filePaths - File paths relative to `repoPath`
 * @returns Object keyed by relative path whose values are hex SHA-256 digests;
 *          paths whose read failed are left out
 */
export async function computeFileHashes(
  repoPath: string,
  filePaths: string[],
): Promise<Record<string, string>> {
  // Upper bound on concurrent reads, to avoid fd exhaustion
  const GROUP_SIZE = 100;
  const digests: Record<string, string> = {};

  // Resolves to the hex digest of one file, or null when it cannot be read
  // (for example, it disappeared between scan and hash).
  const digestOf = async (relPath: string): Promise<string | null> => {
    try {
      const absolute = path.join(repoPath, relPath);
      return createHash('sha256')
        .update(await fs.readFile(absolute))
        .digest('hex');
    } catch {
      return null;
    }
  };

  let offset = 0;
  while (offset < filePaths.length) {
    const group = filePaths.slice(offset, offset + GROUP_SIZE);
    const groupDigests = await Promise.all(group.map((relPath) => digestOf(relPath)));

    // Record results in input order, skipping files that could not be hashed
    group.forEach((relPath, idx) => {
      const digest = groupDigests[idx];
      if (typeof digest === 'string') digests[relPath] = digest;
    });

    offset += GROUP_SIZE;
  }

  return digests;
}

/** Outcome of comparing a current hash map against an earlier one. */
export interface HashDiff {
  /** Paths that are new, or whose hash differs from the earlier value */
  changed: string[];
  /** Paths present in the earlier map but absent from the current one */
  removed: string[];
  /** Number of paths whose hash is the same in both maps */
  unchanged: number;
}

/**
 * Compare the current per-file hashes with a previously stored set.
 *
 * When no previous set is supplied, every current path is reported as changed
 * and nothing is reported as removed.
 *
 * @param currentHashes - Path → hash for the files as they are now
 * @param previousHashes - Path → hash from the earlier run, if any
 * @returns The changed paths, the removed paths, and the unchanged count
 */
export function diffFileHashes(
  currentHashes: Record<string, string>,
  previousHashes: Record<string, string> | undefined,
): HashDiff {
  const currentPaths = Object.keys(currentHashes);

  // No baseline to compare against
  if (!previousHashes) {
    return { changed: currentPaths, removed: [], unchanged: 0 };
  }

  // A path is changed when the earlier map has no entry or a different hash
  const changed = currentPaths.filter(
    (file) => previousHashes[file] !== currentHashes[file],
  );
  const unchanged = currentPaths.length - changed.length;

  // Anything the earlier map knew about that is no longer present
  const stillPresent = new Set(currentPaths);
  const removed = Object.keys(previousHashes).filter((file) => !stillPresent.has(file));

  return { changed, removed, unchanged };
}
2 changes: 2 additions & 0 deletions gitnexus/src/storage/repo-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ export interface RepoMeta {
processes?: number;
embeddings?: number;
};
/** Per-file SHA-256 hashes for incremental scanning. Keys are repo-relative paths. */
fileHashes?: Record<string, string>;
}

export interface IndexedRepo {
Expand Down
2 changes: 2 additions & 0 deletions gitnexus/src/types/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ export interface PipelineResult {
totalFileCount: number;
communityResult?: CommunityDetectionResult;
processResult?: ProcessDetectionResult;
/** Per-file SHA-256 hashes computed during this run (for incremental scanning) */
fileHashes?: Record<string, string>;
}