Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions gitnexus/src/cli/analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,14 @@ function ensureHeap(): boolean {

export interface AnalyzeOptions {
force?: boolean;
embeddings?: boolean;
/**
* Embedding generation toggle. Commander parses `--embeddings [limit]` as:
* - `undefined` when the flag is omitted
* - `true` when passed without an argument (use default 50K node cap)
* - a string when passed with an argument (`--embeddings 0` disables the
* cap, `--embeddings <n>` uses `<n>` as the cap)
*/
embeddings?: boolean | string;
/**
* Explicitly drop existing embeddings on rebuild instead of preserving
* them. Without this flag, a routine `analyze` keeps any embeddings
Expand Down Expand Up @@ -167,6 +174,25 @@ export const analyzeCommand = async (inputPath?: string, options?: AnalyzeOption
);
}

// Parse `--embeddings [limit]`: `true` → default cap, string → numeric cap
// (0 disables the cap entirely). Validated up here so failures match the
// sibling-validation pattern (exit before bar.start() — otherwise
// process.exit() leaves the progress bar's hidden cursor uncleared).
let embeddingsNodeLimit: number | undefined;
if (typeof options?.embeddings === 'string') {
const parsed = Number(options.embeddings);
if (!Number.isInteger(parsed) || parsed < 0) {
console.error(
` --embeddings expects a non-negative integer (got "${options.embeddings}"). ` +
`Pass 0 to disable the safety cap, or omit the value to keep the default.\n`,
);
process.exitCode = 1;
return;
}
embeddingsNodeLimit = parsed;
}
const embeddingsEnabled = !!options?.embeddings;

const setPositiveEnv = (
optionName: string,
envName: string,
Expand Down Expand Up @@ -338,7 +364,8 @@ export const analyzeCommand = async (inputPath?: string, options?: AnalyzeOption
// needs a fresh pipelineResult. Has no bearing on the registry
// collision guard (see allowDuplicateName below).
force: options?.force || options?.skills,
embeddings: options?.embeddings,
embeddings: embeddingsEnabled,
embeddingsNodeLimit,
dropEmbeddings: options?.dropEmbeddings,
skipGit: options?.skipGit,
skipAgentsMd: options?.skipAgentsMd,
Expand Down
6 changes: 5 additions & 1 deletion gitnexus/src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ program
.command('analyze [path]')
.description('Index a repository (full analysis)')
.option('-f, --force', 'Force full re-index even if up to date')
.option('--embeddings', 'Enable embedding generation for semantic search (off by default)')
.option(
'--embeddings [limit]',
'Enable embedding generation for semantic search (off by default). ' +
'Optional [limit] overrides the 50,000-node safety cap; pass 0 to disable the cap entirely.',
)
.option(
'--drop-embeddings',
'Drop existing embeddings on rebuild. By default, an `analyze` without `--embeddings` ' +
Expand Down
32 changes: 32 additions & 0 deletions gitnexus/src/core/embedding-mode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,38 @@ export interface EmbeddingMode {
shouldLoadCache: boolean;
}

/** Default safety cap on graph node count for embedding generation. */
export const DEFAULT_EMBEDDING_NODE_LIMIT = 50_000;

/** Result of applying the node-count safety cap to a graph. */
export interface EmbeddingCapDecision {
  /** True when the node-count cap blocks generation for this graph. */
  skipForCap: boolean;
  /** True when the user explicitly disabled the cap (`--embeddings 0`). */
  capDisabled: boolean;
  /** Effective node limit applied (`0` means disabled). */
  nodeLimit: number;
}

/**
 * Decide whether the node-count safety cap blocks embedding generation.
 *
 * - `embeddingsNodeLimit === undefined` → use {@link DEFAULT_EMBEDDING_NODE_LIMIT}
 * - `embeddingsNodeLimit === 0` → cap disabled, generation always proceeds
 * - any positive number → custom cap (skip if `nodeCount > limit`)
 * - `NaN` or a negative number → treated as "no usable override" and the
 *   default cap is applied. Without this, `NaN` would silently switch the cap
 *   off (every `nodeCount > NaN` comparison is false) while `capDisabled`
 *   stayed false, and a negative value would skip every graph.
 *
 * Lives in `embedding-mode.ts` (not `run-analyze.ts`) so the branching
 * contract is unit-testable without spinning up LadybugDB or the pipeline.
 *
 * @param nodeCount Number of nodes in the graph being analyzed.
 * @param embeddingsNodeLimit Caller-supplied cap override, if any.
 * @returns The effective limit plus whether it is disabled / blocks this graph.
 */
export function deriveEmbeddingCap(
  nodeCount: number,
  embeddingsNodeLimit: number | undefined,
): EmbeddingCapDecision {
  // `NaN >= 0` is false, so this single comparison rejects both NaN and
  // negative overrides; `undefined` is handled by the explicit check.
  const nodeLimit =
    embeddingsNodeLimit !== undefined && embeddingsNodeLimit >= 0
      ? embeddingsNodeLimit
      : DEFAULT_EMBEDDING_NODE_LIMIT;
  const capDisabled = nodeLimit === 0;
  const skipForCap = !capDisabled && nodeCount > nodeLimit;
  return { skipForCap, capDisabled, nodeLimit };
}

export function deriveEmbeddingMode(
options: EmbeddingModeInput,
existingEmbeddingCount: number,
Expand Down
39 changes: 33 additions & 6 deletions gitnexus/src/core/run-analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ export interface AnalyzeOptions {
*/
force?: boolean;
embeddings?: boolean;
/**
* Override the auto-skip node-count cap for embedding generation.
* `undefined` (default) keeps the built-in 50,000-node safety limit;
* `0` disables the cap entirely; any positive integer sets a custom cap.
* Mapped from the CLI's `--embeddings [limit]` argument.
*/
embeddingsNodeLimit?: number;
/**
* Explicitly drop any embeddings present in the existing index instead of
* preserving them. Only meaningful when `embeddings` is false/undefined:
Expand Down Expand Up @@ -107,14 +114,15 @@ export interface AnalyzeResult {
pipelineResult?: any;
}

/** Threshold: auto-skip embeddings for repos with more nodes than this */
const EMBEDDING_NODE_LIMIT = 50_000;

// Re-export the pure flag-derivation helper so external callers (and tests)
// keep importing from this module's stable surface.
export { deriveEmbeddingMode } from './embedding-mode.js';
export { deriveEmbeddingMode, DEFAULT_EMBEDDING_NODE_LIMIT } from './embedding-mode.js';
export type { EmbeddingMode } from './embedding-mode.js';
import { deriveEmbeddingMode as _deriveEmbeddingMode } from './embedding-mode.js';
import {
deriveEmbeddingMode as _deriveEmbeddingMode,
deriveEmbeddingCap,
DEFAULT_EMBEDDING_NODE_LIMIT,
} from './embedding-mode.js';

export const PHASE_LABELS: Record<string, string> = {
extracting: 'Scanning files',
Expand Down Expand Up @@ -333,8 +341,27 @@ export async function runFullAnalysis(
let semanticMode: 'vector-index' | 'exact-scan' | undefined;

if (shouldGenerateEmbeddings) {
if (stats.nodes <= EMBEDDING_NODE_LIMIT) {
const { skipForCap, capDisabled, nodeLimit } = deriveEmbeddingCap(
stats.nodes,
options.embeddingsNodeLimit,
);
if (!skipForCap) {
embeddingSkipped = false;
if (capDisabled && stats.nodes > DEFAULT_EMBEDDING_NODE_LIMIT) {
log(
`Embedding node-count cap disabled — generating embeddings for ` +
`${stats.nodes.toLocaleString()} nodes. Ensure sufficient memory; ` +
`the default ${DEFAULT_EMBEDDING_NODE_LIMIT.toLocaleString()}-node ` +
`cap exists to prevent OOM.`,
);
}
} else {
log(
`Embeddings skipped: ${stats.nodes.toLocaleString()} nodes exceeds ` +
`the ${nodeLimit.toLocaleString()}-node safety cap. ` +
`Override with \`--embeddings 0\` to disable the cap, or ` +
`\`--embeddings <n>\` to set a custom cap.`,
);
}
}

Expand Down
101 changes: 101 additions & 0 deletions gitnexus/test/unit/analyze-embeddings-limit.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';

// Stand-in for `runFullAnalysis`; the tests below inspect the options object
// it receives (second call argument) to verify how `--embeddings` was parsed.
const runFullAnalysisMock = vi.fn();

// Replace the analysis entry point with the spy above so no real analysis runs.
vi.mock('../../src/core/run-analyze.js', () => ({
runFullAnalysis: runFullAnalysisMock,
}));

// `closeLbug` becomes an async no-op.
vi.mock('../../src/core/lbug/lbug-adapter.js', () => ({
closeLbug: vi.fn(async () => undefined),
}));

// Storage helpers return fixed paths; the error classes are bare `Error`
// subclasses and `assertAnalysisFinalized` always resolves.
vi.mock('../../src/storage/repo-manager.js', () => ({
getStoragePaths: vi.fn(() => ({ storagePath: '.gitnexus', lbugPath: '.gitnexus/lbug' })),
getGlobalRegistryPath: vi.fn(() => 'registry.json'),
RegistryNameCollisionError: class RegistryNameCollisionError extends Error {},
AnalysisNotFinalizedError: class AnalysisNotFinalizedError extends Error {},
assertAnalysisFinalized: vi.fn(async () => undefined),
}));

// Git helpers report a repository rooted at '/repo' that has a git directory.
vi.mock('../../src/storage/git.js', () => ({
getGitRoot: vi.fn(() => '/repo'),
hasGitDir: vi.fn(() => true),
}));

// The max-file-size banner helper returns null (presumably "no banner" — confirm
// against the caller in analyze.ts).
vi.mock('../../src/core/ingestion/utils/max-file-size.js', () => ({
getMaxFileSizeBannerMessage: vi.fn(() => null),
}));

/**
 * Verifies how `analyzeCommand` interprets the `--embeddings [limit]` option:
 * invalid values are rejected before `runFullAnalysis` is called, and valid
 * values are forwarded as `embeddings` (boolean) plus `embeddingsNodeLimit`.
 */
describe('analyzeCommand --embeddings [limit] parsing', () => {
  beforeEach(() => {
    vi.resetModules();
    runFullAnalysisMock.mockReset();
    runFullAnalysisMock.mockResolvedValue({
      repoName: 'repo',
      repoPath: '/repo',
      stats: {},
      alreadyUpToDate: true,
    });
    process.exitCode = undefined;

    // NOTE(review): presumably this satisfies the heap check in analyze.ts
    // (`ensureHeap`) so the command does not try to re-spawn — confirm.
    // Append the flag only once: this hook runs before every test, and
    // unconditional appending made NODE_OPTIONS grow with duplicate flags.
    const heapFlag = '--max-old-space-size=8192';
    const currentNodeOptions = process.env.NODE_OPTIONS ?? '';
    if (!currentNodeOptions.includes(heapFlag)) {
      process.env.NODE_OPTIONS = `${currentNodeOptions} ${heapFlag}`.trim();
    }
  });

  it.each(['abc', '-1', '1.5', 'NaN', 'Infinity'])(
    'rejects invalid --embeddings value %s before analysis starts',
    async (embeddings) => {
      const errorSpy = vi.spyOn(console, 'error').mockImplementation(() => undefined);
      try {
        const { analyzeCommand } = await import('../../src/cli/analyze.js');

        await analyzeCommand(undefined, { embeddings });

        expect(process.exitCode).toBe(1);
        expect(runFullAnalysisMock).not.toHaveBeenCalled();
        const msg = errorSpy.mock.calls[0]?.[0] ?? '';
        expect(msg).toContain('--embeddings expects a non-negative integer');
        expect(msg).toContain(`got "${embeddings}"`);
      } finally {
        // Restore even when an assertion throws, so a failing case cannot
        // leave console.error silenced for the tests that follow.
        errorSpy.mockRestore();
      }
    },
  );

  it('bare --embeddings forwards undefined limit (default cap honored downstream)', async () => {
    const { analyzeCommand } = await import('../../src/cli/analyze.js');

    await analyzeCommand(undefined, { embeddings: true });

    expect(runFullAnalysisMock).toHaveBeenCalledTimes(1);
    const opts = runFullAnalysisMock.mock.calls[0][1];
    expect(opts.embeddings).toBe(true);
    expect(opts.embeddingsNodeLimit).toBeUndefined();
  });

  it('--embeddings 0 forwards 0 (cap disabled downstream)', async () => {
    const { analyzeCommand } = await import('../../src/cli/analyze.js');

    await analyzeCommand(undefined, { embeddings: '0' });

    const opts = runFullAnalysisMock.mock.calls[0][1];
    expect(opts.embeddings).toBe(true);
    expect(opts.embeddingsNodeLimit).toBe(0);
  });

  it('--embeddings <n> forwards a positive custom cap', async () => {
    const { analyzeCommand } = await import('../../src/cli/analyze.js');

    await analyzeCommand(undefined, { embeddings: '100000' });

    const opts = runFullAnalysisMock.mock.calls[0][1];
    expect(opts.embeddings).toBe(true);
    expect(opts.embeddingsNodeLimit).toBe(100_000);
  });

  it('omitted --embeddings keeps embeddings off (boolean false, no limit)', async () => {
    const { analyzeCommand } = await import('../../src/cli/analyze.js');

    await analyzeCommand(undefined, {});

    const opts = runFullAnalysisMock.mock.calls[0][1];
    expect(opts.embeddings).toBe(false);
    expect(opts.embeddingsNodeLimit).toBeUndefined();
  });
});
42 changes: 41 additions & 1 deletion gitnexus/test/unit/run-analyze.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { execSync } from 'child_process';
import fs from 'fs/promises';
import path from 'path';
import { describe, it, expect } from 'vitest';
import { deriveEmbeddingMode } from '../../src/core/embedding-mode.js';
import {
deriveEmbeddingMode,
deriveEmbeddingCap,
DEFAULT_EMBEDDING_NODE_LIMIT,
} from '../../src/core/embedding-mode.js';
import { getStoragePaths, saveMeta, type RepoMeta } from '../../src/storage/repo-manager.js';
import { createTempDir } from '../helpers/test-db.js';

Expand Down Expand Up @@ -136,3 +140,39 @@ describe('deriveEmbeddingMode', () => {
expect(m.preserveExistingEmbeddings).toBe(false);
});
});

// Contract tests for the pure cap-decision helper: default cap, boundary,
// disabled cap (limit 0), and custom caps above and below the default.
describe('deriveEmbeddingCap', () => {
  it('uses the default 50K cap when limit is undefined', () => {
    const { nodeLimit, capDisabled, skipForCap } = deriveEmbeddingCap(10_000, undefined);
    expect(nodeLimit).toBe(DEFAULT_EMBEDDING_NODE_LIMIT);
    expect(capDisabled).toBe(false);
    expect(skipForCap).toBe(false);
  });

  it('skips when node count exceeds the default cap', () => {
    const { skipForCap, capDisabled } = deriveEmbeddingCap(75_000, undefined);
    expect(skipForCap).toBe(true);
    expect(capDisabled).toBe(false);
  });

  it('does not skip when node count equals the default cap (boundary)', () => {
    // Exactly at the limit is still allowed; only strictly larger graphs skip.
    const atBoundary = deriveEmbeddingCap(DEFAULT_EMBEDDING_NODE_LIMIT, undefined);
    expect(atBoundary.skipForCap).toBe(false);
  });

  it('limit=0 disables the cap regardless of node count', () => {
    const { capDisabled, skipForCap, nodeLimit } = deriveEmbeddingCap(1_000_000, 0);
    expect(capDisabled).toBe(true);
    expect(skipForCap).toBe(false);
    expect(nodeLimit).toBe(0);
  });

  it('honors a custom positive cap', () => {
    const customCap = 100_000;
    const justUnder = deriveEmbeddingCap(99_999, customCap);
    expect(justUnder.skipForCap).toBe(false);
    const justOver = deriveEmbeddingCap(100_001, customCap);
    expect(justOver.skipForCap).toBe(true);
  });

  it('custom cap below default still applies', () => {
    const overSmallCap = deriveEmbeddingCap(15_000, 10_000);
    expect(overSmallCap.skipForCap).toBe(true);
  });
});
Loading