diff --git a/README.md b/README.md index d55e8ef94b..44f56edd01 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,7 @@ gitnexus analyze --skip-git # Index folders that are not Git repositories gitnexus analyze --embeddings # Enable embedding generation (slower, better search) gitnexus analyze --verbose # Log skipped files when parsers are unavailable gitnexus analyze --worker-timeout 60 # Increase worker idle timeout for slow parses +gitnexus analyze --wal-checkpoint-threshold 67108864 # 64 MiB. Control LadybugDB WAL auto-checkpoint threshold (default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB) gitnexus analyze --workers # Parse worker pool size (default: cores-1, capped at 16; 0 = sequential) gitnexus mcp # Start MCP server (stdio) — serves all indexed repos gitnexus serve # Start local HTTP server (multi-repo) for web UI connection @@ -241,6 +242,7 @@ Most `analyze` knobs are also CLI flags (`--workers`, `--worker-timeout`, `--max | `GITNEXUS_PROFILE_DEFERRED_SLOW_MS` | `3000` (verbose) / `5000` | Per-file threshold in ms above which `processCallsFromExtracted` emits a `slow file …` log line. Parsed via `Number()`: accepts integers (`5000`), scientific notation (`2.5e3`), decimals (`.5`), and hex (`0x10`). Non-finite or non-positive values fall back to the default. | Hunting a few outlier files dominating the deferred call-resolution stage; lower to surface more, raise to focus only on the worst. | | `GITNEXUS_MAX_FILE_SIZE` | `512` (KB) | Walker skip threshold in KB. Hard cap is `32768` (tree-sitter buffer ceiling). Equivalent to `--max-file-size `. | Indexing repos with intentionally-large source files (generated parsers, vendored bundles) that should still be parsed. | | `GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS` | `30000` | Worker idle timeout in milliseconds before retry/fallback. Equivalent to `--worker-timeout ` × 1000. | Slow-parsing files (large minified JS, deeply-nested TS types) that legitimately need more than 30s. | +| `GITNEXUS_WAL_CHECKPOINT_THRESHOLD` | `67108864` (64 MiB) | LadybugDB WAL auto-checkpoint threshold in bytes. Equivalent to `--wal-checkpoint-threshold `. `-1` keeps LadybugDB's stock threshold (~16 MiB). Larger thresholds reduce checkpoint frequency but increase the WAL size at rotation time — choose a smaller value on disk-constrained environments. | You need a larger or smaller WAL auto-checkpoint threshold for your analyze workload. | | `GITNEXUS_WORKER_SUB_BATCH_MAX_BYTES` | `8388608` (8 MB) | Per-job byte budget the pool will send to a worker in one `postMessage`. | Very large individual files; mostly diagnostic — bumping past 8 MB risks structured-clone memory pressure. | | `GITNEXUS_WORKER_MAX_RESPAWNS_PER_SLOT` | `3` | Max replacement spawns per worker slot before the slot is dropped from the active rotation. Bounds respawn loops on a chronically-crashing slot. | Hosts where a flaky worker should retry more (raise) or fail-fast (lower) before the slot is dropped. | | `GITNEXUS_WORKER_MAX_CUMULATIVE_TIMEOUT_MS` | `5 × subBatchTimeoutMs` | Total retry wall-time budget per job before quarantining. Combined with `timeoutBackoffFactor`, prevents exponentially-growing retries from stalling for hours. | Slow files that legitimately need long total retry windows; lower to fail-fast on stalls. | diff --git a/gitnexus/README.md b/gitnexus/README.md index 61ae131ace..9b18e1dbbe 100644 --- a/gitnexus/README.md +++ b/gitnexus/README.md @@ -158,6 +158,7 @@ gitnexus analyze --skip-agents-md # Preserve custom AGENTS.md/CLAUDE.md gitnexu gitnexus analyze --verbose # Log skipped files when parsers are unavailable gitnexus analyze --max-file-size 1024 # Skip files larger than N KB (default: 512, cap: 32768) gitnexus analyze --worker-timeout 60 # Increase worker idle timeout for slow parses +gitnexus analyze --wal-checkpoint-threshold 67108864 # 64 MiB. Control LadybugDB WAL auto-checkpoint threshold (default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB) gitnexus mcp # Start MCP server (stdio) — serves all indexed repos gitnexus serve # Start local HTTP server (multi-repo) for web UI gitnexus index # Register an existing .gitnexus/ folder into the global registry @@ -307,6 +308,7 @@ Configure the behavior with two environment variables: |----------|--------|---------|--------| | `GITNEXUS_LBUG_EXTENSION_INSTALL` | `auto`, `load-only`, `never` | `auto` | `auto` runs one bounded INSTALL if LOAD fails. `load-only` only uses already-installed extensions (recommended for offline / firewalled environments). `never` skips optional extensions entirely. | | `GITNEXUS_LBUG_EXTENSION_INSTALL_TIMEOUT_MS` | positive integer | `15000` | Wall-clock budget for the out-of-process `INSTALL` child before it is killed. | +| `GITNEXUS_WAL_CHECKPOINT_THRESHOLD` | integer `>= -1` | `67108864` (64 MiB) | LadybugDB WAL auto-checkpoint threshold during analyze (bytes). Auto-checkpoint remains enabled; `-1` keeps Ladybug's stock ~16 MiB. Larger thresholds reduce checkpoint frequency but increase the WAL size at rotation time — choose a smaller value on disk-constrained environments. | ```bash # Offline/airgapped: never reach the network for extensions diff --git a/gitnexus/src/cli/analyze.ts b/gitnexus/src/cli/analyze.ts index 55995b76bd..32ceaca623 100644 --- a/gitnexus/src/cli/analyze.ts +++ b/gitnexus/src/cli/analyze.ts @@ -13,7 +13,12 @@ import { spawn } from 'child_process'; import v8 from 'v8'; import cliProgress from 'cli-progress'; import { closeLbug } from '../core/lbug/lbug-adapter.js'; -import { isWalCorruptionError, WAL_RECOVERY_SUGGESTION } from '../core/lbug/lbug-config.js'; +import { + isLbugCheckpointIoError, + isWalCorruptionError, + parseWalCheckpointThreshold, + WAL_RECOVERY_SUGGESTION, +} from '../core/lbug/lbug-config.js'; import { getStoragePaths, getGlobalRegistryPath, @@ -415,6 +420,15 @@ const forceHeapOOMForTestIfEnabled = (): void => { for (;;) chunks.push('x'.repeat(1024 * 1024)); }; +// 64 MiB keeps auto-checkpoint enabled but triggers less frequently than +// Ladybug's stock ~16 MiB threshold, reducing rename/remove churn on large +// runs. Also matches the GitNexus default in `lbug-config.ts`. +// +// IMPORTANT: keep README examples (`README.md`, `gitnexus/README.md`) and +// the `DEFAULT_WAL_CHECKPOINT_THRESHOLD` constant in +// `gitnexus/src/core/lbug/lbug-config.ts` in sync with this value. +const RECOMMENDED_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024; + /** Re-exec the process with a 16GB heap and larger stack if we're currently below that. */ async function ensureHeap(): Promise { const nodeOpts = process.env.NODE_OPTIONS || ''; @@ -477,6 +491,8 @@ const ANALYZE_CLI_ENV_KEYS = [ 'GITNEXUS_PROFILE_DEFERRED_SLOW_MS', 'GITNEXUS_MAX_FILE_SIZE', 'GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS', + 'GITNEXUS_WAL_CHECKPOINT_THRESHOLD', + 'GITNEXUS_WAL_MANUAL_CHECKPOINT', 'GITNEXUS_EMBEDDING_THREADS', 'GITNEXUS_EMBEDDING_BATCH_SIZE', 'GITNEXUS_EMBEDDING_SUB_BATCH_SIZE', @@ -562,6 +578,8 @@ export interface AnalyzeOptions { maxFileSize?: string; /** Override worker sub-batch idle timeout in seconds. */ workerTimeout?: string; + /** Control LadybugDB WAL auto-checkpoint threshold during analyze. */ + walCheckpointThreshold?: string; /** Parse worker pool size; 0 disables workers (sequential fallback). */ workers?: string; embeddingThreads?: string; @@ -633,6 +651,16 @@ const analyzeCommandImpl = async (inputPath?: string, options?: AnalyzeOptions): ); } + if (options?.walCheckpointThreshold !== undefined) { + const parsed = parseWalCheckpointThreshold(options.walCheckpointThreshold); + if (parsed === undefined) { + cliError(' --wal-checkpoint-threshold must be an integer >= -1.\n'); + process.exitCode = 1; + return; + } + process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD = String(parsed); + } + // `--workers` is threaded through `runFullAnalysis` options → PipelineOptions // → createWorkerPool, intentionally bypassing the GITNEXUS_WORKER_POOL_SIZE // env channel so this CLI surface never mutates `process.env` for pool size. @@ -1130,6 +1158,20 @@ const analyzeCommandImpl = async (inputPath?: string, options?: AnalyzeOptions): return; } + if (isLbugCheckpointIoError(err)) { + cliError( + ` LadybugDB failed while rotating/removing WAL checkpoint files.\n` + + ` This can happen when auto-checkpoint runs at the default threshold (~16MB).\n` + + ` Retry with a larger checkpoint threshold to reduce checkpoint frequency:\n` + + ` gitnexus analyze --wal-checkpoint-threshold ${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD}\n` + + ` (or set GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD})\n` + + ` (Try 33554432 = 32 MiB on small-disk / CI runners.)\n`, + { recoveryHint: 'wal-checkpoint-threshold' }, + ); + process.exitCode = 1; + return; + } + // HF download failure — show clean guidance without the raw stack trace. // Checked before writeFatalToStderr so the user sees one focused message // rather than a stack-trace dump followed by a second remediation block. diff --git a/gitnexus/src/cli/cli-message.ts b/gitnexus/src/cli/cli-message.ts index db5f51fab5..1f1eb9521e 100644 --- a/gitnexus/src/cli/cli-message.ts +++ b/gitnexus/src/cli/cli-message.ts @@ -29,6 +29,38 @@ */ import { logger } from '../core/logger.js'; +/** + * String-literal union of all `recoveryHint` tags emitted by the CLI. + * + * Centralized so a new recovery branch added in `analyze.ts` cannot land + * without updating this union — TypeScript will reject the unknown literal + * passed via `cliError({ recoveryHint: '...' })`. To add a new hint: + * 1. Add the tag string to this union. + * 2. Pass it as the `recoveryHint` field at the relevant `cliError` + * call site. + * + * Consumers can import this type to narrow log-record `recoveryHint` + * fields without restating the literal list. + */ +export type RecoveryHint = + | 'wal-corruption' + | 'wal-checkpoint-threshold' + | 'heap-oom-respawn' + | 'native-worker-abort' + | 'hf-endpoint-unreachable' + | 'large-repo' + | 'npm-resolution' + | 'module-not-found'; + +/** + * Common shape for the optional structured-field bag passed to + * `cliError`/`cliWarn`/`cliInfo`. Typed so the `recoveryHint` slot is + * checked against the {@link RecoveryHint} union. + */ +export interface CliMessageFields extends Record { + recoveryHint?: RecoveryHint; +} + function writeStderr(msg: string): void { // Direct write — bypassing `console.*` so it cannot be intercepted by // progress-bar redirection (see `cli/analyze.ts:barLog`) or other @@ -41,7 +73,7 @@ function writeStderr(msg: string): void { * User-facing informational message. Use for banners, listening URLs, * and any message the user expects to read in plain text. */ -export function cliInfo(msg: string, fields?: Record): void { +export function cliInfo(msg: string, fields?: CliMessageFields): void { writeStderr(msg); logger.info(fields ?? {}, msg); } @@ -50,7 +82,7 @@ export function cliInfo(msg: string, fields?: Record): void { * User-facing warning. Operator-actionable but non-fatal — `cliWarn` * indicates the command can still proceed in some form. */ -export function cliWarn(msg: string, fields?: Record): void { +export function cliWarn(msg: string, fields?: CliMessageFields): void { writeStderr(msg); logger.warn(fields ?? {}, msg); } @@ -59,7 +91,7 @@ export function cliWarn(msg: string, fields?: Record): void { * User-facing error. Indicates the command cannot proceed; usually * paired with a non-zero exit code at the call site. */ -export function cliError(msg: string, fields?: Record): void { +export function cliError(msg: string, fields?: CliMessageFields): void { writeStderr(msg); logger.error(fields ?? {}, msg); } diff --git a/gitnexus/src/cli/index.ts b/gitnexus/src/cli/index.ts index 31446cf6fb..17fab20685 100644 --- a/gitnexus/src/cli/index.ts +++ b/gitnexus/src/cli/index.ts @@ -71,6 +71,11 @@ program '--worker-timeout ', 'Worker sub-batch idle timeout before retry/fallback. Default: 30.', ) + .option( + '--wal-checkpoint-threshold ', + 'LadybugDB WAL auto-checkpoint threshold in bytes during analyze ' + + '(integer >= -1; default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).', + ) .option( '--workers ', 'Parse worker pool size. Default: cores-1 capped at 16. Pass 0 to disable workers (sequential).', @@ -85,6 +90,7 @@ program ' GITNEXUS_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .gitnexusignore)\n' + ' GITNEXUS_MAX_FILE_SIZE=N Override large-file skip threshold (KB). Default 512, max 32768.\n' + ' GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS=N Worker idle timeout in milliseconds. Default 30000.\n' + + ' GITNEXUS_WAL_CHECKPOINT_THRESHOLD=N LadybugDB WAL auto-checkpoint threshold in bytes (default 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).\n' + ' GITNEXUS_WORKER_SUB_BATCH_MAX_BYTES=N Worker job byte budget. Default 8388608.\n' + ' GITNEXUS_WORKER_POOL_SIZE=N Parse worker count override. Default cores-1 capped at 16.\n' + ' GITNEXUS_PARSE_CHUNK_CONCURRENCY=N Concurrent in-flight parse chunks. Default 2.\n' + @@ -93,6 +99,7 @@ program ' GITNEXUS_WORKER_CONSECUTIVE_FAILURE_THRESHOLD=N Per-slot deaths to trip circuit breaker. Default max(3, poolSize).\n' + ' GITNEXUS_EMBEDDING_THREADS=N Limit local ONNX CPU threads for --embeddings.\n' + ' GITNEXUS_SEMANTIC_EXACT_SCAN_LIMIT=N Max embedding chunks for exact-scan fallback. Default 10000.\n' + + '\nFlags override the corresponding env vars when both are provided.\n' + '\nTip: `.gitnexusignore` supports `.gitignore`-style negation. Add e.g.\n' + ' `!__tests__/` to index a directory that is auto-filtered by default (#771).', ) diff --git a/gitnexus/src/core/lbug/lbug-adapter.ts b/gitnexus/src/core/lbug/lbug-adapter.ts index da9809dbae..5e2a346019 100644 --- a/gitnexus/src/core/lbug/lbug-adapter.ts +++ b/gitnexus/src/core/lbug/lbug-adapter.ts @@ -1527,6 +1527,27 @@ export const flushWAL = async (): Promise => { } }; +/** + * Issue a manual `CHECKPOINT` against the current connection and surface + * any engine error to the caller. Unlike {@link flushWAL}, this variant + * does NOT swallow Ladybug rename/remove IO failures — the manual + * checkpoint driver (`wal-checkpoint-driver.ts`) relies on the rejection + * to drive its bounded retry loop. Returns `false` when no connection is + * open (the caller treats this as a no-op success — there is no WAL to + * flush). Returns `true` after a successful CHECKPOINT + drain. + * + * The split from `flushWAL` is deliberate: every other CHECKPOINT site + * (server flush, safeClose) is best-effort and prefers a silent skip; + * the manual driver, by contrast, must observe failures to decide + * whether to retry. + */ +export const tryFlushWAL = async (): Promise => { + if (!conn) return false; + const checkpointResult = await conn.query('CHECKPOINT'); + await drainQueryResult(checkpointResult); + return true; +}; + /** * Flush the WAL and close the connection and database handles. * diff --git a/gitnexus/src/core/lbug/lbug-config.ts b/gitnexus/src/core/lbug/lbug-config.ts index 22f2d3b184..a71b25930a 100644 --- a/gitnexus/src/core/lbug/lbug-config.ts +++ b/gitnexus/src/core/lbug/lbug-config.ts @@ -2,6 +2,7 @@ import fs from 'fs/promises'; import os from 'os'; import path from 'path'; import type lbug from '@ladybugdb/core'; +import { logger } from '../logger.js'; /** * Shared configuration for `@ladybugdb/core` `Database` construction. @@ -45,6 +46,44 @@ export const LBUG_MAX_DB_SIZE: number = (() => { return 16 * 1024 * 1024 * 1024; })(); +export const parseWalCheckpointThreshold = (raw: string | undefined): number | undefined => { + if (raw === undefined) return undefined; + const normalized = raw.trim(); + if (normalized.length === 0) return undefined; + const parsed = Number(normalized); + if (!Number.isInteger(parsed) || parsed < -1) return undefined; + return parsed; +}; + +/** + * Default GitNexus WAL auto-checkpoint threshold in bytes (64 MiB). + * + * Larger than Ladybug's stock ~16 MiB to reduce checkpoint rename/remove + * churn under heavy analyze write load — the original race that motivated + * issue #1741 triggered at the stock threshold. README examples in + * `README.md` and `gitnexus/README.md` and the recovery hint in + * `analyze.ts` MUST stay in sync with this value. + */ +const DEFAULT_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024; + +const resolveCheckpointThreshold = (): number => { + const raw = process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD; + if (raw === undefined) return DEFAULT_WAL_CHECKPOINT_THRESHOLD; + const parsed = parseWalCheckpointThreshold(raw); + if (parsed !== undefined) return parsed; + // Non-empty but unparseable input: warn the operator and fall back. Mirrors + // the CLI's `--wal-checkpoint-threshold` validation (which hard-errors) + // but the env-var path stays soft to preserve "set once in your shell" + // ergonomics across mixed-version invocations. + if (raw.trim().length > 0) { + logger.warn( + { rawValue: raw, fallback: DEFAULT_WAL_CHECKPOINT_THRESHOLD }, + `Ignoring invalid GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${raw}; expected integer >= -1; falling back to default (${DEFAULT_WAL_CHECKPOINT_THRESHOLD}).`, + ); + } + return DEFAULT_WAL_CHECKPOINT_THRESHOLD; +}; + /** Matches WAL corruption errors from the LadybugDB engine. */ const WAL_CORRUPTION_RE = /corrupt(ed)?\s+wal|invalid\s+wal\s+record|wal.*corrupt|checksum.*wal/i; @@ -57,6 +96,50 @@ export function isWalCorruptionError(err: unknown): boolean { return WAL_CORRUPTION_RE.test(msg); } +// ─── Ladybug WAL checkpoint IO error matchers ─────────────────────────────── +// +// Matched against LadybugDB v0.16.1 (see `gitnexus/package.json` +// @ladybugdb/core). Strict regexes encode local_file_system.cpp wording +// verified at that version. Two-tier strategy: strict matchers first so we +// only fire on real checkpoint-rotation shapes; a permissive fallback +// catches future Ladybug message drift so the recovery hint keeps surfacing +// even if upstream wording changes. +// +// From Ladybug native LocalFileSystem exceptions (`local_file_system.cpp`), +// surfaced in Node as: +// "Runtime exception: IO exception: Error renaming file ..." +// "Runtime exception: IO exception: Error removing directory or file ..." +// We only match checkpoint-rotation shapes: +// - ".wal -> .wal.checkpoint" rename failures +// - ".wal.checkpoint" remove failures +// Example matches: +// "Runtime exception: IO exception: Error renaming file /x/lbug.wal to /x/lbug.wal.checkpoint. ErrorMessage: Permission denied" +// "Runtime exception: IO exception: Error removing directory or file /x/lbug.wal.checkpoint. Error Message: Permission denied" +// Matching is case-insensitive to remain robust across wrappers/platforms. +const LBUG_CHECKPOINT_RENAME_RE = + /^runtime exception: io exception:\s*error renaming file\s+.+?\.wal\s+to\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i; +const LBUG_CHECKPOINT_REMOVE_RE = + /^runtime exception: io exception:\s*error removing directory or file\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i; +/** + * Permissive fallback: any IO-exception-shaped message that mentions a + * `.wal.checkpoint` path. Catches future Ladybug message drift (different + * verb, additional preamble, locale variation) so the recovery hint keeps + * surfacing even if the strict regexes go stale. + */ +const LBUG_CHECKPOINT_PERMISSIVE_RE = /io exception.*\.wal\.checkpoint/i; + +/** + * True when `err` looks like a Ladybug WAL-checkpoint rotation/remove IO + * failure. Tries strict matchers first (renames + removes), then falls + * back to the permissive matcher. + */ +export const isLbugCheckpointIoError = (err: unknown): boolean => { + if (!err) return false; + const msg = err instanceof Error ? err.message : String(err); + if (LBUG_CHECKPOINT_RENAME_RE.test(msg) || LBUG_CHECKPOINT_REMOVE_RE.test(msg)) return true; + return LBUG_CHECKPOINT_PERMISSIVE_RE.test(msg); +}; + type LbugModule = typeof lbug; export interface LbugDatabaseOptions { @@ -103,8 +186,8 @@ export function createLbugDatabase( false, // enableCompression (pinned for v0.16.0) options.readOnly ?? false, LBUG_MAX_DB_SIZE, - true, // autoCheckpoint - -1, // checkpointThreshold + true, // autoCheckpoint (always on) + resolveCheckpointThreshold(), // checkpointThreshold (default 64 MiB; override with GITNEXUS_WAL_CHECKPOINT_THRESHOLD; -1 keeps Ladybug stock ~16 MiB) options.throwOnWalReplayFailure ?? true, true, // enableChecksums ) as lbug.Database; diff --git a/gitnexus/src/core/lbug/wal-checkpoint-driver.ts b/gitnexus/src/core/lbug/wal-checkpoint-driver.ts new file mode 100644 index 0000000000..57dc6b2d68 --- /dev/null +++ b/gitnexus/src/core/lbug/wal-checkpoint-driver.ts @@ -0,0 +1,232 @@ +/** + * Manual WAL checkpoint driver with bounded retry (#1741 follow-up). + * + * Background + * ---------- + * LadybugDB's native auto-checkpoint runs from inside the C++ engine on a + * background path that has no JS-side hook for mid-write rotation. When + * the rename of `.wal` → `.wal.checkpoint` races a transient file + * lock (Windows Defender, AV scanner, NTFS shadow copy) the engine raises + * a `Runtime exception: IO exception: Error renaming file …` that aborts + * the in-flight write. There is no engine-level retry. + * + * The auto-checkpoint cannot be made retryable from JS, but a *manual* + * `CHECKPOINT` query that the JS layer issues itself CAN be wrapped in a + * bounded retry. By draining the WAL on a tight cadence — more often than + * the native threshold — the auto-checkpoint almost never has work left + * to do, so the un-retriable native rename race is moved into the + * JS-controlled path where this module's retry absorbs it. + * + * Design contract + * --------------- + * - `autoCheckpoint` stays on (maintainer requirement). This driver is + * additive: it preempts the native checkpoint, it does not replace it. + * - The driver runs ONLY during analyze (callers opt-in explicitly). MCP + * and other long-lived flows continue to rely on the close-time + * CHECKPOINT in `safeClose`. + * - Opt-out is via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`. Default is on. + * - Retries only fire on `isLbugCheckpointIoError` — every other error + * surfaces immediately. The retry budget is small (3 attempts) with + * jittered backoff so a chronic rename failure escalates fast. + * - Retry attempts log at `debug`; only the final, exhausted failure + * surfaces to the caller (and is logged at `warn` here for operators). + */ + +import { logger } from '../logger.js'; +import { tryFlushWAL } from './lbug-adapter.js'; +import { isLbugCheckpointIoError } from './lbug-config.js'; + +/** + * Bounded retry budget. Total worst-case wall time is dominated by the + * three sleeps below (~750 ms before jitter) plus three CHECKPOINT round + * trips — small enough to stay invisible during a large analyze, large + * enough to ride out a single AV scanner sweep on Windows. + */ +const CHECKPOINT_RETRY_ATTEMPTS = 3; + +/** + * Base back-off in ms. Each attempt waits `BASE_DELAYS[attempt-1]` + * milliseconds before the next try, plus a small jitter to avoid + * synchronized retries when multiple analyzers ever share a host. + */ +const BASE_DELAYS_MS: readonly number[] = [50, 200, 500]; + +/** Maximum jitter added on top of each base delay. */ +const JITTER_MAX_MS = 50; + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Run a single CHECKPOINT with bounded retry on + * `isLbugCheckpointIoError`. Returns the number of attempts actually + * spent (1-`CHECKPOINT_RETRY_ATTEMPTS`) on success, or rethrows the last + * checkpoint error after exhausting the budget. Non-checkpoint errors + * (e.g. WAL corruption, lock-busy) propagate immediately on the first + * attempt — those are not what this retry is designed to absorb. + * + * The split from `flushWAL` is deliberate: `flushWAL` is the swallow-and- + * log helper used by `safeClose` and the server's best-effort flush, + * which by contract cannot fail the surrounding operation. The manual + * driver MUST observe failures to decide whether to retry, and that is + * the role of `tryFlushWAL`. + * + * Exported for direct unit testing — production callers use + * {@link startWalCheckpointDriver} or {@link checkpointOnce}. + */ +export const runCheckpointWithRetry = async ( + options: { + /** Override the sleep implementation for tests. */ + sleepFn?: (ms: number) => Promise; + /** Override the CHECKPOINT call for tests. */ + checkpointFn?: () => Promise; + /** Override the jitter source for tests. Returns a value in [0, 1). */ + randomFn?: () => number; + } = {}, +): Promise<{ attempts: number; flushed: boolean }> => { + const sleepImpl = options.sleepFn ?? sleep; + const checkpointImpl = options.checkpointFn ?? tryFlushWAL; + const randomImpl = options.randomFn ?? Math.random; + + let lastError: unknown; + for (let attempt = 1; attempt <= CHECKPOINT_RETRY_ATTEMPTS; attempt++) { + try { + const flushed = await checkpointImpl(); + return { attempts: attempt, flushed }; + } catch (err) { + lastError = err; + if (!isLbugCheckpointIoError(err)) { + // Non-checkpoint error — propagate immediately. Examples: + // WAL corruption, missing connection, query syntax failure. + // Retrying these would only mask the real signal. + throw err; + } + if (attempt === CHECKPOINT_RETRY_ATTEMPTS) break; + const base = BASE_DELAYS_MS[Math.min(attempt - 1, BASE_DELAYS_MS.length - 1)] ?? 500; + // randomImpl defaults to Math.random — non-cryptographic by design; jitter only avoids + // synchronized retries between concurrent analyzers. + const delayMs = base + Math.floor(randomImpl() * JITTER_MAX_MS); + logger.debug( + { attempt, totalAttempts: CHECKPOINT_RETRY_ATTEMPTS, delayMs }, + 'GitNexus: WAL checkpoint IO error — retrying', + ); + await sleepImpl(delayMs); + } + } + + logger.warn( + { attempts: CHECKPOINT_RETRY_ATTEMPTS }, + 'GitNexus: manual WAL checkpoint exhausted retry budget — surfacing IO error to caller', + ); + throw lastError; +}; + +/** + * Single-shot manual checkpoint. Use this when the caller drives the + * cadence itself (e.g. a phase boundary in `runFullAnalysis`). + * + * Honors the `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` opt-out so operators can + * disable the manual path if it ever interacts badly with a future + * Ladybug release. + */ +export const checkpointOnce = async (): Promise => { + if (!isManualCheckpointEnabled()) return; + await runCheckpointWithRetry(); +}; + +/** Default cadence (ms) for the periodic driver. */ +const DEFAULT_PERIOD_MS = 5_000; + +/** + * Start a periodic manual checkpoint driver. The returned handle has a + * `stop()` method that resolves once the in-flight checkpoint (if any) + * settles, so callers can `await driver.stop()` before close-time + * `safeClose` and avoid racing the final flush. + * + * The first checkpoint fires after `periodMs` (not immediately) so a + * cold analyze does not pay a CHECKPOINT round trip before any writes + * have happened. + */ +export interface WalCheckpointDriver { + /** Stop the driver and await any in-flight checkpoint. Idempotent. */ + stop(): Promise; +} + +export const startWalCheckpointDriver = ( + options: { periodMs?: number } = {}, +): WalCheckpointDriver => { + if (!isManualCheckpointEnabled()) { + return { stop: async () => undefined }; + } + + const periodMs = options.periodMs ?? DEFAULT_PERIOD_MS; + let stopped = false; + let inflight: Promise | null = null; + + const tick = async (): Promise => { + if (stopped) return; + inflight = runCheckpointWithRetry() + .then(() => undefined) + .catch((err) => { + // The retry budget exhausted. The caller's surrounding write + // will see the same engine error on its next operation and the + // `analyzeCommand` catch block will emit the recovery hint. + // Logging here keeps the operator-visible trail without + // double-logging the user-facing message. + logger.warn( + { err: err instanceof Error ? err.message : String(err) }, + 'GitNexus: manual WAL checkpoint failed after retries', + ); + }); + try { + await inflight; + } finally { + inflight = null; + } + }; + + const handle = setInterval(() => { + // Fire-and-forget: setInterval cannot await directly. The next tick + // is guarded by `stopped` and the `inflight` reference. + void tick(); + }, periodMs); + // `setInterval` returned by Node is a `Timeout` object with `.unref()` + // so a hung driver never prevents process exit. + if (typeof (handle as NodeJS.Timeout).unref === 'function') { + (handle as NodeJS.Timeout).unref(); + } + + return { + stop: async () => { + if (stopped) { + if (inflight) await inflight; + return; + } + stopped = true; + clearInterval(handle); + if (inflight) { + try { + await inflight; + } catch { + /* swallowed in tick() — surface path is the surrounding write */ + } + } + }, + }; +}; + +/** + * Reading `GITNEXUS_WAL_MANUAL_CHECKPOINT` at every call site (rather + * than caching at module load) keeps `analyzeCommand` env restoration + * honest: tests that toggle the flag between invocations see the live + * value, matching the `ANALYZE_CLI_ENV_KEYS` snapshot/restore contract + * in `analyze.ts`. + * + * Accepted opt-out values: '0', 'false', 'off', 'no' (case-insensitive). + * Anything else — including undefined — leaves the driver enabled. + */ +export const isManualCheckpointEnabled = (): boolean => { + const raw = process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + if (raw === undefined) return true; + const normalized = raw.trim().toLowerCase(); + return !['0', 'false', 'off', 'no'].includes(normalized); +}; diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index 79601a9a22..c28504c4c3 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -26,6 +26,10 @@ import { queryImporters, } from './lbug/lbug-adapter.js'; import { createSearchFTSIndexes, verifySearchFTSIndexes } from './search/fts-indexes.js'; +import { + startWalCheckpointDriver, + type WalCheckpointDriver, +} from './lbug/wal-checkpoint-driver.js'; import { getStoragePaths, saveMeta, @@ -521,6 +525,16 @@ export async function runFullAnalysis( } await initLbug(lbugPath); + + // Manual WAL checkpoint driver (#1741): periodically drain the WAL + // from JS so the un-retriable native auto-checkpoint almost never + // has work left to do. Failures of the manual CHECKPOINT are absorbed + // by the driver's bounded retry; the final un-recoverable error still + // surfaces via the surrounding write that follows the failed flush. + // Opt-out via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` (the driver itself + // returns a no-op handle when disabled). Analyze-only: MCP and serve + // paths continue to rely on the close-time CHECKPOINT in `safeClose`. + const walCheckpointDriver: WalCheckpointDriver = startWalCheckpointDriver(); try { // All work after initLbug is wrapped in try/finally to ensure closeLbug() // is called even if an error occurs — the module-level singleton DB handle @@ -961,6 +975,9 @@ export async function runFullAnalysis( } // ── Close LadybugDB ────────────────────────────────────────────── + // Stop the manual checkpoint driver before closeLbug so its + // in-flight CHECKPOINT cannot race the `safeClose` CHECKPOINT. + await walCheckpointDriver.stop(); await closeLbug(); progress('done', 100, 'Done'); @@ -972,7 +989,13 @@ export async function runFullAnalysis( pipelineResult, }; } catch (err) { - // Ensure LadybugDB is closed even on error + // Ensure LadybugDB is closed even on error. Stop the driver first + // so its retry loop cannot extend an already-failing analyze. + try { + await walCheckpointDriver.stop(); + } catch { + /* swallow — surface path is the rethrow below */ + } try { await closeLbug(); } catch { diff --git a/gitnexus/test/integration/analyze-wal-checkpoint-failure.test.ts b/gitnexus/test/integration/analyze-wal-checkpoint-failure.test.ts new file mode 100644 index 0000000000..c517263dfd --- /dev/null +++ b/gitnexus/test/integration/analyze-wal-checkpoint-failure.test.ts @@ -0,0 +1,133 @@ +/** + * Integration test: WAL auto-checkpoint rename failure (#1741 / #1772). + * + * Drives the real `analyzeCommand` against a real LadybugDB instance and + * provokes a genuine Ladybug-engine `IO exception: Error renaming file + * .wal to .wal.checkpoint` by pre-planting a *directory* at the + * `.wal.checkpoint` rename target. `fs.rename` (which Ladybug's native + * `LocalFileSystem` ultimately invokes) cannot overwrite a non-empty + * directory with a file on either POSIX or Windows, and Ladybug's + * `doInitLbug` orphan-cleanup uses `fs.unlink` which fails on a directory + * — so the blocker survives initialization and the next auto-checkpoint + * fires the natural rename failure that motivated PR #1772. + * + * No test-only hooks, no env-var fault toggles in production code: we use + * the same `GITNEXUS_WAL_CHECKPOINT_THRESHOLD=1` knob that real users have + * available to force checkpointing on every write, then arrange a real + * filesystem state that makes the rename impossible. + * + * Verifies that: + * 1. The CLI exits non-zero. + * 2. stderr contains the actionable recovery hint pointing at + * `--wal-checkpoint-threshold 67108864` (the + * `RECOMMENDED_WAL_CHECKPOINT_THRESHOLD` constant in `analyze.ts`). + * 3. The recovery message references the + * `GITNEXUS_WAL_CHECKPOINT_THRESHOLD` env var as a parallel route. + * + * Empirically confirmed portable on Windows; the same mechanism is + * expected to work on POSIX (`rename(2)` fails with `EISDIR`/`ENOTEMPTY` + * when the target is a non-empty directory). If a future Ladybug release + * changes the rename ordering, the loose match on the recovery hint + * (rather than the exact engine error wording) keeps this test stable. + */ +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { spawnSync } from 'child_process'; +import path from 'path'; +import fs from 'fs'; +import os from 'os'; +import { createRequire } from 'module'; +import { fileURLToPath, pathToFileURL } from 'url'; +import { cleanupTempDirSync } from '../helpers/test-db.js'; + +const testDir = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(testDir, '../..'); +const cliEntry = path.join(repoRoot, 'src/cli/index.ts'); +const FIXTURE_SRC = path.resolve(testDir, '..', 'fixtures', 'mini-repo'); + +const _require = createRequire(import.meta.url); +const tsxPkgDir = path.dirname(_require.resolve('tsx/package.json')); +const tsxImportUrl = pathToFileURL(path.join(tsxPkgDir, 'dist', 'loader.mjs')).href; + +let tmpParent: string; +let suiteGitnexusHome: string; +let repoPath: string; + +beforeAll(() => { + tmpParent = fs.mkdtempSync(path.join(os.tmpdir(), 'gn-wal-checkpoint-e2e-')); + suiteGitnexusHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gn-wal-checkpoint-home-')); + repoPath = path.join(tmpParent, 'mini-repo'); + fs.cpSync(FIXTURE_SRC, repoPath, { recursive: true }); + + spawnSync('git', ['init'], { cwd: repoPath, stdio: 'pipe' }); + spawnSync('git', ['add', '-A'], { cwd: repoPath, stdio: 'pipe' }); + spawnSync('git', ['commit', '-m', 'initial commit'], { + cwd: repoPath, + stdio: 'pipe', + env: { + ...process.env, + GIT_AUTHOR_NAME: 'test', + GIT_AUTHOR_EMAIL: 'test@test', + GIT_COMMITTER_NAME: 'test', + GIT_COMMITTER_EMAIL: 'test@test', + }, + }); +}); + +afterAll(() => { + if (tmpParent) cleanupTempDirSync(tmpParent); + if (suiteGitnexusHome) cleanupTempDirSync(suiteGitnexusHome); +}); + +describe('analyze WAL auto-checkpoint rename failure (real lbug, no mocks)', () => { + it('surfaces the --wal-checkpoint-threshold recovery hint when the rename target is blocked', () => { + // Plant a non-empty directory at the path Ladybug's auto-checkpoint + // will try to rename `.wal` over. `fs.rename` cannot overwrite a + // non-empty directory, and the adapter's orphan-sidecar cleanup uses + // `fs.unlink` (which fails on directories) — so the blocker persists + // through `doInitLbug` and trips the very first auto-checkpoint that + // a `GITNEXUS_WAL_CHECKPOINT_THRESHOLD=1` setting forces. + const storageDir = path.join(repoPath, '.gitnexus'); + fs.mkdirSync(storageDir, { recursive: true }); + const blockerDir = path.join(storageDir, 'lbug.wal.checkpoint'); + fs.mkdirSync(blockerDir, { recursive: true }); + fs.writeFileSync(path.join(blockerDir, 'blocker'), 'cannot-be-renamed-over'); + + const result = spawnSync( + process.execPath, + ['--import', tsxImportUrl, cliEntry, 'analyze', '--skip-skills'], + { + cwd: repoPath, + encoding: 'utf8', + // Generous timeout: the test does real CSV/COPY work before the + // first failing checkpoint, and CI runners are slow. + timeout: process.env.CI ? 120_000 : 60_000, + stdio: ['pipe', 'pipe', 'pipe'], + env: { + ...process.env, + GITNEXUS_HOME: suiteGitnexusHome, + // Skip ensureHeap re-exec (which drops the tsx loader). + NODE_OPTIONS: `${process.env.NODE_OPTIONS || ''} --max-old-space-size=8192`.trim(), + // Tiny threshold forces auto-checkpoint on every write so the + // first write into the WAL trips the planted rename blocker. + GITNEXUS_WAL_CHECKPOINT_THRESHOLD: '1', + CI: '1', + }, + }, + ); + + const combined = `${result.stderr}\n${result.stdout}`; + + // The CLI must exit non-zero. status === null means the timeout fired + // without a clean exit — also a failure for this assertion. + expect(result.status === null ? 'timeout' : result.status).not.toBe(0); + + // Recovery hint must reference the CLI flag and the recommended + // 64 MiB threshold (67_108_864 bytes). Both come from the + // RECOMMENDED_WAL_CHECKPOINT_THRESHOLD constant in analyze.ts; keep + // those values in sync with this assertion if the constant changes. + expect(combined).toContain('gitnexus analyze --wal-checkpoint-threshold'); + expect(combined).toContain('67108864'); + // The env-var route should be advertised alongside the flag. + expect(combined).toContain('GITNEXUS_WAL_CHECKPOINT_THRESHOLD'); + }, 180_000); +}); diff --git a/gitnexus/test/unit/analyze-lbug-checkpoint-threshold.test.ts b/gitnexus/test/unit/analyze-lbug-checkpoint-threshold.test.ts new file mode 100644 index 0000000000..de55d4e712 --- /dev/null +++ b/gitnexus/test/unit/analyze-lbug-checkpoint-threshold.test.ts @@ -0,0 +1,99 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +const runFullAnalysisMock = vi.fn(); + +vi.mock('../../src/core/run-analyze.js', () => ({ + runFullAnalysis: runFullAnalysisMock, +})); + +vi.mock('../../src/core/lbug/lbug-adapter.js', () => ({ + closeLbug: vi.fn(async () => undefined), +})); + +vi.mock('../../src/storage/repo-manager.js', () => ({ + getStoragePaths: vi.fn(() => ({ storagePath: '.gitnexus', lbugPath: '.gitnexus/lbug' })), + getGlobalRegistryPath: vi.fn(() => 'registry.json'), + RegistryNameCollisionError: class RegistryNameCollisionError extends Error {}, + AnalysisNotFinalizedError: class AnalysisNotFinalizedError extends Error {}, + assertAnalysisFinalized: vi.fn(async () => undefined), +})); + +vi.mock('../../src/storage/git.js', () => ({ + getGitRoot: vi.fn(() => '/repo'), + hasGitDir: vi.fn(() => true), +})); + +vi.mock('../../src/core/ingestion/utils/max-file-size.js', () => ({ + getMaxFileSizeBannerMessage: vi.fn(() => null), +})); + +describe('analyzeCommand --wal-checkpoint-threshold parsing', () => { + const ORIGINAL_NODE_OPTIONS = process.env.NODE_OPTIONS; + const ORIGINAL_THRESHOLD = process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD; + + beforeEach(() => { + vi.resetModules(); + runFullAnalysisMock.mockReset(); + process.exitCode = undefined; + process.env.NODE_OPTIONS = `${process.env.NODE_OPTIONS ?? ''} --max-old-space-size=8192`.trim(); + }); + + afterEach(() => { + if (ORIGINAL_NODE_OPTIONS === undefined) { + delete process.env.NODE_OPTIONS; + } else { + process.env.NODE_OPTIONS = ORIGINAL_NODE_OPTIONS; + } + if (ORIGINAL_THRESHOLD === undefined) { + delete process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD; + } else { + process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD = ORIGINAL_THRESHOLD; + } + }); + + it.each(['maybe', '-2', '1.5', ''])( + 'rejects invalid --wal-checkpoint-threshold value %s before analysis starts', + async (walCheckpointThreshold) => { + const { _captureLogger } = await import('../../src/core/logger.js'); + const cap = _captureLogger(); + const { analyzeCommand } = await import('../../src/cli/analyze.js'); + + await analyzeCommand(undefined, { walCheckpointThreshold }); + + expect(process.exitCode).toBe(1); + expect(runFullAnalysisMock).not.toHaveBeenCalled(); + expect( + cap + .records() + .some((r) => r.msg === ' --wal-checkpoint-threshold must be an integer >= -1.\n'), + ).toBe(true); + cap.restore(); + }, + ); + + it.each([ + ['-1', '-1'], + ['0', '0'], + ['1024', '1024'], + ])( + 'sets GITNEXUS_WAL_CHECKPOINT_THRESHOLD=%s during runFullAnalysis and restores afterwards', + async (cliValue, expectedEnv) => { + const { analyzeCommand } = await import('../../src/cli/analyze.js'); + let envAtCallTime: string | undefined; + runFullAnalysisMock.mockImplementation(async () => { + envAtCallTime = process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD; + return { + repoName: 'repo', + repoPath: '/repo', + stats: {}, + alreadyUpToDate: true, + }; + }); + + await analyzeCommand(undefined, { walCheckpointThreshold: cliValue }); + + expect(envAtCallTime).toBe(expectedEnv); + expect(process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD).toBe(ORIGINAL_THRESHOLD); + }, + ); +}); diff --git a/gitnexus/test/unit/analyze-wal-error.test.ts b/gitnexus/test/unit/analyze-wal-error.test.ts index 1b4ed5101b..5264dfd3aa 100644 --- a/gitnexus/test/unit/analyze-wal-error.test.ts +++ b/gitnexus/test/unit/analyze-wal-error.test.ts @@ -134,4 +134,82 @@ describe('analyzeCommand WAL corruption error handling', () => { cap.restore(); }); + + it('recommends --wal-checkpoint-threshold on Ladybug checkpoint I/O failures', async () => { + runFullAnalysisMock.mockRejectedValue( + new Error( + 'Runtime exception: IO exception: Error renaming file /repo/.gitnexus/lbug.wal to /repo/.gitnexus/lbug.wal.checkpoint. ErrorMessage: Permission denied', + ), + ); + + const { _captureLogger } = await import('../../src/core/logger.js'); + const cap = _captureLogger(); + const { analyzeCommand } = await import('../../src/cli/analyze.js'); + + await analyzeCommand(undefined, {}); + + expect(process.exitCode).toBe(1); + const records = cap.records(); + expect( + records.some( + (r) => + typeof r.msg === 'string' && + r.msg.includes('gitnexus analyze --wal-checkpoint-threshold'), + ), + ).toBe(true); + + cap.restore(); + }); + + it('also recommends threshold on .wal.checkpoint remove failures', async () => { + runFullAnalysisMock.mockRejectedValue( + new Error( + 'Runtime exception: IO exception: Error removing directory or file /repo/.gitnexus/lbug.wal.checkpoint. Error Message: Permission denied', + ), + ); + + const { _captureLogger } = await import('../../src/core/logger.js'); + const cap = _captureLogger(); + const { analyzeCommand } = await import('../../src/cli/analyze.js'); + + await analyzeCommand(undefined, {}); + + expect(process.exitCode).toBe(1); + const records = cap.records(); + expect( + records.some( + (r) => + typeof r.msg === 'string' && + r.msg.includes('gitnexus analyze --wal-checkpoint-threshold'), + ), + ).toBe(true); + + cap.restore(); + }); + + it('does not recommend threshold for non-checkpoint IO exceptions', async () => { + runFullAnalysisMock.mockRejectedValue( + new Error( + 'Runtime exception: IO exception: Error renaming file /repo/.gitnexus/data.tmp to /repo/.gitnexus/data.tmp.bak. ErrorMessage: Permission denied', + ), + ); + + const { _captureLogger } = await import('../../src/core/logger.js'); + const cap = _captureLogger(); + const { analyzeCommand } = await import('../../src/cli/analyze.js'); + + await analyzeCommand(undefined, {}); + + expect(process.exitCode).toBe(1); + const records = cap.records(); + expect( + records.some( + (r) => + typeof r.msg === 'string' && + r.msg.includes('gitnexus analyze --wal-checkpoint-threshold'), + ), + ).toBe(false); + + cap.restore(); + }); }); diff --git a/gitnexus/test/unit/lbug-checkpoint.test.ts b/gitnexus/test/unit/lbug-checkpoint.test.ts index 5b9603997a..6eac7e73a0 100644 --- a/gitnexus/test/unit/lbug-checkpoint.test.ts +++ b/gitnexus/test/unit/lbug-checkpoint.test.ts @@ -53,9 +53,19 @@ describe('flushWAL / safeClose — consolidation guard (#1376)', () => { expect(closeLbugBlock).not.toMatch(/db\.close\(\)/); }); - it('flushWAL is the only place that issues conn.query(CHECKPOINT)', () => { + it('CHECKPOINT is issued only by flushWAL (best-effort) and tryFlushWAL (rethrows for the retry driver)', () => { const matches = adapterSource.match(/conn\.query\('CHECKPOINT'\)/g) ?? []; - expect(matches.length).toBe(1); + // Two authorized sites: `flushWAL` (swallows errors — used by + // `safeClose` and the server's best-effort flush) and `tryFlushWAL` + // (rethrows so the manual checkpoint driver in `wal-checkpoint-driver.ts` + // can apply its bounded retry). Any third occurrence is a regression — + // a CHECKPOINT outside these two helpers will be invisible to the + // retry/error policy. + expect(matches.length).toBe(2); + }); + + it('exports tryFlushWAL (CHECKPOINT-with-rethrow for the manual retry driver)', () => { + expect(adapterSource).toMatch(/export const tryFlushWAL/); }); it('flushWAL drains and closes the CHECKPOINT result before returning', () => { diff --git a/gitnexus/test/unit/lbug-config-wal.test.ts b/gitnexus/test/unit/lbug-config-wal.test.ts index 6baea3621b..626e5d2357 100644 --- a/gitnexus/test/unit/lbug-config-wal.test.ts +++ b/gitnexus/test/unit/lbug-config-wal.test.ts @@ -1,5 +1,12 @@ import { describe, expect, it, vi } from 'vitest'; -import { createLbugDatabase, isWalCorruptionError } from '../../src/core/lbug/lbug-config.js'; +import { + createLbugDatabase, + isLbugCheckpointIoError, + isWalCorruptionError, +} from '../../src/core/lbug/lbug-config.js'; +import { _captureLogger } from '../../src/core/logger.js'; + +const DEFAULT_THRESHOLD = 64 * 1024 * 1024; describe('isWalCorruptionError', () => { it.each([ @@ -35,6 +42,101 @@ describe('isWalCorruptionError', () => { }); describe('createLbugDatabase WAL replay option', () => { + it('enables auto-checkpoint by default and uses default threshold (64 MiB)', () => { + const Database = vi.fn(function (this: any) {}); + const lbugModule = { Database } as any; + + createLbugDatabase(lbugModule, '/tmp/lbug-default'); + + expect(Database).toHaveBeenCalledWith( + '/tmp/lbug-default', + 0, + false, + false, + expect.any(Number), + true, + DEFAULT_THRESHOLD, + true, + true, + ); + }); + + it.each([ + ['0', 0], + ['1024', 1024], + ['-1', -1], + ['invalid', DEFAULT_THRESHOLD], + ['', DEFAULT_THRESHOLD], + ])('respects GITNEXUS_WAL_CHECKPOINT_THRESHOLD=%s', (raw, expectedCheckpointThreshold) => { + try { + vi.stubEnv('GITNEXUS_WAL_CHECKPOINT_THRESHOLD', raw); + const Database = vi.fn(function (this: any) {}); + const lbugModule = { Database } as any; + + createLbugDatabase(lbugModule, '/tmp/lbug-env'); + + expect(Database).toHaveBeenCalledWith( + '/tmp/lbug-env', + 0, + false, + false, + expect.any(Number), + true, + expectedCheckpointThreshold, + true, + true, + ); + } finally { + vi.unstubAllEnvs(); + } + }); + + it('warns and falls back to default when GITNEXUS_WAL_CHECKPOINT_THRESHOLD is invalid', () => { + const cap = _captureLogger(); + try { + vi.stubEnv('GITNEXUS_WAL_CHECKPOINT_THRESHOLD', 'invalid'); + const Database = vi.fn(function (this: any) {}); + const lbugModule = { Database } as any; + + createLbugDatabase(lbugModule, '/tmp/lbug-invalid'); + + const warn = cap + .records() + .find( + (r) => + typeof r.msg === 'string' && + r.msg.includes('Ignoring invalid GITNEXUS_WAL_CHECKPOINT_THRESHOLD'), + ); + expect(warn).toBeDefined(); + } finally { + vi.unstubAllEnvs(); + cap.restore(); + } + }); + + it('does NOT warn when GITNEXUS_WAL_CHECKPOINT_THRESHOLD is empty (treated as unset)', () => { + const cap = _captureLogger(); + try { + vi.stubEnv('GITNEXUS_WAL_CHECKPOINT_THRESHOLD', ''); + const Database = vi.fn(function (this: any) {}); + const lbugModule = { Database } as any; + + createLbugDatabase(lbugModule, '/tmp/lbug-empty'); + + const warn = cap + .records() + .find( + (r) => + typeof r.msg === 'string' && + r.msg.includes('Ignoring invalid GITNEXUS_WAL_CHECKPOINT_THRESHOLD'), + ); + expect(warn).toBeUndefined(); + } finally { + vi.unstubAllEnvs(); + cap.restore(); + } + }); + it('passes throwOnWalReplayFailure and checksum constructor args explicitly', () => { const Database = vi.fn(function (this: any) {}); const lbugModule = { Database } as any; @@ -51,9 +153,43 @@ describe('createLbugDatabase WAL replay option', () => { true, expect.any(Number), true, - -1, + DEFAULT_THRESHOLD, false, true, ); }); }); + +// ─── Finding 8: strict + permissive checkpoint IO matchers ───────────────── +describe('isLbugCheckpointIoError', () => { + it.each([ + [ + 'native rename failure (v0.16.x exact)', + 'Runtime exception: IO exception: Error renaming file /repo/.gitnexus/lbug.wal to /repo/.gitnexus/lbug.wal.checkpoint. ErrorMessage: Permission denied', + ], + [ + 'native remove failure (v0.16.x exact)', + 'Runtime exception: IO exception: Error removing directory or file /repo/.gitnexus/lbug.wal.checkpoint. Error Message: Permission denied', + ], + ])('matches strict %s', (_label, msg) => { + expect(isLbugCheckpointIoError(msg)).toBe(true); + expect(isLbugCheckpointIoError(new Error(msg))).toBe(true); + }); + + it('matches permissive fallback for hypothetical message drift', () => { + // Permissive matcher accepts any IO-exception-shaped message mentioning .wal.checkpoint. + const drift = + 'Some new wrapper preamble :: IO exception when finalizing /repo/.gitnexus/lbug.wal.checkpoint'; + expect(isLbugCheckpointIoError(drift)).toBe(true); + }); + + it('does NOT match unrelated IO errors', () => { + expect( + isLbugCheckpointIoError( + 'Runtime exception: IO exception: Error renaming file /repo/data.tmp to /repo/data.tmp.bak', + ), + ).toBe(false); + expect(isLbugCheckpointIoError('Some other error')).toBe(false); + expect(isLbugCheckpointIoError(undefined)).toBe(false); + }); +}); diff --git a/gitnexus/test/unit/wal-checkpoint-driver.test.ts b/gitnexus/test/unit/wal-checkpoint-driver.test.ts new file mode 100644 index 0000000000..2079c74dcb --- /dev/null +++ b/gitnexus/test/unit/wal-checkpoint-driver.test.ts @@ -0,0 +1,159 @@ +/** + * Unit tests for the manual WAL checkpoint driver (#1741 follow-up). + * + * The driver wraps a CHECKPOINT call in a bounded retry that fires only + * on `isLbugCheckpointIoError` shapes. These tests inject a fake + * `checkpointFn`, fake `sleepFn`, and fake `randomFn` to exercise the + * retry policy deterministically without touching a real LadybugDB. + * + * Integration-level coverage that the driver actually runs against a + * native engine lives in `test/integration/analyze-wal-checkpoint-failure.test.ts`. + */ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + isManualCheckpointEnabled, + runCheckpointWithRetry, + startWalCheckpointDriver, +} from '../../src/core/lbug/wal-checkpoint-driver.js'; + +const makeCheckpointError = () => + // Matches the strict rename matcher in lbug-config.ts. + new Error( + 'Runtime exception: IO exception: Error renaming file /tmp/lbug.wal to /tmp/lbug.wal.checkpoint. ErrorMessage: Permission denied', + ); + +describe('runCheckpointWithRetry — retry policy', () => { + it('returns on first success with attempts=1 and no sleeps', async () => { + const checkpointFn = vi.fn().mockResolvedValue(true); + const sleepFn = vi.fn().mockResolvedValue(undefined); + const randomFn = vi.fn().mockReturnValue(0); + + const result = await runCheckpointWithRetry({ checkpointFn, sleepFn, randomFn }); + + expect(result.attempts).toBe(1); + expect(result.flushed).toBe(true); + expect(checkpointFn).toHaveBeenCalledTimes(1); + expect(sleepFn).toHaveBeenCalledTimes(0); + }); + + it('retries up to 3 times on checkpoint IO errors and succeeds on the final attempt', async () => { + const checkpointFn = vi + .fn() + .mockRejectedValueOnce(makeCheckpointError()) + .mockRejectedValueOnce(makeCheckpointError()) + .mockResolvedValueOnce(true); + const sleepFn = vi.fn().mockResolvedValue(undefined); + // Fixed random returns 0, so jitter contributes 0 ms and we can + // assert exact delays against BASE_DELAYS_MS = [50, 200, 500]. + const randomFn = vi.fn().mockReturnValue(0); + + const result = await runCheckpointWithRetry({ checkpointFn, sleepFn, randomFn }); + + expect(result.attempts).toBe(3); + expect(result.flushed).toBe(true); + expect(checkpointFn).toHaveBeenCalledTimes(3); + // Sleeps happen between attempts: after attempt 1 (50 ms) and after + // attempt 2 (200 ms). No sleep after the final attempt. + expect(sleepFn).toHaveBeenCalledTimes(2); + expect(sleepFn).toHaveBeenNthCalledWith(1, 50); + expect(sleepFn).toHaveBeenNthCalledWith(2, 200); + }); + + it('rethrows the last error after exhausting all retries on persistent IO failures', async () => { + const persistent = makeCheckpointError(); + const checkpointFn = vi.fn().mockRejectedValue(persistent); + const sleepFn = vi.fn().mockResolvedValue(undefined); + const randomFn = vi.fn().mockReturnValue(0); + + await expect(runCheckpointWithRetry({ checkpointFn, sleepFn, randomFn })).rejects.toBe( + persistent, + ); + + expect(checkpointFn).toHaveBeenCalledTimes(3); + // Two backoffs (50 ms, 200 ms) but no sleep after the final attempt. + expect(sleepFn).toHaveBeenCalledTimes(2); + expect(sleepFn).toHaveBeenNthCalledWith(1, 50); + expect(sleepFn).toHaveBeenNthCalledWith(2, 200); + }); + + it('does NOT retry non-checkpoint errors (e.g. WAL corruption surfaces immediately)', async () => { + const corruption = new Error('Runtime exception: Corrupted wal file.'); + const checkpointFn = vi.fn().mockRejectedValue(corruption); + const sleepFn = vi.fn().mockResolvedValue(undefined); + + await expect(runCheckpointWithRetry({ checkpointFn, sleepFn })).rejects.toBe(corruption); + + expect(checkpointFn).toHaveBeenCalledTimes(1); + expect(sleepFn).toHaveBeenCalledTimes(0); + }); + + it('jitter is bounded: 0 <= jitter < 50 ms regardless of random source', async () => { + const checkpointFn = vi + .fn() + .mockRejectedValueOnce(makeCheckpointError()) + .mockResolvedValueOnce(true); + const sleepFn = vi.fn().mockResolvedValue(undefined); + // Random returns 0.999... — jitter should still be <50 ms (floor). + const randomFn = vi.fn().mockReturnValue(0.9999); + + await runCheckpointWithRetry({ checkpointFn, sleepFn, randomFn }); + + expect(sleepFn).toHaveBeenCalledTimes(1); + const delay = sleepFn.mock.calls[0][0] as number; + expect(delay).toBe(50 + Math.floor(0.9999 * 50)); // == 99 + }); +}); + +describe('isManualCheckpointEnabled — env var parsing', () => { + let originalEnv: string | undefined; + beforeEach(() => { + originalEnv = process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + }); + afterEach(() => { + if (originalEnv === undefined) delete process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + else process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = originalEnv; + }); + + it('defaults to enabled when the env var is unset', () => { + delete process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + expect(isManualCheckpointEnabled()).toBe(true); + }); + + it.each(['0', 'false', 'FALSE', 'off', 'no', ' 0 '])( + 'returns false for opt-out value %s', + (value) => { + process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = value; + expect(isManualCheckpointEnabled()).toBe(false); + }, + ); + + it.each(['1', 'true', 'on', 'yes', ''])('returns true for non-opt-out value %s', (value) => { + process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = value; + expect(isManualCheckpointEnabled()).toBe(true); + }); +}); + +describe('startWalCheckpointDriver — lifecycle', () => { + let originalEnv: string | undefined; + beforeEach(() => { + originalEnv = process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + }); + afterEach(() => { + if (originalEnv === undefined) delete process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT; + else process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = originalEnv; + }); + + it('returns a no-op handle when manual checkpoint is disabled', async () => { + process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = '0'; + const driver = startWalCheckpointDriver({ periodMs: 10 }); + // stop() must resolve cleanly even when no interval was scheduled. + await expect(driver.stop()).resolves.toBeUndefined(); + }); + + it('stop() is idempotent (second call resolves without throwing)', async () => { + process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT = '0'; + const driver = startWalCheckpointDriver({ periodMs: 10 }); + await driver.stop(); + await expect(driver.stop()).resolves.toBeUndefined(); + }); +});