Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 39 additions & 17 deletions gitnexus/src/core/lbug/lbug-adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ import type { CachedEmbedding } from '../embeddings/types.js';
import { extensionManager, type ExtensionEnsureOptions } from './extension-loader.js';
import {
closeLbugConnection,
isDbBusyError,
isOpenRetryExhausted,
openLbugConnection,
waitForWindowsHandleRelease,
type LbugConnectionHandle,
} from './lbug-config.js';
import { isVectorExtensionSupportedByPlatform } from '../platform/capabilities.js';
Expand Down Expand Up @@ -185,21 +188,6 @@ const DB_LOCK_RETRY_ATTEMPTS = 3;
/** Base back-off in ms between BUSY retries (multiplied by attempt number). */
const DB_LOCK_RETRY_DELAY_MS = 500;

/**
* Return true when the error message indicates that another process holds
* an exclusive lock on the LadybugDB file (e.g. `gitnexus analyze` or
* `gitnexus serve` running at the same time).
*/
export const isDbBusyError = (err: unknown): boolean => {
const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
return (
msg.includes('busy') ||
msg.includes('lock') ||
msg.includes('already in use') ||
msg.includes('could not set lock')
);
};

/**
* Return true when the error message indicates a write was attempted against
* a read-only LadybugDB connection. The MCP query pool opens DBs read-only,
Expand Down Expand Up @@ -252,7 +240,11 @@ export const withLbugDb = async <T>(dbPath: string, operation: () => Promise<T>)
});
} catch (err) {
lastError = err;
if (!isDbBusyError(err) || attempt === DB_LOCK_RETRY_ATTEMPTS) {
// Skip outer retry when the inner open-retry already exhausted: the
// ~1.5s open-time budget was just spent, repeating the full reset+
// reopen cycle would only add 4-5s of tail latency without changing
// the outcome (both layers consult the same isDbBusyError matcher).
if (!isDbBusyError(err) || isOpenRetryExhausted(err) || attempt === DB_LOCK_RETRY_ATTEMPTS) {
throw err;
}
// Close stale connection inside the session lock to prevent race conditions
Expand Down Expand Up @@ -330,7 +322,16 @@ const doInitLbug = async (dbPath: string) => {
await conn.query(schemaQuery);
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
if (!msg.includes('already exists')) {
// Suppression list:
// - "already exists": expected idempotent re-create on existing DBs
// - "could not set lock on file": LadybugDB v0.16.1 emits this on
// Windows when CREATE NODE TABLE runs against a path that was
// just opened (the WAL handle from a fresh Database briefly
// contests the table's first-write lock). The table is created
// anyway and any genuine cross-process lock contention surfaces
// on the next operation via withLbugDb's retry. Logging it here
// would just be noise in CI.
if (!msg.includes('already exists') && !isDbBusyError(err)) {
logger.warn(`⚠️ Schema creation warning: ${msg.slice(0, 120)}`);
}
}
Expand Down Expand Up @@ -1064,6 +1065,9 @@ export const flushWAL = async (): Promise<void> => {
*/
export const safeClose = async (): Promise<void> => {
await flushWAL();
// Capture before close — currentDbPath stays set so the Windows post-close
// probe below knows which file to wait on.
const closingDbPath = currentDbPath;
if (conn) {
try {
// eslint-disable-next-line no-restricted-syntax -- sole authorised close site
Expand All @@ -1082,6 +1086,24 @@ export const safeClose = async (): Promise<void> => {
}
db = null;
}
// Windows: libuv reports `db.close()` resolved before the kernel has
// released the file handle. A subsequent `new Database(samePath)` in
// the same process can race the release. The probe (lbug-config.ts)
// forces any residual lock to surface as EBUSY/EPERM/EACCES so the
// open-time retry absorbs the lag.
if (process.platform === 'win32' && closingDbPath) {
const released = await waitForWindowsHandleRelease(closingDbPath);
if (!released) {
// Probe exhausted with a lock code still in flight. The next
// openLbugConnection will absorb whatever residual lag remains, but
// a chronic warning helps operators spot AV interference (Windows
// Defender holding the file far past the 250ms budget).
logger.warn(
{ dbPath: closingDbPath },
'⚠️ LadybugDB file handle still locked after close (Windows). If this repeats, check antivirus/Defender exclusions for the GitNexus storage directory.',
);
}
}
};

export const closeLbug = async (): Promise<void> => {
Expand Down
240 changes: 239 additions & 1 deletion gitnexus/src/core/lbug/lbug-config.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import fs from 'fs/promises';
import os from 'os';
import path from 'path';
import type lbug from '@ladybugdb/core';

/**
Expand Down Expand Up @@ -66,6 +69,28 @@ export interface LbugConnectionHandle {
conn: lbug.Connection;
}

/**
* Return true when the error message indicates that a LadybugDB file lock
* could not be acquired — either at construction time
* (`new lbug.Database(...)` raises from `local_file_system.cpp`) or during
* a query (another writer holds the exclusive lock).
*
* Lives here (not in `lbug-adapter.ts`) so both the construction-time
* retry (`openWithLockRetry` in this file) and the query-time retry
* (`withLbugDb` in `lbug-adapter.ts`) consult the same matcher. Callers
* import directly from this module — no re-export to keep in sync.
*/
export const isDbBusyError = (err: unknown): boolean => {
const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
// `lock` already subsumes `could not set lock`; the broader term is kept
// because graph-DB transient errors include "deadlock", "lock contention",
// and the LadybugDB native module's "could not set lock on file" — all of
// which deserve a retry. If a non-transient lock-shaped error ever
// surfaces (e.g., "lock file missing" during recovery), tighten this
// matcher rather than raising the retry budget.
return msg.includes('busy') || msg.includes('lock') || msg.includes('already in use');
};

export function createLbugDatabase(
lbugModule: LbugModule,
databasePath: string,
Expand All @@ -85,14 +110,170 @@ export function createLbugDatabase(
) as lbug.Database;
}

// ─── Lock-busy retry tuning knobs ───────────────────────────────────────────
//
// All four GitNexus retry pairs that touch native LadybugDB locks live with
// a comment cross-reference here so an SRE tuning Windows flakes finds them
// in one grep:
//
// 1. OPEN_LOCK_RETRY_ATTEMPTS / OPEN_LOCK_RETRY_DELAY_MS (this file)
// → `new lbug.Database()` constructor lock failures
// 2. HANDLE_RELEASE_PROBE_ATTEMPTS / HANDLE_RELEASE_PROBE_DELAY_MS (this file)
// → post-close fs.open probe to absorb Windows handle-release lag
// 3. DB_LOCK_RETRY_ATTEMPTS / DB_LOCK_RETRY_DELAY_MS (lbug-adapter.ts withLbugDb)
// → query-time busy/lock retry around already-open connections
//
// `new lbug.Database()` calls into the native module which performs an
// OS-level exclusive lock on `<dbPath>`. On Windows that lock can fail
// for reasons specific to the OS (Defender briefly opens new files,
// libuv handle release lags the JS-side close). 5 attempts × 100ms
// linear back-off (max sleep 100+200+300+400 = 1s, plus 5 ctor RTTs
// of 10–50ms each = ~1.0–1.2s worst case) clears the typical
// AV-scanner hold without masking real cross-process conflicts.
//
// Source: https://github.com/LadybugDB/ladybug/blob/v0.16.1/src/common/file_system/local_file_system.cpp#L126
const OPEN_LOCK_RETRY_ATTEMPTS = 5;
const OPEN_LOCK_RETRY_DELAY_MS = 100;

const HANDLE_RELEASE_PROBE_ATTEMPTS = 5;
const HANDLE_RELEASE_PROBE_DELAY_MS = 50;
const HANDLE_RELEASE_LOCK_CODES = new Set(['EBUSY', 'EPERM', 'EACCES']);

/**
* Test-fixture directory prefixes recognized by `isTestFixturePath`.
*
* IMPORTANT: this list must stay in sync with the prefixes passed to
* `createTempDir` in `gitnexus/test/helpers/test-db.ts` and the prefixes
* used by `withTestLbugDB` (`gitnexus/test/helpers/test-indexed-db.ts`).
* If you add a new test that passes a custom prefix to `createTempDir`,
* add it here too — otherwise the stale-sidecar sweep silently won't
* fire for that fixture and CI flakes return.
*
* The default `createTempDir('gitnexus-test-')` and the lbug variant
* `'gitnexus-lbug-'` cover today's call sites.
*/
const TEST_FIXTURE_PREFIXES = ['gitnexus-lbug-', 'gitnexus-test-'];

/**
* Marker symbol attached to lock errors after `openWithLockRetry` exhausts
* its budget. `withLbugDb`'s outer query-time retry consults this so it
* does not re-retry a path that just spent up to ~1.5s in the open-time
* loop — preventing 6s tail latencies (3× outer × 5× inner attempts).
*
* The symbol is internal to GitNexus; consumers should treat the underlying
* error message as the user-visible signal.
*/
export const LBUG_OPEN_RETRY_EXHAUSTED = Symbol.for('gitnexus.lbug.openRetryExhausted');

export const isOpenRetryExhausted = (err: unknown): boolean => {
if (err === null || err === undefined || typeof err !== 'object') return false;
return (err as { [LBUG_OPEN_RETRY_EXHAUSTED]?: boolean })[LBUG_OPEN_RETRY_EXHAUSTED] === true;
};

const tagOpenRetryExhausted = (err: unknown): unknown => {
if (err && typeof err === 'object') {
(err as { [LBUG_OPEN_RETRY_EXHAUSTED]?: boolean })[LBUG_OPEN_RETRY_EXHAUSTED] = true;
}
return err;
};

/**
* True when `dbPath` resolves to a recognized test fixture under the OS
* temp directory. Used to gate the stale-sidecar sweep so production
* paths never have their `.wal` / `.lock` files deleted.
*
* Defensive shape:
* - `path.resolve` normalizes `..` segments before the prefix check, so
* `<tmp>/gitnexus-lbug-x/../../etc/passwd` is rejected.
* - The tmpRoot check trims any trailing separator returned by some
* Windows TMP configurations (`C:\Users\X\Temp\`) so the startsWith
* comparison stays correct.
* - Only the IMMEDIATE parent directory is matched against the prefix
* list. An ancestor walk would let a tmpdir whose own basename starts
* with `gitnexus-lbug-` accept arbitrary nested paths under it.
*/
const isTestFixturePath = (dbPath: string): boolean => {
const tmpRoot = os.tmpdir().replace(new RegExp(`${path.sep === '\\' ? '\\\\' : path.sep}+$`), '');
const resolved = path.resolve(dbPath);
if (!resolved.startsWith(tmpRoot + path.sep) && resolved !== tmpRoot) return false;
const parentBase = path.basename(path.dirname(resolved));
return TEST_FIXTURE_PREFIXES.some((p) => parentBase.startsWith(p));
};

/** Exported only for direct unit testing — production callers use `openWithLockRetry`. */
export const _isTestFixturePathForTest = isTestFixturePath;

const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));

/**
* Attempt to remove stale `.wal` / `.lock` sidecars that a previous aborted
* test run may have left behind. Best-effort: ENOENT is normal, anything
* else is swallowed so the caller's retry can surface the original error.
*/
const sweepStaleSidecars = async (dbPath: string): Promise<void> => {
for (const suffix of ['.wal', '.lock']) {
try {
await fs.unlink(dbPath + suffix);
} catch {
/* missing sidecar or permission error — let the open retry surface it */
}
}
};

/**
* Run `construct` with bounded retries when `new lbug.Database(...)` throws
* a busy/lock error. The original (loop-captured) error is preferred over
* any post-sweep error so triage sees the real LadybugDB lock message.
* On exhaustion the rethrown error is tagged via
* `LBUG_OPEN_RETRY_EXHAUSTED` so the outer query-time retry in
* `withLbugDb` skips re-retrying a freshly-exhausted path.
*/
const openWithLockRetry = async (
construct: () => lbug.Database,
dbPath: string,
): Promise<lbug.Database> => {
let originalLockError: unknown;
for (let attempt = 1; attempt <= OPEN_LOCK_RETRY_ATTEMPTS; attempt++) {
try {
return construct();
} catch (err) {
if (!isDbBusyError(err)) throw err;
originalLockError = err;
if (attempt === OPEN_LOCK_RETRY_ATTEMPTS) break;
await sleep(OPEN_LOCK_RETRY_DELAY_MS * attempt);
}
}

// Final defense: only for recognized test fixtures, sweep stale sidecars
// (a prior aborted test run can leave a `.wal` lock that survives the
// tmp dir cleanup). Production paths never reach this branch — the guard
// requires the immediate parent dir to match a test prefix AND the
// resolved path to live under the OS temp directory.
if (isTestFixturePath(dbPath)) {
await sweepStaleSidecars(dbPath);
try {
return construct();
} catch {
// Intentionally do NOT overwrite originalLockError. The user-actionable
// signal is "we exhausted lock retries" — a different error from the
// post-sweep attempt is less useful than the lock failure that drove
// the sweep in the first place.
}
}
throw tagOpenRetryExhausted(originalLockError);
};

export async function openLbugConnection(
lbugModule: LbugModule,
databasePath: string,
options: LbugDatabaseOptions = {},
): Promise<LbugConnectionHandle> {
let db: lbug.Database | undefined;
try {
db = createLbugDatabase(lbugModule, databasePath, options);
db = await openWithLockRetry(
() => createLbugDatabase(lbugModule, databasePath, options),
databasePath,
);
return { db, conn: new lbugModule.Connection(db) };
} catch (err) {
if (db) await db.close().catch(() => {});
Expand All @@ -104,3 +285,60 @@ export async function closeLbugConnection(handle: LbugConnectionHandle): Promise
await handle.conn.close().catch(() => {});
await handle.db.close().catch(() => {});
}

/**
* Probe `dbPath` AND its `.wal` sidecar after `db.close()` so any
* residual native file handle surfaces as EBUSY/EPERM/EACCES and the
* bounded retry absorbs the release lag. Windows-only — Linux/macOS do
* not exhibit this race.
*
* Both files matter. Empirically, on rapid open→close→reopen cycles the
* main `dbPath` handle releases first; the `.wal` handle from the
* previous Database lingers and the new Database's first write (CREATE
* NODE TABLE during schema init) fails with "Could not set lock on
* file". Probing both makes safeClose actually return when the kernel
* is fully done with the path.
*
* Returns `true` when both probes succeeded (or skipped on non-lock
* errors / missing files). Returns `false` when either probe exhausted
* its budget with a lock code still in flight.
*
* Defensive shape:
* - Opens read+write (`'r+'`) so the probe actually surfaces exclusive
* locks held by the previous Database. A read-only probe (`'r'`) is
* insufficient — Windows will grant read access while the previous
* handle's exclusive write lock is still in flight, which lets
* `safeClose` return before the next CREATE NODE TABLE can lock the
* file.
* - `try/finally` around `handle.close()` guarantees no fd leak even
* if close itself throws.
*/
export const waitForWindowsHandleRelease = async (dbPath: string): Promise<boolean> => {
const mainReleased = await probeSinglePath(dbPath);
const walReleased = await probeSinglePath(dbPath + '.wal');
return mainReleased && walReleased;
};

const probeSinglePath = async (filePath: string): Promise<boolean> => {
for (let attempt = 1; attempt <= HANDLE_RELEASE_PROBE_ATTEMPTS; attempt++) {
let handle: fs.FileHandle | undefined;
try {
handle = await fs.open(filePath, 'r+');
return true;
} catch (err) {
const code = (err as NodeJS.ErrnoException | undefined)?.code;
if (!code || !HANDLE_RELEASE_LOCK_CODES.has(code)) return true; // ENOENT / unrelated → not our problem
if (attempt === HANDLE_RELEASE_PROBE_ATTEMPTS) return false;
await sleep(HANDLE_RELEASE_PROBE_DELAY_MS * attempt);
} finally {
if (handle) {
try {
await handle.close();
} catch {
/* swallow — caller cannot do anything useful with a probe-close failure */
}
}
}
}
return false;
};
7 changes: 7 additions & 0 deletions gitnexus/test/helpers/test-db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ export async function cleanupTempDir(tmpDir: string): Promise<void> {
/**
* Create a temporary directory for LadybugDB tests.
* Returns the path and a cleanup function.
*
* IMPORTANT: when adding a new test that passes a custom `prefix`, also add
* the prefix to `TEST_FIXTURE_PREFIXES` in
* `gitnexus/src/core/lbug/lbug-config.ts`. The stale-sidecar sweep relies
* on the prefix list to recognize test fixtures; an unknown prefix means
* the sweep silently won't fire for that fixture and Windows CI flakes
* return.
*/
export async function createTempDir(prefix: string = 'gitnexus-test-'): Promise<TestDBHandle> {
const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), prefix));
Expand Down
Loading
Loading