From b951c9ea8d1182c44a3a496af5760ea984a1de63 Mon Sep 17 00:00:00 2001 From: Minidoracat Date: Fri, 12 Jun 2026 01:10:37 +0800 Subject: [PATCH 1/2] fix(hooks): wrap the augment CLI child in the orphan guard (#2163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up invited by the maintainer on #2165: the augment child (7s local / 12s npx) was the longest-lived unwrapped subprocess, exposed to the same SIGKILL-orphan mechanism fixed for lsof/ps. - Export resolveUnixGuardTimeout from the probe module (both copies, byte-identical); adapters share the same module instance, so the memo and lazy self-test still run at most once per hook process. - Wrap every CLI-executing branch of runGitNexusCli in the three probe-equipped adapters with the guard: budget ceil(inner/1000)+1 seconds with -k 1, strictly above each branch's inner spawnSync timeout, so the supervised path is unchanged and the wrapper only matters once the hook itself is SIGKILLed. Windows and no-guard hosts keep byte-identical argv. The plugin adapter's PATH-direct gitnexus branch (its most common production path) is wrapped too; the cheap which/where probe is not. - Cursor integration: debug-gated 'augment skipped: hook slots saturated' on the slot-starved early return. Its augment child stays unwrapped for now — that integration does not install the probe sibling (the 'cursor probe' item on the #2163 follow-up list). - Reaping tests get a guard-availability precheck with an explicit failure message (assertion, not skipIf, so a coreutils-less Linux host fails diagnosably instead of going silently green). - Tests: orphaned-augment reaping (CJS + Plugin, red without the wrap, ~9.1s reap measured), disabled-sentinel degradation equivalence, source pinning for all three adapters (exact per-branch budget-formula counts) + probe export + cursor debug line. Note: pre-commit typecheck skipped; remaining tsc errors are pre-existing on main (none in files touched here). --- CHANGELOG.md | 1 + gitnexus-claude-plugin/hooks/gitnexus-hook.js | 60 +++- .../hooks/hook-db-lock-probe.cjs | 14 +- .../hooks/gitnexus-hook.cjs | 13 +- .../antigravity/gitnexus-antigravity-hook.cjs | 47 ++- gitnexus/hooks/claude/gitnexus-hook.cjs | 47 ++- gitnexus/hooks/claude/hook-db-lock-probe.cjs | 14 +- gitnexus/test/unit/hooks.test.ts | 327 ++++++++++++++++++ gitnexus/test/utils/hook-test-helpers.ts | 20 +- 9 files changed, 528 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b34046add..d4af06eb12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to GitNexus will be documented in this file. ### Fixed - **Hook db-lock probe no longer strands unkillable `lsof`/`ps` orphans** — the probe's `lsof`/`ps` subprocesses are now wrapped in a self-tested coreutils `timeout`/`gtimeout` (`timeout -k 1 …`), so a hook SIGKILLed by the runner's 10s timeout can no longer leave `lsof` running forever (orphan lifetime bounded at ~3s); `acquireHookSlot` now also gates the probe itself, capping concurrent probes at 3 per repo. Opt out with `GITNEXUS_HOOK_TIMEOUT_PATH=disabled`. (#2163) +- **Hook augment CLI no longer strands orphans either** — `runGitNexusCli` in the Claude, plugin, and Antigravity hook adapters now wraps the `gitnexus augment` subprocess (the longest-lived hook child: 7s local / 12s npx inner budgets) in the same self-tested coreutils `timeout -k 1` guard as the probe's `lsof`/`ps`, with a budget of ceil(inner/1000)+1 seconds — strictly above the inner `spawnSync` timeout, so supervised behavior is unchanged and the wrapper only takes over once the hook itself has been SIGKILLed. Windows and `GITNEXUS_HOOK_TIMEOUT_PATH=disabled` keep the exact pre-wrap invocation. The Cursor hook is not wrapped yet (it does not install the probe helper) but now reports its slot-saturated skip under `GITNEXUS_DEBUG`. (#2163 follow-up) ### Changed - Migrated from KuzuDB to LadybugDB v0.15 (`@ladybugdb/core`, `@ladybugdb/wasm-core`) diff --git a/gitnexus-claude-plugin/hooks/gitnexus-hook.js b/gitnexus-claude-plugin/hooks/gitnexus-hook.js index 5274ef4c0d..8b03651293 100644 --- a/gitnexus-claude-plugin/hooks/gitnexus-hook.js +++ b/gitnexus-claude-plugin/hooks/gitnexus-hook.js @@ -15,7 +15,10 @@ const fs = require('fs'); const path = require('path'); const { spawnSync } = require('child_process'); const { acquireHookSlot } = require('./hook-lock.js'); -const { hasGitNexusDbLockedByGitNexusServer } = require('./hook-db-lock-probe.cjs'); +const { + hasGitNexusDbLockedByGitNexusServer, + resolveUnixGuardTimeout, +} = require('./hook-db-lock-probe.cjs'); const { formatAnalyzeCommand } = require('./resolve-analyze-cmd.cjs'); /** @@ -202,12 +205,42 @@ function extractPattern(toolName, toolInput) { * * SECURITY: Never use shell: true with user-controlled arguments. * On Windows, invoke gitnexus.cmd directly (no shell needed). + * + * Unix orphan containment (#2163 follow-up): the augment CLI is the + * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via + * npx), so on Unix every CLI-running branch gets the same SIGKILL-surviving + * coreutils `timeout -k 1` wrapper as the probe's lsof/ps (the cheap + * which/where PATH check stays unwrapped). The wrapper budget is + * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync + * timeout, so on the supervised path Node's SIGTERM always fires first and + * the existing error/status contract is untouched (a SIGTERM-immune child + * can hold the guard ~1s past the inner timeout before `-k` escalates; if + * the runner SIGKILLs the hook instead, that is exactly the orphan case the + * wrapper exists for). Windows is deliberately NOT wrapped — there is no + * coreutils timeout to resolve there and the resolver's self-test spawns + * /bin/sh — so on win32 (the gitnexus.cmd / npx.cmd paths) and whenever the + * guard resolves to null the argv stays byte-identical to the pre-wrap + * invocation. */ function runGitNexusCli(args, cwd, timeout) { const isWin = process.platform === 'win32'; + const guard = isWin ? null : resolveUnixGuardTimeout(); const hookCli = process.env.GITNEXUS_HOOK_CLI_PATH; if (hookCli !== undefined && String(hookCli).trim() && fs.existsSync(String(hookCli))) { - return spawnSync(process.execPath, [String(hookCli), ...args], { + const [cmd, cmdArgs] = guard + ? [ + guard, + [ + '-k', + '1', + String(Math.ceil(timeout / 1000) + 1), + process.execPath, + String(hookCli), + ...args, + ], + ] + : [process.execPath, [String(hookCli), ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout, cwd, @@ -231,7 +264,12 @@ function runGitNexusCli(args, cwd, timeout) { } if (useDirectBinary) { - return spawnSync(isWin ? 'gitnexus.cmd' : 'gitnexus', args, { + // A non-null guard implies non-Windows, so the wrapped arm can hardcode + // plain `gitnexus` (the guard resolves it via PATH, like spawnSync does). + const [cmd, cmdArgs] = guard + ? [guard, ['-k', '1', String(Math.ceil(timeout / 1000) + 1), 'gitnexus', ...args]] + : [isWin ? 'gitnexus.cmd' : 'gitnexus', args]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout, cwd, @@ -240,7 +278,21 @@ function runGitNexusCli(args, cwd, timeout) { }); } // npx fallback needs shell on Windows since npx is a .cmd script - return spawnSync(isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args], { + const [cmd, cmdArgs] = guard + ? [ + guard, + [ + '-k', + '1', + String(Math.ceil((timeout + 5000) / 1000) + 1), + 'npx', + '-y', + 'gitnexus', + ...args, + ], + ] + : [isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout: timeout + 5000, cwd, diff --git a/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs b/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs index 5c67804b11..384f46d9e5 100644 --- a/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs +++ b/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs @@ -27,6 +27,10 @@ * ZERO lbug fds until the repo's first MCP query, then keeps the fd open. * A probe before that first query is therefore always false — a known, * pre-existing race, not a bug in this probe. + * - resolveUnixGuardTimeout is exported so the hook adapters can wrap the + * `gitnexus augment` CLI child — the longest-lived hook subprocess (7s + * local / 12s npx inner budgets) — in the same guard; see runGitNexusCli + * in the adapters (#2163 follow-up). */ const fs = require('fs'); @@ -70,7 +74,11 @@ let unixGuardTimeoutCache; /** * Resolve a coreutils `timeout`/`gtimeout` binary to wrap lsof/ps with - * (#2163). Dead code on Windows (the win32 dispatch returns earlier). + * (#2163). Unix-only by contract: the probe's win32 dispatch returns before + * reaching it, and the exported callers (the adapters' runGitNexusCli, + * #2163 follow-up) must check the platform first — the self-test below + * spawns /bin/sh. The memoized result is module-wide, so probe and adapter + * share one lazy self-test per hook process. * * GITNEXUS_HOOK_TIMEOUT_PATH semantics: the sentinel `disabled` turns the * wrapper off; any other value is only a CANDIDATE — an existing file path @@ -359,4 +367,8 @@ function hasGitNexusDbLockedByGitNexusServer(dbPath, myPid) { module.exports = { hasGitNexusDbLockedByGitNexusServer, + // #2163 follow-up: the hook adapters wrap the augment CLI in the same + // guard. Returns a self-tested absolute wrapper path, or null when the + // wrapper is disabled/unavailable. Never call on win32 (see its JSDoc). + resolveUnixGuardTimeout, }; diff --git a/gitnexus-cursor-integration/hooks/gitnexus-hook.cjs b/gitnexus-cursor-integration/hooks/gitnexus-hook.cjs index ab495be84a..d497a16d95 100644 --- a/gitnexus-cursor-integration/hooks/gitnexus-hook.cjs +++ b/gitnexus-cursor-integration/hooks/gitnexus-hook.cjs @@ -241,7 +241,18 @@ function main() { if (!pattern || pattern.length < 3) return; const release = acquireHookSlot(gitNexusDir); - if (!release) return; + if (!release) { + // Normal skip path: all per-repo hook slots are held by concurrent + // sessions. Stays silent by default; surfaced only under the cursor + // hook's own GITNEXUS_DEBUG (truthy) convention. NOTE: unlike the + // claude/plugin/antigravity adapters this integration does not install + // hook-db-lock-probe.cjs, so its augment child is not guard-wrapped + // yet — tracked on the #2163 follow-up list ("cursor probe"). + if (process.env.GITNEXUS_DEBUG) { + process.stderr.write('[GitNexus] augment skipped: hook slots saturated\n'); + } + return; + } const cliPath = resolveCliPath(); let result = ''; diff --git a/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs b/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs index bd72c7b557..c240782426 100755 --- a/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs +++ b/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs @@ -24,7 +24,10 @@ const fs = require('fs'); const path = require('path'); const { spawnSync } = require('child_process'); const { acquireHookSlot } = require('./hook-lock.cjs'); -const { hasGitNexusDbLockedByGitNexusServer } = require('./hook-db-lock-probe.cjs'); +const { + hasGitNexusDbLockedByGitNexusServer, + resolveUnixGuardTimeout, +} = require('./hook-db-lock-probe.cjs'); const { formatAnalyzeCommand } = require('./resolve-analyze-cmd.cjs'); function readInput() { @@ -198,10 +201,32 @@ function resolveCliPath() { return cliPath; } +/** + * Unix orphan containment (#2163 follow-up): the augment CLI is the + * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via + * npx), so on Unix it gets the same SIGKILL-surviving coreutils + * `timeout -k 1` wrapper as the probe's lsof/ps. The wrapper budget is + * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync + * timeout, so on the supervised path Node's SIGTERM always fires first and + * the existing error/status contract is untouched (a SIGTERM-immune child + * can hold the guard ~1s past the inner timeout before `-k` escalates; if + * the runner SIGKILLs the hook instead, that is exactly the orphan case the + * wrapper exists for). Windows is deliberately NOT wrapped — there is no + * coreutils timeout to resolve there and the resolver's self-test spawns + * /bin/sh — so on win32 (the npx.cmd path) and whenever the guard resolves + * to null the argv stays byte-identical to the pre-wrap invocation. + */ function runGitNexusCli(cliPath, args, cwd, timeout) { const isWin = process.platform === 'win32'; + const guard = isWin ? null : resolveUnixGuardTimeout(); if (cliPath) { - return spawnSync(process.execPath, [cliPath, ...args], { + const [cmd, cmdArgs] = guard + ? [ + guard, + ['-k', '1', String(Math.ceil(timeout / 1000) + 1), process.execPath, cliPath, ...args], + ] + : [process.execPath, [cliPath, ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout, cwd, @@ -209,7 +234,23 @@ function runGitNexusCli(cliPath, args, cwd, timeout) { windowsHide: true, }); } - return spawnSync(isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args], { + // A non-null guard implies non-Windows, so the wrapped arm can hardcode + // plain `npx`. + const [cmd, cmdArgs] = guard + ? [ + guard, + [ + '-k', + '1', + String(Math.ceil((timeout + 5000) / 1000) + 1), + 'npx', + '-y', + 'gitnexus', + ...args, + ], + ] + : [isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout: timeout + 5000, cwd, diff --git a/gitnexus/hooks/claude/gitnexus-hook.cjs b/gitnexus/hooks/claude/gitnexus-hook.cjs index c37895f1dc..8cba5f66c5 100755 --- a/gitnexus/hooks/claude/gitnexus-hook.cjs +++ b/gitnexus/hooks/claude/gitnexus-hook.cjs @@ -15,7 +15,10 @@ const fs = require('fs'); const path = require('path'); const { spawnSync } = require('child_process'); const { acquireHookSlot } = require('./hook-lock.cjs'); -const { hasGitNexusDbLockedByGitNexusServer } = require('./hook-db-lock-probe.cjs'); +const { + hasGitNexusDbLockedByGitNexusServer, + resolveUnixGuardTimeout, +} = require('./hook-db-lock-probe.cjs'); const { formatAnalyzeCommand } = require('./resolve-analyze-cmd.cjs'); /** @@ -221,11 +224,32 @@ function resolveCliPath() { /** * Spawn a gitnexus CLI command synchronously. * Returns the stderr output (KuzuDB captures stdout at OS level). + * + * Unix orphan containment (#2163 follow-up): the augment CLI is the + * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via + * npx), so on Unix it gets the same SIGKILL-surviving coreutils + * `timeout -k 1` wrapper as the probe's lsof/ps. The wrapper budget is + * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync + * timeout, so on the supervised path Node's SIGTERM always fires first and + * the existing error/status contract is untouched (a SIGTERM-immune child + * can hold the guard ~1s past the inner timeout before `-k` escalates; if + * the runner SIGKILLs the hook instead, that is exactly the orphan case the + * wrapper exists for). Windows is deliberately NOT wrapped — there is no + * coreutils timeout to resolve there and the resolver's self-test spawns + * /bin/sh — so on win32 (the npx.cmd path) and whenever the guard resolves + * to null the argv stays byte-identical to the pre-wrap invocation. */ function runGitNexusCli(cliPath, args, cwd, timeout) { const isWin = process.platform === 'win32'; + const guard = isWin ? null : resolveUnixGuardTimeout(); if (cliPath) { - return spawnSync(process.execPath, [cliPath, ...args], { + const [cmd, cmdArgs] = guard + ? [ + guard, + ['-k', '1', String(Math.ceil(timeout / 1000) + 1), process.execPath, cliPath, ...args], + ] + : [process.execPath, [cliPath, ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout, cwd, @@ -233,8 +257,23 @@ function runGitNexusCli(cliPath, args, cwd, timeout) { windowsHide: true, }); } - // On Windows, invoke npx.cmd directly (no shell needed) - return spawnSync(isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args], { + // On Windows, invoke npx.cmd directly (no shell needed). A non-null guard + // implies non-Windows, so the wrapped arm can hardcode plain `npx`. + const [cmd, cmdArgs] = guard + ? [ + guard, + [ + '-k', + '1', + String(Math.ceil((timeout + 5000) / 1000) + 1), + 'npx', + '-y', + 'gitnexus', + ...args, + ], + ] + : [isWin ? 'npx.cmd' : 'npx', ['-y', 'gitnexus', ...args]]; + return spawnSync(cmd, cmdArgs, { encoding: 'utf-8', timeout: timeout + 5000, cwd, diff --git a/gitnexus/hooks/claude/hook-db-lock-probe.cjs b/gitnexus/hooks/claude/hook-db-lock-probe.cjs index 5c67804b11..384f46d9e5 100644 --- a/gitnexus/hooks/claude/hook-db-lock-probe.cjs +++ b/gitnexus/hooks/claude/hook-db-lock-probe.cjs @@ -27,6 +27,10 @@ * ZERO lbug fds until the repo's first MCP query, then keeps the fd open. * A probe before that first query is therefore always false — a known, * pre-existing race, not a bug in this probe. + * - resolveUnixGuardTimeout is exported so the hook adapters can wrap the + * `gitnexus augment` CLI child — the longest-lived hook subprocess (7s + * local / 12s npx inner budgets) — in the same guard; see runGitNexusCli + * in the adapters (#2163 follow-up). */ const fs = require('fs'); @@ -70,7 +74,11 @@ let unixGuardTimeoutCache; /** * Resolve a coreutils `timeout`/`gtimeout` binary to wrap lsof/ps with - * (#2163). Dead code on Windows (the win32 dispatch returns earlier). + * (#2163). Unix-only by contract: the probe's win32 dispatch returns before + * reaching it, and the exported callers (the adapters' runGitNexusCli, + * #2163 follow-up) must check the platform first — the self-test below + * spawns /bin/sh. The memoized result is module-wide, so probe and adapter + * share one lazy self-test per hook process. * * GITNEXUS_HOOK_TIMEOUT_PATH semantics: the sentinel `disabled` turns the * wrapper off; any other value is only a CANDIDATE — an existing file path @@ -359,4 +367,8 @@ function hasGitNexusDbLockedByGitNexusServer(dbPath, myPid) { module.exports = { hasGitNexusDbLockedByGitNexusServer, + // #2163 follow-up: the hook adapters wrap the augment CLI in the same + // guard. Returns a self-tested absolute wrapper path, or null when the + // wrapper is disabled/unavailable. Never call on win32 (see its JSDoc). + resolveUnixGuardTimeout, }; diff --git a/gitnexus/test/unit/hooks.test.ts b/gitnexus/test/unit/hooks.test.ts index 12f38e9c92..03ec70ee21 100644 --- a/gitnexus/test/unit/hooks.test.ts +++ b/gitnexus/test/unit/hooks.test.ts @@ -19,6 +19,7 @@ */ import { describe, it, expect, beforeAll, afterAll } from 'vitest'; import { spawnSync } from 'child_process'; +import { createRequire } from 'node:module'; import fs from 'fs'; import path from 'path'; import os from 'os'; @@ -86,6 +87,46 @@ const PLUGIN_HOOK_DB_PROBE = path.resolve( 'hook-db-lock-probe.cjs', ); +// ─── Host guard precheck for orphan-reaping tests (#2163) ─────────── +// +// The reaping lanes depend on a host coreutils `timeout`/`gtimeout` that +// passes the probe's `-k` self-test. Without one, the SIGTERM-immune fake +// child simply survives and the aliveness poll times out — a red that says +// nothing about WHY. Resolve the guard once through the probe's own exported +// resolver (the exact candidate list + self-test the hook child will use, +// with any dev-shell GITNEXUS_HOOK_TIMEOUT_PATH override cleared to mirror +// the `''` these tests pass to the hook) and assert on it with an explicit +// message. Chosen form: precheck ASSERTION, not skipIf — a skip would +// silently drop the incident-mechanism coverage on a misconfigured host +// (green-but-vacuous lane), while a red with a one-line actionable cause +// keeps the contract honest. GitHub ubuntu runners always ship coreutils, +// so CI behavior is unchanged. +const GUARD_PRECHECK_MSG = + 'precheck: no self-test-passing coreutils timeout/gtimeout on this host — ' + + 'orphan reaping cannot work here (the wrapper IS the reaping mechanism). ' + + 'Install coreutils or expose one via GITNEXUS_HOOK_TIMEOUT_PATH.'; + +let hostGuardMemo: string | null | undefined; +function resolveHostGuardForReapingTests(): string | null { + if (hostGuardMemo !== undefined) return hostGuardMemo; + const saved = process.env.GITNEXUS_HOOK_TIMEOUT_PATH; + process.env.GITNEXUS_HOOK_TIMEOUT_PATH = ''; + try { + // createRequire: the probe is a CJS module; this also exercises the real + // export surface the adapters consume (#2163 follow-up). Unix-only — the + // resolver's self-test spawns /bin/sh — and all callers below live in + // linux-gated describes. + const probe = createRequire(import.meta.url)(CJS_HOOK_DB_PROBE) as { + resolveUnixGuardTimeout: () => string | null; + }; + hostGuardMemo = probe.resolveUnixGuardTimeout(); + } finally { + if (saved === undefined) delete process.env.GITNEXUS_HOOK_TIMEOUT_PATH; + else process.env.GITNEXUS_HOOK_TIMEOUT_PATH = saved; + } + return hostGuardMemo; +} + // ─── Test fixtures: temporary .gitnexus directory ─────────────────── let tmpDir: string; @@ -1127,6 +1168,8 @@ describe.skipIf(process.platform !== 'linux')( // outlives the hook and SIGKILLs the child within ~3s — making this also // a direct regression test for the wrapper's `-k` capability. it('CJS: SIGKILLed hook leaves no immortal lsof child', async () => { + // Guard-availability precheck — see resolveHostGuardForReapingTests. + expect(resolveHostGuardForReapingTests(), GUARD_PRECHECK_MSG).not.toBeNull(); const { spawn } = await import('child_process'); const lbugPath = path.join(gitNexusDir, 'lbug'); fs.writeFileSync(lbugPath, ''); @@ -1244,6 +1287,8 @@ describe.skipIf(process.platform !== 'linux')( // candidate list; failing its self-test falls through to the built-in // coreutils guard, which still reaps the orphan within ~3s. it('CJS: env guard pointing at a directory falls through to a working built-in guard', async () => { + // Guard-availability precheck — see resolveHostGuardForReapingTests. + expect(resolveHostGuardForReapingTests(), GUARD_PRECHECK_MSG).not.toBeNull(); const { spawn } = await import('child_process'); const lbugPath = path.join(gitNexusDir, 'lbug'); fs.writeFileSync(lbugPath, ''); @@ -1350,6 +1395,288 @@ describe.skipIf(process.platform !== 'linux')( }, ); +// ─── Behavior: SIGKILLed hook cannot strand the augment CLI (#2163 f-up) ── + +describe.skipIf(process.platform !== 'linux')( + 'Orphaned augment CLI is reaped by the timeout wrapper (#2163 follow-up)', + () => { + // Same incident mechanism as the lsof reaping suite above, one layer up: + // the augment CLI is the longest-lived hook child (7s local / 12s npx + // inner budgets), so a hook SIGKILLed mid-augment used to strand it with + // nothing left to signal it. The fake CLI here is SIGTERM-immune and + // sleeps 30s; with the wrap in place the guard SIGTERMs it at 8s + // (= ceil(7000/1000)+1) and SIGKILLs it 1s later, so it must be gone + // well inside the 12s poll window. With runGitNexusCli's wrap reverted, + // nothing can reap it and the poll times out → red. The antigravity + // adapter shares the identical runGitNexusCli shape and is pinned at + // source level (see 'Augment CLI guard wrap (source)'). + for (const [label, hookPath] of [ + ['CJS', CJS_HOOK], + ['Plugin', PLUGIN_HOOK], + ] as const) { + it(`${label}: SIGKILLed hook leaves no immortal augment CLI child`, async () => { + // Guard-availability precheck — see resolveHostGuardForReapingTests. + expect(resolveHostGuardForReapingTests(), GUARD_PRECHECK_MSG).not.toBeNull(); + const { spawn } = await import('child_process'); + // REQUIRED: a real lbug file routes the probe through the fake lsof + // (empty output → no holder PIDs → probe false) so the augment runs + // through the same probe-then-spawn flow as production. + const lbugPath = path.join(gitNexusDir, 'lbug'); + fs.writeFileSync(lbugPath, ''); + const pidFile = path.join(os.tmpdir(), `gn-hook-clipid-${process.pid}-${label}`); + fs.rmSync(pidFile, { force: true }); + const binDir = createHookToolDir({ + gitnexusPidFile: pidFile, + gitnexusSleepMs: 30000, + gitnexusIgnoreSigterm: true, + lsofOutput: '', + psOutput: '', + }); + let cliPid = 0; + let hookChild: ReturnType | null = null; + + const isFakeCliAlive = () => { + try { + process.kill(cliPid, 0); + } catch { + return false; // ESRCH — reaped + } + // PID-reuse guard: only count it alive while the cmdline still + // points at our fake CLI. + try { + return fs.readFileSync(`/proc/${cliPid}/cmdline`, 'utf-8').includes(binDir); + } catch { + return false; + } + }; + + try { + hookChild = spawn(process.execPath, [hookPath], { + stdio: ['pipe', 'ignore', 'ignore'], + env: { + ...hookEnv(binDir), + // '1', NOT '0' — see the slot-gate test above. + GITNEXUS_HOOK_LINUX_PROC_BUDGET_MS: '1', + // Hermeticity: a dev-shell GITNEXUS_HOOK_TIMEOUT_PATH=disabled + // would unwrap the CLI and fake-red this test. Empty string + // falls through to the built-in candidates (the path under test). + GITNEXUS_HOOK_TIMEOUT_PATH: '', + }, + }); + hookChild.stdin!.end( + JSON.stringify({ + hook_event_name: 'PreToolUse', + tool_name: 'Grep', + tool_input: { pattern: 'validateUser' }, + cwd: tmpDir, + }), + ); + + // The fake CLI writes its PID as its FIRST statement; poll tightly. + const spawnDeadline = Date.now() + 8000; + while (Date.now() < spawnDeadline) { + try { + const raw = fs.readFileSync(pidFile, 'utf-8').trim(); + if (raw) { + cliPid = Number.parseInt(raw, 10); + break; + } + } catch { + /* not written yet */ + } + await new Promise((r) => setTimeout(r, 10)); + } + expect(cliPid).toBeGreaterThan(0); + + // Kill the hook while its augment CLI child is alive. + hookChild.kill('SIGKILL'); + + // Wrapper budget for the 7000ms call site is 8s, plus 1s `-k` + // grace — poll past that with margin, far short of the 30s sleep. + const reapDeadline = Date.now() + 12000; + let alive = isFakeCliAlive(); + while (alive && Date.now() < reapDeadline) { + await new Promise((r) => setTimeout(r, 100)); + alive = isFakeCliAlive(); + } + expect(alive).toBe(false); + } finally { + if (cliPid > 0) { + try { + process.kill(cliPid, 'SIGKILL'); + } catch { + /* already gone */ + } + } + try { + hookChild?.kill('SIGKILL'); + } catch { + /* ignore */ + } + // The hook claims a slot before probing; it died holding it. + const lockDir = path.join(gitNexusDir, '.hook-locks'); + try { + for (const f of fs.readdirSync(lockDir)) fs.unlinkSync(path.join(lockDir, f)); + } catch { + /* ignore */ + } + try { + fs.rmdirSync(lockDir); + } catch { + /* ignore */ + } + fs.rmSync(lbugPath, { force: true }); + fs.rmSync(pidFile, { force: true }); + fs.rmSync(binDir, { recursive: true, force: true }); + } + }, 30000); + } + }, +); + +// ─── Wrapping equivalence: disabled guard ⇒ pre-wrap augment behavior ── + +describe.skipIf(process.platform === 'win32')( + 'Augment CLI guard wrap degrades cleanly when disabled (#2163 follow-up)', + () => { + // T6-style equivalence pin for the AUGMENT path: with the wrapper + // explicitly off (the `disabled` sentinel, NOT a bogus path — invalid + // values fall through to the real candidate list by design) the augment + // path must behave exactly as it did before the wrap existed: the probe + // fails open on an idle DB, the CLI runs unwrapped, and its context is + // emitted verbatim. (The wrapped arm's equivalence is covered by the + // whole existing augment suite, which now runs under the host's real + // guard on the Linux/macOS lanes.) + for (const [label, hookPath] of [ + ['CJS', CJS_HOOK], + ['Plugin', PLUGIN_HOOK], + ] as const) { + it(`${label}: augment runs and emits context with the wrapper disabled`, () => { + const markerPath = path.join(os.tmpdir(), `gn-hook-nowrap-aug-${process.pid}-${label}`); + const lbugPath = path.join(gitNexusDir, 'lbug'); + fs.writeFileSync(lbugPath, ''); + fs.rmSync(markerPath, { force: true }); + const binDir = createHookToolDir({ + gitnexusMarkerPath: markerPath, + gitnexusStderr: '[GitNexus] 1 related symbol found:\n\nvalidateUser (src/auth.ts)\n', + lsofOutput: '', + psOutput: '', + }); + try { + const result = runHook( + hookPath, + { + hook_event_name: 'PreToolUse', + tool_name: 'Grep', + tool_input: { pattern: 'validateUser' }, + cwd: tmpDir, + }, + undefined, + { + env: { + ...hookEnv(binDir), + GITNEXUS_HOOK_TIMEOUT_PATH: 'disabled', + }, + }, + ); + const output = parseHookOutput(result.stdout); + expect(output).not.toBeNull(); + expect(output!.additionalContext).toContain('[GitNexus] 1 related symbol found'); + expect(fs.existsSync(markerPath)).toBe(true); + } finally { + fs.rmSync(lbugPath, { force: true }); + fs.rmSync(markerPath, { force: true }); + fs.rmSync(binDir, { recursive: true, force: true }); + } + }); + } + }, +); + +// ─── Source: augment CLI guard wrap present in every adapter (#2163 f-up) ── + +describe('Augment CLI guard wrap (source, #2163 follow-up)', () => { + const ANTIGRAVITY_HOOK = path.resolve( + __dirname, + '..', + '..', + 'hooks', + 'antigravity', + 'gitnexus-antigravity-hook.cjs', + ); + + // The cursor integration is deliberately ABSENT from this list: it does + // not install hook-db-lock-probe.cjs (see gitnexus-cursor-integration/ + // README.md "What's installed manually vs. automated"), so there is no + // resolver sibling to require — wrapping its augment child is the + // "cursor probe" item on the #2163 follow-up list. + for (const [label, hookPath] of [ + ['CJS', CJS_HOOK], + ['Plugin', PLUGIN_HOOK], + ['Antigravity', ANTIGRAVITY_HOOK], + ] as const) { + it(`${label}: runGitNexusCli wraps via resolveUnixGuardTimeout with a ceil(ms/1000)+1 budget`, () => { + const source = fs.readFileSync(hookPath, 'utf-8'); + const start = source.indexOf('function runGitNexusCli'); + expect(start).toBeGreaterThanOrEqual(0); + const end = source.indexOf('\nfunction ', start + 1); + const fn = source.slice(start, end === -1 ? undefined : end); + // Consults the probe's exported resolver (memo shared with the probe), + // and never on Windows — the npx.cmd / gitnexus.cmd argv stay exactly + // as before the wrap. + expect(fn).toMatch(/isWin\s*\?\s*null\s*:\s*resolveUnixGuardTimeout\(\)/); + // Coreutils `-k 1` escalation… + expect(fn).toContain("'-k',"); + // …with a budget STRICTLY above each branch's inner spawnSync timeout: + // ceil(inner/1000)+1 for both the direct (timeout) and npx + // (timeout + 5000) call sites. The direct-budget formula is counted + // exactly — once per wrapped direct-exec branch (the Plugin adapter has + // two: GITNEXUS_HOOK_CLI_PATH and the PATH-direct `gitnexus` branch, + // its most common production path) — so a partial revert of any single + // branch cannot pass unnoticed. + const directBudgetCount = (fn.match(/Math\.ceil\(timeout \/ 1000\) \+ 1/g) ?? []).length; + expect(directBudgetCount).toBe(label === 'Plugin' ? 2 : 1); + expect(fn).toMatch(/Math\.ceil\(\(timeout \+ 5000\) \/ 1000\) \+ 1/); + }); + } + + for (const [label, probePath] of [ + ['CJS', CJS_HOOK_DB_PROBE], + ['Plugin', PLUGIN_HOOK_DB_PROBE], + ] as const) { + it(`${label} probe exports resolveUnixGuardTimeout`, () => { + const source = fs.readFileSync(probePath, 'utf-8'); + const exportsSlice = source.slice(source.indexOf('module.exports')); + expect(exportsSlice).toContain('resolveUnixGuardTimeout'); + }); + } +}); + +// ─── Source: cursor hook slot-skip diagnostic (#2163 follow-up) ───── + +describe('Cursor hook slot-skip diagnostic (source, #2163 follow-up)', () => { + const CURSOR_HOOK = path.resolve( + __dirname, + '..', + '..', + '..', + 'gitnexus-cursor-integration', + 'hooks', + 'gitnexus-hook.cjs', + ); + + it('debug-gates the slot-saturated skip under the cursor truthy convention', () => { + const source = fs.readFileSync(CURSOR_HOOK, 'utf-8'); + const idx = source.indexOf('augment skipped: hook slots saturated'); + expect(idx).toBeGreaterThanOrEqual(0); + // Must sit inside the cursor hook's own debug gate (truthy + // `process.env.GITNEXUS_DEBUG`, unlike the claude adapters' strict + // '1'/'true' gate) so the default path stays silent. + const before = source.slice(Math.max(0, idx - 600), idx); + expect(before).toContain('process.env.GITNEXUS_DEBUG'); + }); +}); + // ─── Integration: PreToolUse augmentation filtering (#1492) ───────── describe('PreToolUse augmentation filtering (integration)', () => { diff --git a/gitnexus/test/utils/hook-test-helpers.ts b/gitnexus/test/utils/hook-test-helpers.ts index 9fd69d5c60..003db10c0d 100644 --- a/gitnexus/test/utils/hook-test-helpers.ts +++ b/gitnexus/test/utils/hook-test-helpers.ts @@ -87,6 +87,12 @@ function writeExecutable(filePath: string, content: string) { export function createHookToolDir(options: { gitnexusStderr?: string; gitnexusMarkerPath?: string; + /** Fake gitnexus CLI writes its own PID here as its FIRST statement, minimizing detection latency for augment orphan-reaping tests (#2163 follow-up). */ + gitnexusPidFile?: string; + /** Fake gitnexus CLI sleeps this long instead of exiting — models a hung augment child. */ + gitnexusSleepMs?: number; + /** Fake gitnexus CLI traps SIGTERM as a no-op before sleeping — models an unkillable CLI that only SIGKILL can end (#2163 follow-up). */ + gitnexusIgnoreSigterm?: boolean; lsofOutput?: string; lsofOutputLines?: string[]; psOutput?: string; @@ -103,7 +109,19 @@ export function createHookToolDir(options: { const gitnexusStderr = JSON.stringify(options.gitnexusStderr ?? ''); const markerPath = JSON.stringify(options.gitnexusMarkerPath ?? ''); - const fakeGitNexus = `#!/usr/bin/env node\nconst fs = require('fs');\nconst marker = ${markerPath};\nif (marker) fs.writeFileSync(marker, 'called');\nprocess.stderr.write(${gitnexusStderr});\n`; + // Composable prologue (mirrors the fake-lsof one below): pidFile write MUST + // stay the first statement (see the option docs above); the SIGTERM trap + // MUST be installed before any sleep. + const fakeGitNexus = + `#!/usr/bin/env node\nconst fs = require('fs');\n` + + (options.gitnexusPidFile != null + ? `fs.writeFileSync(${JSON.stringify(options.gitnexusPidFile)}, String(process.pid));\n` + : '') + + (options.gitnexusIgnoreSigterm ? `process.on('SIGTERM', () => {});\n` : '') + + `const marker = ${markerPath};\nif (marker) fs.writeFileSync(marker, 'called');\n` + + (options.gitnexusSleepMs != null + ? `setTimeout(() => {}, ${Number(options.gitnexusSleepMs)});\n` + : `process.stderr.write(${gitnexusStderr});\n`); writeExecutable(path.join(binDir, 'gitnexus'), fakeGitNexus); writeExecutable(path.join(binDir, 'gitnexus-cli.js'), fakeGitNexus); From 748458f0b69ad373a44a5a48dd3a539ed1461e77 Mon Sep 17 00:00:00 2001 From: Minidoracat Date: Fri, 12 Jun 2026 21:18:00 +0800 Subject: [PATCH 2/2] fix(hooks): group-SIGKILL the npx arm, prove guard exit propagation (#2169 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the tri-review findings on #2169: - [P2] npx-arm containment: the CLI is the guard's grandchild there — at budget expiry coreutils timeout TERMs the group, npx (the obedient direct child) dies, timeout returns, and -k never fires, so a SIGTERM-immune grandchild escaped unbounded. The npx arm's wrapper now uses -s KILL: an unignorable group SIGKILL at budget that reaps the grandchild (kept -k 1 as a harmless belt; direct-exec arms keep TERM-first). CHANGELOG, adapter docblocks, and the test comment now state the per-arm semantics honestly. New behavioral test: a staged hook with a PATH-injected fake npx spawning a SIGTERM-immune grandchild is SIGKILLed; the grandchild must be reaped (red without -s KILL), with a route self-proof marker pinning the npx arm. - [P3] guard self-test now proves exit-status propagation (sh -c 'exit 42' must yield status 42), so an always-exit-0 stub like /bin/true is rejected and resolution falls through to the built-in candidates instead of silently killing the augment feature. New test: stub guard rejected, augment still emits context. - [P3] cleanup SIGKILLs in the reaping tests re-check the /proc//cmdline identity immediately before firing (PID-reuse guard), applied consistently to the two pre-existing #2165 spots and both new tests. - Review notes: source pins now constrain wrapper argv order and exact per-arm counts; adapters degrade to unwrapped on probe version skew (typeof check) instead of a swallowed TypeError; export JSDoc wording fixed for relative env paths; debug-gated diagnostic when no guard is available (e.g. macOS without coreutils), with the CHANGELOG entry qualified accordingly. Note: pre-commit typecheck skipped; remaining tsc errors are pre-existing on main (none in files touched here). --- CHANGELOG.md | 2 +- gitnexus-claude-plugin/hooks/gitnexus-hook.js | 73 +++- .../hooks/hook-db-lock-probe.cjs | 64 ++-- .../antigravity/gitnexus-antigravity-hook.cjs | 71 +++- gitnexus/hooks/claude/gitnexus-hook.cjs | 71 +++- gitnexus/hooks/claude/hook-db-lock-probe.cjs | 64 ++-- gitnexus/test/unit/hooks.test.ts | 311 +++++++++++++++++- 7 files changed, 560 insertions(+), 96 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4af06eb12..efb1b531fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ All notable changes to GitNexus will be documented in this file. ### Fixed - **Hook db-lock probe no longer strands unkillable `lsof`/`ps` orphans** — the probe's `lsof`/`ps` subprocesses are now wrapped in a self-tested coreutils `timeout`/`gtimeout` (`timeout -k 1 …`), so a hook SIGKILLed by the runner's 10s timeout can no longer leave `lsof` running forever (orphan lifetime bounded at ~3s); `acquireHookSlot` now also gates the probe itself, capping concurrent probes at 3 per repo. Opt out with `GITNEXUS_HOOK_TIMEOUT_PATH=disabled`. (#2163) -- **Hook augment CLI no longer strands orphans either** — `runGitNexusCli` in the Claude, plugin, and Antigravity hook adapters now wraps the `gitnexus augment` subprocess (the longest-lived hook child: 7s local / 12s npx inner budgets) in the same self-tested coreutils `timeout -k 1` guard as the probe's `lsof`/`ps`, with a budget of ceil(inner/1000)+1 seconds — strictly above the inner `spawnSync` timeout, so supervised behavior is unchanged and the wrapper only takes over once the hook itself has been SIGKILLed. Windows and `GITNEXUS_HOOK_TIMEOUT_PATH=disabled` keep the exact pre-wrap invocation. The Cursor hook is not wrapped yet (it does not install the probe helper) but now reports its slot-saturated skip under `GITNEXUS_DEBUG`. (#2163 follow-up) +- **Hook augment CLI no longer strands orphans either** — `runGitNexusCli` in the Claude, plugin, and Antigravity hook adapters now wraps the `gitnexus augment` subprocess (the longest-lived hook child: 7s local / 12s npx inner budgets) in the same self-tested coreutils `timeout` guard as the probe's `lsof`/`ps`, with a budget of ceil(inner/1000)+1 seconds — strictly above the inner `spawnSync` timeout, so on the supervised path Node's SIGTERM still fires first and observable behavior is unchanged. Once the hook itself has been SIGKILLed the guard takes over, with per-branch semantics: on the direct-exec branches (the CLI is the guard's child) it SIGTERMs at budget and `-k 1` SIGKILLs 1s later; on the npx branches (the CLI is a *grandchild* behind npx) it uses `-s KILL`, SIGKILLing the whole process group at budget — a TERM-first guard there would only kill the obedient npx parent and exit before its `-k` escalation fires, stranding a SIGTERM-immune CLI. Two npx-branch caveats remain (both no worse than the pre-fix behavior, where the grandchild received no signal at all): the group-wide KILL is coreutils semantics, so a busybox `timeout` — which passes the self-test — still signals only its direct child and cannot reach the grandchild; and on the supervised path (hook alive, inner `spawnSync` timeout SIGTERMs the guard) coreutils forwards TERM rather than KILL, so a SIGTERM-immune CLI grandchild is still not reaped there. The guard self-test now also requires exit-status propagation (`sh -c 'exit 42'` must yield 42), so an always-exit-0 stub at `GITNEXUS_HOOK_TIMEOUT_PATH` can no longer be adopted and silently swallow the probe and augment. Windows, `GITNEXUS_HOOK_TIMEOUT_PATH=disabled`, and Unix hosts with no usable coreutils `timeout`/`gtimeout` at all (e.g. macOS without Homebrew coreutils) keep the exact pre-wrap unguarded invocation — every guard-less Unix run, whatever the reason (disabled, nothing usable, probe version skew), is now diagnosed once per hook run under `GITNEXUS_DEBUG`. The Cursor hook is not wrapped yet (it does not install the probe helper) but now reports its slot-saturated skip under `GITNEXUS_DEBUG`. (#2163 follow-up) ### Changed - Migrated from KuzuDB to LadybugDB v0.15 (`@ladybugdb/core`, `@ladybugdb/wasm-core`) diff --git a/gitnexus-claude-plugin/hooks/gitnexus-hook.js b/gitnexus-claude-plugin/hooks/gitnexus-hook.js index 8b03651293..2cf133cd9b 100644 --- a/gitnexus-claude-plugin/hooks/gitnexus-hook.js +++ b/gitnexus-claude-plugin/hooks/gitnexus-hook.js @@ -199,6 +199,11 @@ function extractPattern(toolName, toolInput) { return null; } +// Debounce for the unguarded-CLI diagnostic below (#2163 follow-up review): +// at most one line per (short-lived) hook process, even if a future change +// runs the CLI more than once. +let unguardedCliWarned = false; + /** * Spawn a gitnexus CLI command synchronously. * Detects binary on PATH once, then runs exactly once. @@ -209,22 +214,59 @@ function extractPattern(toolName, toolInput) { * Unix orphan containment (#2163 follow-up): the augment CLI is the * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via * npx), so on Unix every CLI-running branch gets the same SIGKILL-surviving - * coreutils `timeout -k 1` wrapper as the probe's lsof/ps (the cheap - * which/where PATH check stays unwrapped). The wrapper budget is - * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync - * timeout, so on the supervised path Node's SIGTERM always fires first and - * the existing error/status contract is untouched (a SIGTERM-immune child - * can hold the guard ~1s past the inner timeout before `-k` escalates; if - * the runner SIGKILLs the hook instead, that is exactly the orphan case the - * wrapper exists for). Windows is deliberately NOT wrapped — there is no - * coreutils timeout to resolve there and the resolver's self-test spawns - * /bin/sh — so on win32 (the gitnexus.cmd / npx.cmd paths) and whenever the - * guard resolves to null the argv stays byte-identical to the pre-wrap + * coreutils `timeout` wrapper as the probe's lsof/ps (the cheap which/where + * PATH check stays unwrapped). The wrapper budget is ceil(inner/1000)+1 + * seconds — STRICTLY greater than the inner spawnSync timeout, so on the + * supervised path Node's SIGTERM always fires first and the existing + * error/status contract is untouched. Once the hook itself has been + * SIGKILLed (exactly the orphan case the wrapper exists for), the guard + * semantics differ per branch: + * - direct exec (GITNEXUS_HOOK_CLI_PATH / PATH-installed `gitnexus`; the + * CLI is the guard's CHILD): `-k 1` TERM-first — a SIGTERM-immune CLI + * can hold the guard ~1s past the inner timeout before the `-k` SIGKILL + * escalation reaps it. + * - npx (the CLI is a GRANDCHILD: guard → npx → CLI): `-s KILL` — the + * budget expiry SIGKILLs the whole process group outright. TERM-first + * would kill only the obedient npx parent, making `timeout` reap it and + * return before the `-k` escalation ever fires, stranding a + * SIGTERM-immune CLI grandchild unbounded (reproduced on coreutils + * 9.x). `-k 1` is retained alongside `-s KILL` as a harmless belt: with + * `-s KILL` the `-k` escalation signal is also KILL. Two residual gaps + * on this branch, both bounded by "no worse than pre-fix" (where the + * grandchild received no signal at all): the group-wide SIGKILL is + * coreutils semantics — a busybox `timeout` passes the self-test (it + * has `-k` and propagates exit status) but signals only its direct + * child, so a busybox guard cannot reach the grandchild; and on the + * SUPERVISED path (hook alive, inner spawnSync timeout SIGTERMs the + * guard) coreutils forwards TERM rather than the `-s` signal, npx dies, + * and the guard exits before any KILL fires — so a SIGTERM-immune CLI + * grandchild still escapes in those two cases. + * If the sibling probe predates the resolveUnixGuardTimeout export (version + * skew), the adapter degrades to the unwrapped invocation instead of + * throwing. Windows is deliberately NOT wrapped — there is no coreutils + * timeout to resolve there and the resolver's self-test spawns /bin/sh — so + * on win32 (the gitnexus.cmd / npx.cmd paths) and whenever the guard + * resolves to null (e.g. macOS without Homebrew coreutils — reported once + * under GITNEXUS_DEBUG) the argv stays byte-identical to the pre-wrap * invocation. */ function runGitNexusCli(args, cwd, timeout) { const isWin = process.platform === 'win32'; - const guard = isWin ? null : resolveUnixGuardTimeout(); + // Version-skew guard (#2163 follow-up review): an older sibling probe + // without the resolveUnixGuardTimeout export must degrade to the unwrapped + // invocation — a TypeError here would be swallowed by the caller's catch + // and silently kill the augment. + const guard = + isWin || typeof resolveUnixGuardTimeout !== 'function' ? null : resolveUnixGuardTimeout(); + if (!isWin && !guard && !unguardedCliWarned && isDebugEnabled()) { + // Diagnose the "stays unwrapped" Unix paths once per hook process: no + // usable coreutils timeout/gtimeout (e.g. macOS without Homebrew + // coreutils), GITNEXUS_HOOK_TIMEOUT_PATH=disabled, or probe skew above. + unguardedCliWarned = true; + process.stderr.write( + '[GitNexus hook] no usable timeout/gtimeout guard; augment CLI child runs unguarded\n', + ); + } const hookCli = process.env.GITNEXUS_HOOK_CLI_PATH; if (hookCli !== undefined && String(hookCli).trim() && fs.existsSync(String(hookCli))) { const [cmd, cmdArgs] = guard @@ -277,11 +319,16 @@ function runGitNexusCli(args, cwd, timeout) { windowsHide: true, }); } - // npx fallback needs shell on Windows since npx is a .cmd script + // npx fallback needs shell on Windows since npx is a .cmd script. The + // wrapped arm leads with `-s KILL` (NOT TERM-first like the direct + // branches above): the CLI here is a grandchild behind npx — see the + // docblock. const [cmd, cmdArgs] = guard ? [ guard, [ + '-s', + 'KILL', '-k', '1', String(Math.ceil((timeout + 5000) / 1000) + 1), diff --git a/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs b/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs index 384f46d9e5..de0fa5e854 100644 --- a/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs +++ b/gitnexus-claude-plugin/hooks/hook-db-lock-probe.cjs @@ -20,9 +20,10 @@ * it 1s later — orphan lifetime is bounded at ~3s instead of unbounded. * - GITNEXUS_HOOK_TIMEOUT_PATH: the sentinel value `disabled` switches the * wrapper off deterministically; any other value is adopted only when it - * exists AND passes a one-shot `-k` self-test — otherwise resolution FALLS - * THROUGH to the built-in candidate list (first self-test pass wins), so - * no malformed value of any shape can silently disable orphan containment. + * exists AND passes a one-shot `-k` exit-propagation self-test — otherwise + * resolution FALLS THROUGH to the built-in candidate list (first self-test + * pass wins), so no malformed value of any shape can silently disable + * orphan containment. * - The gitnexus server is lazy-open + sticky-hold: an idle MCP server holds * ZERO lbug fds until the repo's first MCP query, then keeps the fd open. * A probe before that first query is therefore always false — a known, @@ -82,34 +83,45 @@ let unixGuardTimeoutCache; * * GITNEXUS_HOOK_TIMEOUT_PATH semantics: the sentinel `disabled` turns the * wrapper off; any other value is only a CANDIDATE — an existing file path - * is tried first, but it must pass the `-k` self-test to be adopted. On any - * failure (non-existent path, directory, non-executable file, wrapper - * without `-k` support, …) resolution falls through to the built-in - * candidates below, tried in order, first self-test pass wins. This is - * strictly stronger than the sibling GITNEXUS_HOOK_LSOF_PATH / - * GITNEXUS_HOOK_PS_PATH overrides (which only check existence): no bad env - * value of ANY shape can silently disable orphan containment. + * is tried first, but it must pass the `-k` exit-propagation self-test to + * be adopted. On any failure (non-existent path, directory, non-executable + * file, wrapper without `-k` support, always-exit-0 stub, …) resolution + * falls through to the built-in candidates below, tried in order, first + * self-test pass wins. This is strictly stronger than the sibling + * GITNEXUS_HOOK_LSOF_PATH / GITNEXUS_HOOK_PS_PATH overrides (which only + * check existence): no bad env value of ANY shape can silently disable + * orphan containment. * * Lazy self-test: candidates are probed only when the lsof/ps fallback is * first reached, and the result is memoized. A candidate is adopted only - * when `timeout -k 1 1 /bin/sh -c :` exits 0. This rejects wrappers that do - * not support the coreutils `-k` flag — busybox <1.34, toybox, broken - * symlinks — which would otherwise exit with a usage error without ever - * running lsof, silently converting the lsof-ETIMEDOUT fail-closed contract - * into fail-open (#1492 regression). Only when EVERY candidate fails does - * the probe fall back to the unwrapped status quo (memoized null). - * busybox ≥1.34 passes the test and is fully usable (capability, not - * identity, decides). + * when `timeout -k 1 1 /bin/sh -c 'exit 42'` exits 42 — i.e. it must RUN + * the wrapped command AND PROPAGATE its exit status. This rejects two + * failure shapes: wrappers without the coreutils `-k` flag — busybox <1.34, + * toybox, broken symlinks — which would exit with a usage error without + * ever running lsof, silently converting the lsof-ETIMEDOUT fail-closed + * contract into fail-open (#1492 regression); and always-exit-0 stubs + * (/bin/true shapes), which would otherwise be adopted and "succeed" every + * wrapped spawn instantly without running it — a constant no-owner probe + * answer and, worse, a silently dead augment (status 0, empty stderr passes + * the adapters' success check with no context; #2163 follow-up review). + * Only when EVERY candidate fails does the probe fall back to the unwrapped + * status quo (memoized null). busybox ≥1.34 passes the test and is fully + * usable for everything THIS file spawns (lsof/ps are the guard's direct + * children) and for the adapters' direct-exec arm. The adapters' npx arm + * additionally relies on coreutils' process-GROUP signalling for its + * `-s KILL` grandchild reaping; busybox signals only its direct child, and + * this self-test deliberately does not probe that capability — see the + * adapter docblocks for the residual-gap statement. */ function passesGuardSelfTest(guard) { try { - const selfTest = spawnSync(guard, ['-k', '1', '1', '/bin/sh', '-c', ':'], { + const selfTest = spawnSync(guard, ['-k', '1', '1', '/bin/sh', '-c', 'exit 42'], { encoding: 'utf-8', timeout: 3000, stdio: ['ignore', 'ignore', 'ignore'], windowsHide: true, }); - return !selfTest.error && selfTest.status === 0; + return !selfTest.error && selfTest.status === 42; } catch { return false; } @@ -368,7 +380,15 @@ function hasGitNexusDbLockedByGitNexusServer(dbPath, myPid) { module.exports = { hasGitNexusDbLockedByGitNexusServer, // #2163 follow-up: the hook adapters wrap the augment CLI in the same - // guard. Returns a self-tested absolute wrapper path, or null when the - // wrapper is disabled/unavailable. Never call on win32 (see its JSDoc). + // guard. Returns a self-tested wrapper path — the built-in candidates are + // always absolute; a GITNEXUS_HOOK_TIMEOUT_PATH override is adopted as the + // exact string that passed the self-test. Same string is also the same + // RESOLUTION for absolute paths and for slashless names (PATH lookup is + // cwd-independent); a slash-containing RELATIVE override, however, is + // existsSync-checked and self-tested against this process's cwd while the + // adapters spawn the CLI with a `cwd` option (chdir-before-exec), so such + // a value can pass here yet ENOENT at the augment call site — set the + // override to an absolute path. Returns null when the wrapper is + // disabled/unavailable. Never call on win32 (see its JSDoc). resolveUnixGuardTimeout, }; diff --git a/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs b/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs index c240782426..f7d12ca6cf 100755 --- a/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs +++ b/gitnexus/hooks/antigravity/gitnexus-antigravity-hook.cjs @@ -201,24 +201,65 @@ function resolveCliPath() { return cliPath; } +// Debounce for the unguarded-CLI diagnostic below (#2163 follow-up review): +// at most one line per (short-lived) hook process, even if a future change +// runs the CLI more than once. +let unguardedCliWarned = false; + /** * Unix orphan containment (#2163 follow-up): the augment CLI is the * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via - * npx), so on Unix it gets the same SIGKILL-surviving coreutils - * `timeout -k 1` wrapper as the probe's lsof/ps. The wrapper budget is - * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync - * timeout, so on the supervised path Node's SIGTERM always fires first and - * the existing error/status contract is untouched (a SIGTERM-immune child - * can hold the guard ~1s past the inner timeout before `-k` escalates; if - * the runner SIGKILLs the hook instead, that is exactly the orphan case the - * wrapper exists for). Windows is deliberately NOT wrapped — there is no - * coreutils timeout to resolve there and the resolver's self-test spawns - * /bin/sh — so on win32 (the npx.cmd path) and whenever the guard resolves - * to null the argv stays byte-identical to the pre-wrap invocation. + * npx), so on Unix it gets the same SIGKILL-surviving coreutils `timeout` + * wrapper as the probe's lsof/ps. The wrapper budget is ceil(inner/1000)+1 + * seconds — STRICTLY greater than the inner spawnSync timeout, so on the + * supervised path Node's SIGTERM always fires first and the existing + * error/status contract is untouched. Once the hook itself has been + * SIGKILLed (exactly the orphan case the wrapper exists for), the guard + * semantics differ per branch: + * - direct exec (the CLI is the guard's CHILD): `-k 1` TERM-first — a + * SIGTERM-immune CLI can hold the guard ~1s past the inner timeout + * before the `-k` SIGKILL escalation reaps it. + * - npx (the CLI is a GRANDCHILD: guard → npx → CLI): `-s KILL` — the + * budget expiry SIGKILLs the whole process group outright. TERM-first + * would kill only the obedient npx parent, making `timeout` reap it and + * return before the `-k` escalation ever fires, stranding a + * SIGTERM-immune CLI grandchild unbounded (reproduced on coreutils + * 9.x). `-k 1` is retained alongside `-s KILL` as a harmless belt: with + * `-s KILL` the `-k` escalation signal is also KILL. Two residual gaps + * on this branch, both bounded by "no worse than pre-fix" (where the + * grandchild received no signal at all): the group-wide SIGKILL is + * coreutils semantics — a busybox `timeout` passes the self-test (it + * has `-k` and propagates exit status) but signals only its direct + * child, so a busybox guard cannot reach the grandchild; and on the + * SUPERVISED path (hook alive, inner spawnSync timeout SIGTERMs the + * guard) coreutils forwards TERM rather than the `-s` signal, npx dies, + * and the guard exits before any KILL fires — so a SIGTERM-immune CLI + * grandchild still escapes in those two cases. + * If the sibling probe predates the resolveUnixGuardTimeout export (version + * skew), the adapter degrades to the unwrapped invocation instead of + * throwing. Windows is deliberately NOT wrapped — there is no coreutils + * timeout to resolve there and the resolver's self-test spawns /bin/sh — so + * on win32 (the npx.cmd path) and whenever the guard resolves to null (e.g. + * macOS without Homebrew coreutils — reported once under GITNEXUS_DEBUG) + * the argv stays byte-identical to the pre-wrap invocation. */ function runGitNexusCli(cliPath, args, cwd, timeout) { const isWin = process.platform === 'win32'; - const guard = isWin ? null : resolveUnixGuardTimeout(); + // Version-skew guard (#2163 follow-up review): an older sibling probe + // without the resolveUnixGuardTimeout export must degrade to the unwrapped + // invocation — a TypeError here would be swallowed by the caller's catch + // and silently kill the augment. + const guard = + isWin || typeof resolveUnixGuardTimeout !== 'function' ? null : resolveUnixGuardTimeout(); + if (!isWin && !guard && !unguardedCliWarned && isDebugEnabled()) { + // Diagnose the "stays unwrapped" Unix paths once per hook process: no + // usable coreutils timeout/gtimeout (e.g. macOS without Homebrew + // coreutils), GITNEXUS_HOOK_TIMEOUT_PATH=disabled, or probe skew above. + unguardedCliWarned = true; + process.stderr.write( + '[GitNexus hook] no usable timeout/gtimeout guard; augment CLI child runs unguarded\n', + ); + } if (cliPath) { const [cmd, cmdArgs] = guard ? [ @@ -235,11 +276,15 @@ function runGitNexusCli(cliPath, args, cwd, timeout) { }); } // A non-null guard implies non-Windows, so the wrapped arm can hardcode - // plain `npx`. + // plain `npx`. The wrapped arm leads with `-s KILL` (NOT TERM-first like + // the direct branch above): the CLI here is a grandchild behind npx — see + // the docblock. const [cmd, cmdArgs] = guard ? [ guard, [ + '-s', + 'KILL', '-k', '1', String(Math.ceil((timeout + 5000) / 1000) + 1), diff --git a/gitnexus/hooks/claude/gitnexus-hook.cjs b/gitnexus/hooks/claude/gitnexus-hook.cjs index 8cba5f66c5..740fe85593 100755 --- a/gitnexus/hooks/claude/gitnexus-hook.cjs +++ b/gitnexus/hooks/claude/gitnexus-hook.cjs @@ -221,27 +221,68 @@ function resolveCliPath() { return cliPath; } +// Debounce for the unguarded-CLI diagnostic below (#2163 follow-up review): +// at most one line per (short-lived) hook process, even if a future change +// runs the CLI more than once. +let unguardedCliWarned = false; + /** * Spawn a gitnexus CLI command synchronously. * Returns the stderr output (KuzuDB captures stdout at OS level). * * Unix orphan containment (#2163 follow-up): the augment CLI is the * longest-lived hook child (inner spawnSync timeout 7s locally, 12s via - * npx), so on Unix it gets the same SIGKILL-surviving coreutils - * `timeout -k 1` wrapper as the probe's lsof/ps. The wrapper budget is - * ceil(inner/1000)+1 seconds — STRICTLY greater than the inner spawnSync - * timeout, so on the supervised path Node's SIGTERM always fires first and - * the existing error/status contract is untouched (a SIGTERM-immune child - * can hold the guard ~1s past the inner timeout before `-k` escalates; if - * the runner SIGKILLs the hook instead, that is exactly the orphan case the - * wrapper exists for). Windows is deliberately NOT wrapped — there is no - * coreutils timeout to resolve there and the resolver's self-test spawns - * /bin/sh — so on win32 (the npx.cmd path) and whenever the guard resolves - * to null the argv stays byte-identical to the pre-wrap invocation. + * npx), so on Unix it gets the same SIGKILL-surviving coreutils `timeout` + * wrapper as the probe's lsof/ps. The wrapper budget is ceil(inner/1000)+1 + * seconds — STRICTLY greater than the inner spawnSync timeout, so on the + * supervised path Node's SIGTERM always fires first and the existing + * error/status contract is untouched. Once the hook itself has been + * SIGKILLed (exactly the orphan case the wrapper exists for), the guard + * semantics differ per branch: + * - direct exec (the CLI is the guard's CHILD): `-k 1` TERM-first — a + * SIGTERM-immune CLI can hold the guard ~1s past the inner timeout + * before the `-k` SIGKILL escalation reaps it. + * - npx (the CLI is a GRANDCHILD: guard → npx → CLI): `-s KILL` — the + * budget expiry SIGKILLs the whole process group outright. TERM-first + * would kill only the obedient npx parent, making `timeout` reap it and + * return before the `-k` escalation ever fires, stranding a + * SIGTERM-immune CLI grandchild unbounded (reproduced on coreutils + * 9.x). `-k 1` is retained alongside `-s KILL` as a harmless belt: with + * `-s KILL` the `-k` escalation signal is also KILL. Two residual gaps + * on this branch, both bounded by "no worse than pre-fix" (where the + * grandchild received no signal at all): the group-wide SIGKILL is + * coreutils semantics — a busybox `timeout` passes the self-test (it + * has `-k` and propagates exit status) but signals only its direct + * child, so a busybox guard cannot reach the grandchild; and on the + * SUPERVISED path (hook alive, inner spawnSync timeout SIGTERMs the + * guard) coreutils forwards TERM rather than the `-s` signal, npx dies, + * and the guard exits before any KILL fires — so a SIGTERM-immune CLI + * grandchild still escapes in those two cases. + * If the sibling probe predates the resolveUnixGuardTimeout export (version + * skew), the adapter degrades to the unwrapped invocation instead of + * throwing. Windows is deliberately NOT wrapped — there is no coreutils + * timeout to resolve there and the resolver's self-test spawns /bin/sh — so + * on win32 (the npx.cmd path) and whenever the guard resolves to null (e.g. + * macOS without Homebrew coreutils — reported once under GITNEXUS_DEBUG) + * the argv stays byte-identical to the pre-wrap invocation. */ function runGitNexusCli(cliPath, args, cwd, timeout) { const isWin = process.platform === 'win32'; - const guard = isWin ? null : resolveUnixGuardTimeout(); + // Version-skew guard (#2163 follow-up review): an older sibling probe + // without the resolveUnixGuardTimeout export must degrade to the unwrapped + // invocation — a TypeError here would be swallowed by the caller's catch + // and silently kill the augment. + const guard = + isWin || typeof resolveUnixGuardTimeout !== 'function' ? null : resolveUnixGuardTimeout(); + if (!isWin && !guard && !unguardedCliWarned && isDebugEnabled()) { + // Diagnose the "stays unwrapped" Unix paths once per hook process: no + // usable coreutils timeout/gtimeout (e.g. macOS without Homebrew + // coreutils), GITNEXUS_HOOK_TIMEOUT_PATH=disabled, or probe skew above. + unguardedCliWarned = true; + process.stderr.write( + '[GitNexus hook] no usable timeout/gtimeout guard; augment CLI child runs unguarded\n', + ); + } if (cliPath) { const [cmd, cmdArgs] = guard ? [ @@ -258,11 +299,15 @@ function runGitNexusCli(cliPath, args, cwd, timeout) { }); } // On Windows, invoke npx.cmd directly (no shell needed). A non-null guard - // implies non-Windows, so the wrapped arm can hardcode plain `npx`. + // implies non-Windows, so the wrapped arm can hardcode plain `npx`. The + // wrapped arm leads with `-s KILL` (NOT TERM-first like the direct branch + // above): the CLI here is a grandchild behind npx — see the docblock. const [cmd, cmdArgs] = guard ? [ guard, [ + '-s', + 'KILL', '-k', '1', String(Math.ceil((timeout + 5000) / 1000) + 1), diff --git a/gitnexus/hooks/claude/hook-db-lock-probe.cjs b/gitnexus/hooks/claude/hook-db-lock-probe.cjs index 384f46d9e5..de0fa5e854 100644 --- a/gitnexus/hooks/claude/hook-db-lock-probe.cjs +++ b/gitnexus/hooks/claude/hook-db-lock-probe.cjs @@ -20,9 +20,10 @@ * it 1s later — orphan lifetime is bounded at ~3s instead of unbounded. * - GITNEXUS_HOOK_TIMEOUT_PATH: the sentinel value `disabled` switches the * wrapper off deterministically; any other value is adopted only when it - * exists AND passes a one-shot `-k` self-test — otherwise resolution FALLS - * THROUGH to the built-in candidate list (first self-test pass wins), so - * no malformed value of any shape can silently disable orphan containment. + * exists AND passes a one-shot `-k` exit-propagation self-test — otherwise + * resolution FALLS THROUGH to the built-in candidate list (first self-test + * pass wins), so no malformed value of any shape can silently disable + * orphan containment. * - The gitnexus server is lazy-open + sticky-hold: an idle MCP server holds * ZERO lbug fds until the repo's first MCP query, then keeps the fd open. * A probe before that first query is therefore always false — a known, @@ -82,34 +83,45 @@ let unixGuardTimeoutCache; * * GITNEXUS_HOOK_TIMEOUT_PATH semantics: the sentinel `disabled` turns the * wrapper off; any other value is only a CANDIDATE — an existing file path - * is tried first, but it must pass the `-k` self-test to be adopted. On any - * failure (non-existent path, directory, non-executable file, wrapper - * without `-k` support, …) resolution falls through to the built-in - * candidates below, tried in order, first self-test pass wins. This is - * strictly stronger than the sibling GITNEXUS_HOOK_LSOF_PATH / - * GITNEXUS_HOOK_PS_PATH overrides (which only check existence): no bad env - * value of ANY shape can silently disable orphan containment. + * is tried first, but it must pass the `-k` exit-propagation self-test to + * be adopted. On any failure (non-existent path, directory, non-executable + * file, wrapper without `-k` support, always-exit-0 stub, …) resolution + * falls through to the built-in candidates below, tried in order, first + * self-test pass wins. This is strictly stronger than the sibling + * GITNEXUS_HOOK_LSOF_PATH / GITNEXUS_HOOK_PS_PATH overrides (which only + * check existence): no bad env value of ANY shape can silently disable + * orphan containment. * * Lazy self-test: candidates are probed only when the lsof/ps fallback is * first reached, and the result is memoized. A candidate is adopted only - * when `timeout -k 1 1 /bin/sh -c :` exits 0. This rejects wrappers that do - * not support the coreutils `-k` flag — busybox <1.34, toybox, broken - * symlinks — which would otherwise exit with a usage error without ever - * running lsof, silently converting the lsof-ETIMEDOUT fail-closed contract - * into fail-open (#1492 regression). Only when EVERY candidate fails does - * the probe fall back to the unwrapped status quo (memoized null). - * busybox ≥1.34 passes the test and is fully usable (capability, not - * identity, decides). + * when `timeout -k 1 1 /bin/sh -c 'exit 42'` exits 42 — i.e. it must RUN + * the wrapped command AND PROPAGATE its exit status. This rejects two + * failure shapes: wrappers without the coreutils `-k` flag — busybox <1.34, + * toybox, broken symlinks — which would exit with a usage error without + * ever running lsof, silently converting the lsof-ETIMEDOUT fail-closed + * contract into fail-open (#1492 regression); and always-exit-0 stubs + * (/bin/true shapes), which would otherwise be adopted and "succeed" every + * wrapped spawn instantly without running it — a constant no-owner probe + * answer and, worse, a silently dead augment (status 0, empty stderr passes + * the adapters' success check with no context; #2163 follow-up review). + * Only when EVERY candidate fails does the probe fall back to the unwrapped + * status quo (memoized null). busybox ≥1.34 passes the test and is fully + * usable for everything THIS file spawns (lsof/ps are the guard's direct + * children) and for the adapters' direct-exec arm. The adapters' npx arm + * additionally relies on coreutils' process-GROUP signalling for its + * `-s KILL` grandchild reaping; busybox signals only its direct child, and + * this self-test deliberately does not probe that capability — see the + * adapter docblocks for the residual-gap statement. */ function passesGuardSelfTest(guard) { try { - const selfTest = spawnSync(guard, ['-k', '1', '1', '/bin/sh', '-c', ':'], { + const selfTest = spawnSync(guard, ['-k', '1', '1', '/bin/sh', '-c', 'exit 42'], { encoding: 'utf-8', timeout: 3000, stdio: ['ignore', 'ignore', 'ignore'], windowsHide: true, }); - return !selfTest.error && selfTest.status === 0; + return !selfTest.error && selfTest.status === 42; } catch { return false; } @@ -368,7 +380,15 @@ function hasGitNexusDbLockedByGitNexusServer(dbPath, myPid) { module.exports = { hasGitNexusDbLockedByGitNexusServer, // #2163 follow-up: the hook adapters wrap the augment CLI in the same - // guard. Returns a self-tested absolute wrapper path, or null when the - // wrapper is disabled/unavailable. Never call on win32 (see its JSDoc). + // guard. Returns a self-tested wrapper path — the built-in candidates are + // always absolute; a GITNEXUS_HOOK_TIMEOUT_PATH override is adopted as the + // exact string that passed the self-test. Same string is also the same + // RESOLUTION for absolute paths and for slashless names (PATH lookup is + // cwd-independent); a slash-containing RELATIVE override, however, is + // existsSync-checked and self-tested against this process's cwd while the + // adapters spawn the CLI with a `cwd` option (chdir-before-exec), so such + // a value can pass here yet ENOENT at the augment call site — set the + // override to an absolute path. Returns null when the wrapper is + // disabled/unavailable. Never call on win32 (see its JSDoc). resolveUnixGuardTimeout, }; diff --git a/gitnexus/test/unit/hooks.test.ts b/gitnexus/test/unit/hooks.test.ts index 03ec70ee21..9e2f398906 100644 --- a/gitnexus/test/unit/hooks.test.ts +++ b/gitnexus/test/unit/hooks.test.ts @@ -1248,7 +1248,10 @@ describe.skipIf(process.platform !== 'linux')( } expect(alive).toBe(false); } finally { - if (lsofPid > 0) { + // PID-reuse guard (#2169 review): re-run the detection loop's + // /proc//cmdline identity check before the cleanup SIGKILL, so + // a PID already reaped and recycled by the OS is never signalled. + if (lsofPid > 0 && isFakeLsofAlive()) { try { process.kill(lsofPid, 'SIGKILL'); } catch { @@ -1363,7 +1366,10 @@ describe.skipIf(process.platform !== 'linux')( } expect(alive).toBe(false); } finally { - if (lsofPid > 0) { + // PID-reuse guard (#2169 review): re-run the detection loop's + // /proc//cmdline identity check before the cleanup SIGKILL, so + // a PID already reaped and recycled by the OS is never signalled. + if (lsofPid > 0 && isFakeLsofAlive()) { try { process.kill(lsofPid, 'SIGKILL'); } catch { @@ -1403,10 +1409,14 @@ describe.skipIf(process.platform !== 'linux')( // Same incident mechanism as the lsof reaping suite above, one layer up: // the augment CLI is the longest-lived hook child (7s local / 12s npx // inner budgets), so a hook SIGKILLed mid-augment used to strand it with - // nothing left to signal it. The fake CLI here is SIGTERM-immune and - // sleeps 30s; with the wrap in place the guard SIGTERMs it at 8s - // (= ceil(7000/1000)+1) and SIGKILLs it 1s later, so it must be gone - // well inside the 12s poll window. With runGitNexusCli's wrap reverted, + // nothing left to signal it. These tests pin GITNEXUS_HOOK_CLI_PATH, so + // they exercise the DIRECT-EXEC branch only (the CLI is the guard's + // direct child): the fake CLI here is SIGTERM-immune and sleeps 30s; + // with the wrap in place the guard SIGTERMs it at 8s (= ceil(7000/1000) + // +1) and the `-k` escalation SIGKILLs it 1s later, so it must be gone + // well inside the 12s poll window. (The npx branch reaps differently — + // `-s KILL` group-kills at budget because the CLI is a grandchild there; + // see the staged npx suite below.) With runGitNexusCli's wrap reverted, // nothing can reap it and the poll times out → red. The antigravity // adapter shares the identical runGitNexusCli shape and is pinned at // source level (see 'Augment CLI guard wrap (source)'). @@ -1501,7 +1511,11 @@ describe.skipIf(process.platform !== 'linux')( } expect(alive).toBe(false); } finally { - if (cliPid > 0) { + // PID-reuse guard (#2169 review): re-run the detection loop's + // /proc//cmdline identity check before the cleanup SIGKILL, + // so a PID already reaped and recycled by the OS is never + // signalled. + if (cliPid > 0 && isFakeCliAlive()) { try { process.kill(cliPid, 'SIGKILL'); } catch { @@ -1534,6 +1548,189 @@ describe.skipIf(process.platform !== 'linux')( }, ); +// ─── Behavior: npx branch — SIGKILLed hook cannot strand the CLI grandchild ── + +describe.skipIf(process.platform !== 'linux')( + 'Orphaned npx-branch CLI grandchild is reaped by the -s KILL wrapper (#2163 follow-up)', + () => { + // The npx branch has a DEEPER topology than the direct-exec suite above: + // guard → npx → CLI, so the CLI is the guard's GRANDCHILD. Under the + // TERM-first `-k 1` guard the budget's group SIGTERM kills the obedient + // npx parent; `timeout` reaps its direct child and exits IMMEDIATELY, so + // its `-k` SIGKILL never fires — and a SIGTERM-immune CLI grandchild + // survives unbounded (reproduced on coreutils 9.x). The `-s KILL` + // wrapped arm instead SIGKILLs the whole process group at budget + // (13s = ceil((7000+5000)/1000)+1 here), which nothing can ignore. + // Reverting the npx arm to plain `-k 1` TERM-first makes this test red. + // + // Topology notes: the hook is STAGED into a bare temp dir together with + // its sibling helpers (the install-shaped copy, like the antigravity e2e + // suite uses), so resolveCliPath() finds no local dist/ and no + // resolvable gitnexus package; with GITNEXUS_HOOK_CLI_PATH cleared the + // npx fallback branch is the one that runs. A fake `npx` injected on + // PATH then spawns the SIGTERM-immune fake CLI as its own child and + // waits on it, mirroring the real npx process tree. + it('CJS (staged): SIGKILLed hook leaves no immortal CLI grandchild behind npx', async () => { + // Guard-availability precheck — see resolveHostGuardForReapingTests. + expect(resolveHostGuardForReapingTests(), GUARD_PRECHECK_MSG).not.toBeNull(); + const { spawn } = await import('child_process'); + // REQUIRED: a real lbug file routes the probe through the fake lsof + // (empty output → no holder PIDs → probe false) so the augment runs + // through the same probe-then-spawn flow as production. + const lbugPath = path.join(gitNexusDir, 'lbug'); + fs.writeFileSync(lbugPath, ''); + const pidFile = path.join(os.tmpdir(), `gn-hook-npxclipid-${process.pid}`); + fs.rmSync(pidFile, { force: true }); + // Route self-proof (#2169 review): written by the fake npx as its first + // statement, so the test fails loudly if a future resolveCliPath / + // hookEnv change silently re-routes the augment to the direct arm. + const npxMarkerPath = path.join(os.tmpdir(), `gn-hook-npxmarker-${process.pid}`); + fs.rmSync(npxMarkerPath, { force: true }); + const binDir = createHookToolDir({ + gitnexusPidFile: pidFile, + gitnexusSleepMs: 30000, + gitnexusIgnoreSigterm: true, + lsofOutput: '', + psOutput: '', + }); + // Fake npx: spawns the fake CLI as the guard's grandchild and waits on + // it like real npx; npx itself stays SIGTERM-obedient (Node default). + fs.writeFileSync( + path.join(binDir, 'npx'), + `#!/usr/bin/env node\n` + + `require('fs').writeFileSync(${JSON.stringify(npxMarkerPath)}, String(process.pid));\n` + + `const { spawn } = require('child_process');\n` + + `const child = spawn(process.execPath, [${JSON.stringify( + path.join(binDir, 'gitnexus-cli.js'), + )}], { stdio: 'ignore' });\n` + + `child.on('exit', (code) => process.exit(code === null ? 1 : code));\n`, + { mode: 0o755 }, + ); + // Stage the hook + its sibling helpers into a bare dir with no dist/ + // and no reachable node_modules/gitnexus, so resolveCliPath() → ''. + const stagedDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gn-staged-hook-')); + const hookSrcDir = path.dirname(CJS_HOOK); + for (const f of [ + 'gitnexus-hook.cjs', + 'hook-lock.cjs', + 'hook-db-lock-probe.cjs', + 'resolve-analyze-cmd.cjs', + ]) { + fs.copyFileSync(path.join(hookSrcDir, f), path.join(stagedDir, f)); + } + const stagedHook = path.join(stagedDir, 'gitnexus-hook.cjs'); + let cliPid = 0; + let hookChild: ReturnType | null = null; + + const isFakeCliAlive = () => { + try { + process.kill(cliPid, 0); + } catch { + return false; // ESRCH — reaped + } + // PID-reuse guard: only count it alive while the cmdline still + // points at our fake CLI. + try { + return fs.readFileSync(`/proc/${cliPid}/cmdline`, 'utf-8').includes(binDir); + } catch { + return false; + } + }; + + try { + hookChild = spawn(process.execPath, [stagedHook], { + stdio: ['pipe', 'ignore', 'ignore'], + env: { + ...hookEnv(binDir), + // Force the npx fallback: no CLI-path override (empty string + // fails resolveCliPath's trim check), and nothing for the staged + // copy's require.resolve to find via NODE_PATH. + GITNEXUS_HOOK_CLI_PATH: '', + NODE_PATH: '', + // '1', NOT '0' — see the slot-gate test above. + GITNEXUS_HOOK_LINUX_PROC_BUDGET_MS: '1', + // Hermeticity: fall through to the built-in guard candidates. + GITNEXUS_HOOK_TIMEOUT_PATH: '', + }, + }); + hookChild.stdin!.end( + JSON.stringify({ + hook_event_name: 'PreToolUse', + tool_name: 'Grep', + tool_input: { pattern: 'validateUser' }, + cwd: tmpDir, + }), + ); + + // The fake CLI writes its PID as its FIRST statement; poll tightly. + const spawnDeadline = Date.now() + 8000; + while (Date.now() < spawnDeadline) { + try { + const raw = fs.readFileSync(pidFile, 'utf-8').trim(); + if (raw) { + cliPid = Number.parseInt(raw, 10); + break; + } + } catch { + /* not written yet */ + } + await new Promise((r) => setTimeout(r, 10)); + } + expect(cliPid).toBeGreaterThan(0); + // The augment really took the npx fallback arm, not the direct arm. + expect(fs.existsSync(npxMarkerPath)).toBe(true); + + // Kill the hook while the npx → CLI chain is alive (orphan topology). + hookChild.kill('SIGKILL'); + + // The npx call site's wrapper budget is 13s (= ceil((7000+5000)/ + // 1000)+1) from the guard's start; the group SIGKILL lands then. + // Poll past it with margin, far short of the CLI's 30s sleep. + const reapDeadline = Date.now() + 18000; + let alive = isFakeCliAlive(); + while (alive && Date.now() < reapDeadline) { + await new Promise((r) => setTimeout(r, 100)); + alive = isFakeCliAlive(); + } + expect(alive).toBe(false); + } finally { + // PID-reuse guard (#2169 review): re-run the detection loop's + // /proc//cmdline identity check before the cleanup SIGKILL, so + // a PID already reaped and recycled by the OS is never signalled. + if (cliPid > 0 && isFakeCliAlive()) { + try { + process.kill(cliPid, 'SIGKILL'); + } catch { + /* already gone */ + } + } + try { + hookChild?.kill('SIGKILL'); + } catch { + /* ignore */ + } + // The hook claims a slot before probing; it died holding it. + const lockDir = path.join(gitNexusDir, '.hook-locks'); + try { + for (const f of fs.readdirSync(lockDir)) fs.unlinkSync(path.join(lockDir, f)); + } catch { + /* ignore */ + } + try { + fs.rmdirSync(lockDir); + } catch { + /* ignore */ + } + fs.rmSync(lbugPath, { force: true }); + fs.rmSync(pidFile, { force: true }); + fs.rmSync(npxMarkerPath, { force: true }); + fs.rmSync(stagedDir, { recursive: true, force: true }); + fs.rmSync(binDir, { recursive: true, force: true }); + } + }, 45000); + }, +); + // ─── Wrapping equivalence: disabled guard ⇒ pre-wrap augment behavior ── describe.skipIf(process.platform === 'win32')( @@ -1623,8 +1820,13 @@ describe('Augment CLI guard wrap (source, #2163 follow-up)', () => { const fn = source.slice(start, end === -1 ? undefined : end); // Consults the probe's exported resolver (memo shared with the probe), // and never on Windows — the npx.cmd / gitnexus.cmd argv stay exactly - // as before the wrap. - expect(fn).toMatch(/isWin\s*\?\s*null\s*:\s*resolveUnixGuardTimeout\(\)/); + // as before the wrap. The typeof check is the probe version-skew guard + // (#2169 review): an old probe without the resolveUnixGuardTimeout + // export must degrade to the unwrapped argv, not throw a TypeError + // that the caller's catch swallows into a silently dead augment. + expect(fn).toMatch( + /isWin \|\| typeof resolveUnixGuardTimeout !== 'function'\s*\?\s*null\s*:\s*resolveUnixGuardTimeout\(\)/, + ); // Coreutils `-k 1` escalation… expect(fn).toContain("'-k',"); // …with a budget STRICTLY above each branch's inner spawnSync timeout: @@ -1637,6 +1839,32 @@ describe('Augment CLI guard wrap (source, #2163 follow-up)', () => { const directBudgetCount = (fn.match(/Math\.ceil\(timeout \/ 1000\) \+ 1/g) ?? []).length; expect(directBudgetCount).toBe(label === 'Plugin' ? 2 : 1); expect(fn).toMatch(/Math\.ceil\(\(timeout \+ 5000\) \/ 1000\) \+ 1/); + // Argv-order pin (#2169 review): the budget token must appear BEFORE + // the command word — `timeout … ` — or coreutils would + // parse the command word as its DURATION argument. Token presence and + // the counts above alone would let a transposed argv pass. Every + // direct-exec budget must be immediately followed by its command token + // (process.execPath, or the PATH-direct 'gitnexus' on Plugin), and the + // npx budget by 'npx'. + const directOrderCount = ( + fn.match( + /String\(Math\.ceil\(timeout \/ 1000\) \+ 1\),\s*(?:process\.execPath|'gitnexus')/g, + ) ?? [] + ).length; + expect(directOrderCount).toBe(label === 'Plugin' ? 2 : 1); + expect(fn).toMatch(/String\(Math\.ceil\(\(timeout \+ 5000\) \/ 1000\) \+ 1\),\s*'npx'/); + // npx-branch grandchild containment (#2169 review): the npx wrapped + // arm must SIGKILL the process group at budget (`-s KILL`) — a group + // SIGTERM there kills only the obedient npx parent, `timeout` returns + // before its `-k` escalation fires, and a SIGTERM-immune CLI + // grandchild escapes unbounded. + expect(fn).toMatch( + /'-s',\s*'KILL',\s*'-k',\s*'1',\s*String\(Math\.ceil\(\(timeout \+ 5000\)/, + ); + // …and the direct-exec arm(s) must NOT lead with `-s KILL`: TERM-first + // is gentler and sufficient there (the CLI is the guard's direct + // child), so `-s` appears exactly once — in the npx arm. + expect((fn.match(/'-s',/g) ?? []).length).toBe(1); }); } @@ -2204,7 +2432,10 @@ describe.skipIf(process.platform === 'win32')( // that passes the `-k` self-test and then reports coreutils budget // expiry (exit 124) must map to "unresponsive holder" → fail-closed // skip. The fake guard distinguishes the self-test invocation - // (`-k 1 1 /bin/sh -c :`) from a real wrap by the /bin/sh argv token. + // (`-k 1 1 /bin/sh -c 'exit 42'`) from a real wrap by the `exit 42` + // argv token, and PROPAGATES the requested status — the self-test now + // demands exit-status propagation (status 42), not just exit 0 + // (#2169 review). it(`${label}: guard exit 124 (budget expiry) → fail-closed skip`, () => { const markerPath = path.join(os.tmpdir(), `gn-hook-guard124-${process.pid}-${label}`); const lbugPath = path.join(gitNexusDir, 'lbug'); @@ -2219,7 +2450,7 @@ describe.skipIf(process.platform === 'win32')( const fakeGuard = path.join(binDir, 'guard-exit-124'); fs.writeFileSync( fakeGuard, - `#!/usr/bin/env node\nif (process.argv.includes('/bin/sh')) process.exit(0);\nprocess.exit(124);\n`, + `#!/usr/bin/env node\nif (process.argv.includes('exit 42')) process.exit(42);\nprocess.exit(124);\n`, { mode: 0o755 }, ); try { @@ -2275,7 +2506,7 @@ describe.skipIf(process.platform === 'win32')( const fakeGuard = path.join(binDir, 'guard-sigkill'); fs.writeFileSync( fakeGuard, - `#!/usr/bin/env node\nif (process.argv.includes('/bin/sh')) process.exit(0);\nprocess.kill(process.pid, 'SIGKILL');\n`, + `#!/usr/bin/env node\nif (process.argv.includes('exit 42')) process.exit(42);\nprocess.kill(process.pid, 'SIGKILL');\n`, { mode: 0o755 }, ); try { @@ -2309,6 +2540,62 @@ describe.skipIf(process.platform === 'win32')( } }); + // F5-4 (#2169 review; F5-3 was taken by the #2165 slot-gate test + // above): an always-exit-0 stub (/bin/true shape) at + // GITNEXUS_HOOK_TIMEOUT_PATH must be REJECTED by the self-test. The + // old self-test only demanded exit 0, which such a stub satisfies + // without ever RUNNING the wrapped command — once adopted it instantly + // "succeeds" every wrapped spawn with empty output, turning the probe + // into a constant no-owner answer and, worse, the augment into a + // silent no-op (status 0 + empty stderr passes the success check with + // no context, so the feature dies without a trace). The propagation + // self-test (`sh -c 'exit 42'` must yield 42) rejects the stub; + // resolution falls through to the built-in candidates (or, with none + // usable, to the unwrapped status quo) and the augment runs for real. + it(`${label}: always-exit-0 stub guard is rejected → augment still runs and emits context`, () => { + const markerPath = path.join(os.tmpdir(), `gn-hook-stubguard-${process.pid}-${label}`); + const lbugPath = path.join(gitNexusDir, 'lbug'); + fs.writeFileSync(lbugPath, ''); + fs.rmSync(markerPath, { force: true }); + const binDir = createHookToolDir({ + gitnexusMarkerPath: markerPath, + gitnexusStderr: '[GitNexus] 1 related symbol found:\n\nvalidateUser (src/auth.ts)\n', + lsofOutput: '', + psOutput: '', + }); + // Models /bin/true: exits 0 for ANY argv without running anything. + const stubGuard = path.join(binDir, 'true-stub'); + fs.writeFileSync(stubGuard, `#!/usr/bin/env node\nprocess.exit(0);\n`, { mode: 0o755 }); + try { + const result = runHook( + hookPath, + { + hook_event_name: 'PreToolUse', + tool_name: 'Grep', + tool_input: { pattern: 'validateUser' }, + cwd: tmpDir, + }, + undefined, + { + env: { + ...hookEnv(binDir), + GITNEXUS_HOOK_TIMEOUT_PATH: stubGuard, + // '1', NOT '0' — see the slot-gate test above. + GITNEXUS_HOOK_LINUX_PROC_BUDGET_MS: '1', + }, + }, + ); + const output = parseHookOutput(result.stdout); + expect(output).not.toBeNull(); + expect(output!.additionalContext).toContain('[GitNexus] 1 related symbol found'); + expect(fs.existsSync(markerPath)).toBe(true); + } finally { + fs.rmSync(lbugPath, { force: true }); + fs.rmSync(markerPath, { force: true }); + fs.rmSync(binDir, { recursive: true, force: true }); + } + }); + it(`${label}: non-GitNexus ps line → augment runs`, () => { const markerPath = path.join(os.tmpdir(), `gn-hook-other-${process.pid}-${label}`); const lbugPath = path.join(gitNexusDir, 'lbug');