diff --git a/packages/sdk/server/bare/registry/registry-client.ts b/packages/sdk/server/bare/registry/registry-client.ts index 2edbac1e0c..f9c50a6e91 100644 --- a/packages/sdk/server/bare/registry/registry-client.ts +++ b/packages/sdk/server/bare/registry/registry-client.ts @@ -11,8 +11,22 @@ import { DEFAULT_REGISTRY_CORE_KEY } from "@/constants"; const logger = getServerLogger(); -const MAX_RETRIES = 3; -const BASE_DELAY_MS = 500; +// fd-lock retry budget for the registry corestore. +// +// We pass `corestoreOpts: { wait: false }` (tryLock semantics) and provide a +// JS-bounded retry budget here instead of letting Hypercore's underlying +// fd-lock issue a blocking flock(LOCK_EX) on a libuv worker thread. The +// blocking variant cannot be cancelled from JS, so if another SDK process +// holds the lock indefinitely it leaves a pending native handle that +// prevents process.exit() from terminating the worker — see QVAC-18197. +// +// With tryLock + bounded retries we get the same "tolerate transient locks +// during another SDK's startup/shutdown" property #1480 wanted, but every +// retry step is a fresh non-blocking syscall that surfaces failure to JS, +// so shutdown always remains cancellable. +const MAX_RETRIES = 8; +const BASE_DELAY_MS = 250; +const MAX_TOTAL_WAIT_MS = 10_000; let registryClient: QVACRegistryClient | null = null; let inflightInit: Promise | null = null; @@ -30,14 +44,13 @@ async function delay(ms: number): Promise { async function initRegistryClient(): Promise { let lastError: unknown; + const deadline = Date.now() + MAX_TOTAL_WAIT_MS; for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { try { const client = new QVACRegistryClient({ registryCoreKey: DEFAULT_REGISTRY_CORE_KEY, - storage: getCacheDir( - `registry-corestore/${DEFAULT_REGISTRY_CORE_KEY}`, - ), - corestoreOpts: { wait: true }, + storage: getCacheDir(`registry-corestore/${DEFAULT_REGISTRY_CORE_KEY}`), + corestoreOpts: { wait: false }, }); await client.ready(); @@ -61,7 +74,17 @@ async function initRegistryClient(): Promise { } catch (error) { lastError = error; if (isFdLockError(error) && attempt < MAX_RETRIES) { - const backoff = BASE_DELAY_MS * Math.pow(2, attempt - 1); + const remaining = deadline - Date.now(); + if (remaining <= 0) { + logger.warn( + `Registry client fd-lock failed after ${MAX_TOTAL_WAIT_MS}ms wait budget exhausted (attempt ${attempt}/${MAX_RETRIES})`, + ); + throw error; + } + const backoff = Math.min( + BASE_DELAY_MS * Math.pow(2, attempt - 1), + remaining, + ); logger.warn( `Registry client fd-lock failed (attempt ${attempt}/${MAX_RETRIES}), retrying in ${backoff}ms...`, ); diff --git a/packages/sdk/server/worker-core.ts b/packages/sdk/server/worker-core.ts index d755c70b6c..c936ee5b13 100644 --- a/packages/sdk/server/worker-core.ts +++ b/packages/sdk/server/worker-core.ts @@ -34,6 +34,33 @@ let isShuttingDown = false; const logger = getServerLogger(); +// Defense-in-depth grace period for the SIGKILL safety net armed before +// process.exit() in shutdownBareDirectWorker. If process.exit cannot +// terminate the worker within this window — typically because some path +// holds a non-cancellable native handle (e.g. a libuv worker thread +// blocked on flock; see QVAC-18197) — we force-kill the OS process to +// guarantee bounded shutdown time. +const FORCE_EXIT_GRACE_MS = 3_000; + +function scheduleForceExit(): void { + const timer: unknown = setTimeout(() => { + logger.error( + `process.exit did not terminate the worker within ${FORCE_EXIT_GRACE_MS}ms — ` + + `force-killing self (likely blocked native handle)`, + ); + try { + process.kill(process.pid, "SIGKILL"); + } catch { + // best-effort — if SIGKILL itself fails, there's nothing more to do + } + }, FORCE_EXIT_GRACE_MS); + // Don't let the safety-net timer keep the process alive on the happy + // path. Bare returns an object (not a number) from setTimeout. + if (timer && typeof timer === "object" && "unref" in timer) { + (timer as { unref: () => void }).unref(); + } +} + export function initializeWorkerCore(): { hasRPCConfig: boolean } { if (coreInitialized) { const validatedEnv = getValidatedEnv(); @@ -184,6 +211,8 @@ export async function shutdownBareDirectWorker( releaseWorkerLock(); + scheduleForceExit(); + const isGraceful = reason === "signal" || reason === "rpc-close"; process.exit(isGraceful ? 0 : 1); }