Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions packages/sdk/server/bare/registry/registry-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,22 @@ import { DEFAULT_REGISTRY_CORE_KEY } from "@/constants";

const logger = getServerLogger();

const MAX_RETRIES = 3;
const BASE_DELAY_MS = 500;
// fd-lock retry budget for the registry corestore.
//
// We pass `corestoreOpts: { wait: false }` (tryLock semantics) and provide a
// JS-bounded retry budget here instead of letting Hypercore's underlying
// fd-lock issue a blocking flock(LOCK_EX) on a libuv worker thread. The
// blocking variant cannot be cancelled from JS, so if another SDK process
// holds the lock indefinitely it leaves a pending native handle that
// prevents process.exit() from terminating the worker — see QVAC-18197.
//
// With tryLock + bounded retries we get the same "tolerate transient locks
// during another SDK's startup/shutdown" property #1480 wanted, but every
// retry step is a fresh non-blocking syscall that surfaces failure to JS,
// so shutdown always remains cancellable.
const MAX_RETRIES = 8;
const BASE_DELAY_MS = 250;
const MAX_TOTAL_WAIT_MS = 10_000;

let registryClient: QVACRegistryClient | null = null;
let inflightInit: Promise<QVACRegistryClient> | null = null;
Expand All @@ -30,14 +44,13 @@ async function delay(ms: number): Promise<void> {

async function initRegistryClient(): Promise<QVACRegistryClient> {
let lastError: unknown;
const deadline = Date.now() + MAX_TOTAL_WAIT_MS;
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
try {
const client = new QVACRegistryClient({
registryCoreKey: DEFAULT_REGISTRY_CORE_KEY,
storage: getCacheDir(
`registry-corestore/${DEFAULT_REGISTRY_CORE_KEY}`,
),
corestoreOpts: { wait: true },
storage: getCacheDir(`registry-corestore/${DEFAULT_REGISTRY_CORE_KEY}`),
corestoreOpts: { wait: false },
});

await client.ready();
Expand All @@ -61,7 +74,17 @@ async function initRegistryClient(): Promise<QVACRegistryClient> {
} catch (error) {
lastError = error;
if (isFdLockError(error) && attempt < MAX_RETRIES) {
const backoff = BASE_DELAY_MS * Math.pow(2, attempt - 1);
const remaining = deadline - Date.now();
if (remaining <= 0) {
logger.warn(
`Registry client fd-lock failed after ${MAX_TOTAL_WAIT_MS}ms wait budget exhausted (attempt ${attempt}/${MAX_RETRIES})`,
);
throw error;
}
const backoff = Math.min(
BASE_DELAY_MS * Math.pow(2, attempt - 1),
remaining,
);
logger.warn(
`Registry client fd-lock failed (attempt ${attempt}/${MAX_RETRIES}), retrying in ${backoff}ms...`,
);
Expand Down
29 changes: 29 additions & 0 deletions packages/sdk/server/worker-core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,33 @@ let isShuttingDown = false;

const logger = getServerLogger();

// Defense-in-depth grace period for the SIGKILL safety net armed before
// process.exit() in shutdownBareDirectWorker. If process.exit cannot
// terminate the worker within this window — typically because some path
// holds a non-cancellable native handle (e.g. a libuv worker thread
// blocked on flock; see QVAC-18197) — we force-kill the OS process to
// guarantee bounded shutdown time.
const FORCE_EXIT_GRACE_MS = 3_000;

/**
 * Arms a last-resort timer that sends SIGKILL to the current process if it
 * is still running FORCE_EXIT_GRACE_MS milliseconds after this call.
 *
 * Intended to be called immediately before process.exit(): if the exit
 * completes normally the timer never fires; if the exit stalls, the callback
 * logs an error and force-kills the process so shutdown time stays bounded.
 *
 * The timer is unref'd when the runtime's timer handle supports it, so the
 * safety net by itself never keeps the process alive.
 */
function scheduleForceExit(): void {
  const timer: unknown = setTimeout(() => {
    logger.error(
      `process.exit did not terminate the worker within ${FORCE_EXIT_GRACE_MS}ms — ` +
        `force-killing self (likely blocked native handle)`,
    );
    try {
      process.kill(process.pid, "SIGKILL");
    } catch {
      // best-effort — if SIGKILL itself fails, there's nothing more to do
    }
  }, FORCE_EXIT_GRACE_MS);

  // Don't let the safety-net timer keep the process alive on the happy
  // path. Bare returns an object (not a number) from setTimeout, so the
  // handle is probed at runtime instead of relying on a static type.
  if (typeof timer !== "object" || timer === null) {
    return;
  }
  const { unref } = timer as { unref?: unknown };
  // Only call unref when it is actually callable; a handle exposing a
  // non-function `unref` must not make the shutdown path throw.
  if (typeof unref === "function") {
    unref.call(timer);
  }
}

export function initializeWorkerCore(): { hasRPCConfig: boolean } {
if (coreInitialized) {
const validatedEnv = getValidatedEnv();
Expand Down Expand Up @@ -184,6 +211,8 @@ export async function shutdownBareDirectWorker(

releaseWorkerLock();

scheduleForceExit();

const isGraceful = reason === "signal" || reason === "rpc-close";
process.exit(isGraceful ? 0 : 1);
}
Expand Down
Loading