diff --git a/apps/desktop/plans/done/20260510-host-service-recovery.md b/apps/desktop/plans/done/20260510-host-service-recovery.md new file mode 100644 index 00000000000..e18816eefb7 --- /dev/null +++ b/apps/desktop/plans/done/20260510-host-service-recovery.md @@ -0,0 +1,19 @@ +# Host-service recovery (#4299) — shipped + +**Issue:** [superset-sh/superset#4299](https://github.com/superset-sh/superset/issues/4299) — after Cmd+R the v2 right pane goes blank because the renderer keeps getting handed a dead host-service port. + +**Root cause:** `tryAdopt` only checked `isProcessAlive(pid)` + app-version. A live-but-not-serving host-service (hung on migrations, deadlocked, port no longer bound) got adopted as `running`, and `getConnection` returned its dead port forever — an absorbing state nothing climbed out of. + +## What shipped (PR #4395) + +- **Adopt health-check** — `tryAdopt` now `pollHealthCheck`s the manifest endpoint (2s cap) before registering an adopted instance; on failure it SIGKILLs the stale pid, removes the manifest, and falls through to a clean `spawn`. This is the fix. +- **`coordinator.reset(orgId)`** + `hostServiceCoordinator.reset` tRPC mutation — force-kill (SIGKILL on whatever pid the manifest names, even if untracked) + remove manifest + respawn. No UI caller yet; intended for a support escape hatch / future Settings button. +- **Tray "Restart" enabled in `stopped`** — was gated on `isRunning`, i.e. disabled exactly when restart helps; now disabled only while a start is in flight. +- **Coordinator logs through `electron-log`** — adoption health-check failures now land in `main.log` (were bare `console.log`, invisible in packaged builds). `log.warn` on non-ESRCH SIGKILL failures. + +## Considered, not shipped + +- **Full-screen "host stopped" recovery screen** in the v2-workspace layout — dropped. [#4430](https://github.com/superset-sh/superset/pull/4430) removed the analogous remote `WorkspaceHostOfflineState` ("render optimistically; downstream queries surface their own errors"); a local equivalent would swim against that. A non-blocking banner could be a future PR. +- **Renderer retry-with-backoff** in `LocalHostServiceProvider` — built, then dropped: heavier than the bug needs and invisible without the recovery screen. +- **`reset({ wipeHostDb })`** (archive `host.db` → `host.db.broken-`) + a Settings "Reset and clear local data" button — deferred until there's a caller. +- **The white-screen-before-Cmd+R variant** — tracked separately at [#4396](https://github.com/superset-sh/superset/issues/4396): `getHostId()` shells out to `ioreg` via `execFileSync` with no timeout, blocking the main event loop when subprocess spawning is sandboxed. diff --git a/apps/desktop/src/lib/trpc/routers/host-service-coordinator/index.ts b/apps/desktop/src/lib/trpc/routers/host-service-coordinator/index.ts index 5683a865f4f..d7d3aaa16fd 100644 --- a/apps/desktop/src/lib/trpc/routers/host-service-coordinator/index.ts +++ b/apps/desktop/src/lib/trpc/routers/host-service-coordinator/index.ts @@ -46,6 +46,18 @@ export const createHostServiceCoordinatorRouter = () => { }); }), + reset: publicProcedure.input(orgInput).mutation(async ({ input }) => { + const coordinator = getHostServiceCoordinator(); + const { token } = await loadToken(); + if (!token) { + throw new Error("No auth token available — user must be logged in"); + } + return coordinator.reset(input.organizationId, { + authToken: token, + cloudApiUrl: env.NEXT_PUBLIC_API_URL, + }); + }), + onStatusChange: publicProcedure.subscription(() => { return observable((emit) => { const coordinator = getHostServiceCoordinator(); diff --git a/apps/desktop/src/main/lib/host-service-coordinator.test.ts b/apps/desktop/src/main/lib/host-service-coordinator.test.ts new file mode 100644 index 00000000000..5ed168f1d1e --- /dev/null +++ b/apps/desktop/src/main/lib/host-service-coordinator.test.ts @@ -0,0 +1,292 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import path from "node:path"; + +const APP_VERSION = "1.2.3"; + +const manifestStore: { + current: { + pid: number; + endpoint: string; + authToken: string; + startedAt: number; + organizationId: string; + spawnedByAppVersion: string; + } | null; +} = { current: null }; + +// Per-test temp dir backing the mocked `manifestDir`. A real path (not a +// fixed string) so tests stay isolated; assigned in beforeEach, removed in +// afterEach. +let testManifestRoot = ""; + +const readManifestMock = mock(() => manifestStore.current); +const removeManifestMock = mock(() => { + manifestStore.current = null; +}); +const isProcessAliveMock = mock(() => true); + +mock.module("./host-service-manifest", () => ({ + readManifest: readManifestMock, + removeManifest: removeManifestMock, + isProcessAlive: isProcessAliveMock, + listManifests: mock(() => []), + manifestDir: (orgId: string) => path.join(testManifestRoot, orgId), +})); + +const pollHealthCheckMock = mock(() => Promise.resolve(true)); + +mock.module("./host-service-utils", () => ({ + HEALTH_POLL_TIMEOUT_MS: 10_000, + MAX_HOST_LOG_BYTES: 1024, + findFreePort: mock(() => Promise.resolve(40000)), + openRotatingLogFd: mock(() => -1), + pollHealthCheck: pollHealthCheckMock, +})); + +mock.module("electron", () => ({ + app: { + getVersion: () => APP_VERSION, + isPackaged: false, + getAppPath: () => "/tmp/app", + }, +})); + +mock.module("electron-log/main", () => ({ + default: { + info: () => {}, + warn: () => {}, + error: () => {}, + }, +})); + +mock.module("@superset/local-db", () => ({ settings: {} })); +mock.module("@superset/shared/host-info", () => ({ + getHostId: () => "host-1", + getHostName: () => "host", +})); +mock.module("main/env.main", () => ({ + env: { NEXT_PUBLIC_API_URL: "", RELAY_URL: "" }, +})); +mock.module("shared/env.shared", () => ({ + env: { DESKTOP_VITE_PORT: 3000, DESKTOP_NOTIFICATIONS_PORT: 4000 }, +})); +mock.module("./app-environment", () => ({ + SUPERSET_HOME_DIR: "/tmp/superset", +})); +mock.module("./local-db", () => ({ + localDb: { + select: () => ({ from: () => ({ get: () => null }) }), + }, +})); +mock.module("./terminal/env", () => ({ HOOK_PROTOCOL_VERSION: "1" })); +mock.module("../../lib/trpc/routers/workspaces/utils/shell-env", () => ({ + getProcessEnvWithShellPath: async (e: Record) => e, +})); + +const { HostServiceCoordinator } = await import("./host-service-coordinator"); + +const baseManifest = (pid: number, endpoint = "http://127.0.0.1:55555") => ({ + pid, + endpoint, + authToken: "manifest-secret", + startedAt: 0, + organizationId: "org-1", + spawnedByAppVersion: APP_VERSION, +}); + +const spawnConfig = { authToken: "token", cloudApiUrl: "https://api.example" }; + +describe("HostServiceCoordinator.tryAdopt — adoption health check", () => { + let coordinator: InstanceType; + let killedPids: Array<{ pid: number; signal: NodeJS.Signals | number }>; + let originalKill: typeof process.kill; + let spawnMock: ReturnType; + + beforeEach(() => { + manifestStore.current = null; + readManifestMock.mockClear(); + removeManifestMock.mockClear(); + isProcessAliveMock.mockClear(); + pollHealthCheckMock.mockClear(); + + testManifestRoot = fs.mkdtempSync(path.join(os.tmpdir(), "hsc-test-")); + + killedPids = []; + originalKill = process.kill; + // `process.kill` is read-only in some Bun versions — assign via cast. + (process as unknown as { kill: typeof process.kill }).kill = (( + pid: number, + signal?: NodeJS.Signals | number, + ) => { + killedPids.push({ pid, signal: signal ?? "SIGTERM" }); + return true; + }) as typeof process.kill; + + coordinator = new HostServiceCoordinator(); + // Replace spawn so a failed adoption doesn't actually launch electron. + spawnMock = mock(async () => ({ + port: 60000, + secret: "fresh-secret", + machineId: "host-1", + })); + (coordinator as unknown as { spawn: typeof spawnMock }).spawn = spawnMock; + }); + + afterEach(() => { + // Unconditional — if an assertion throws mid-test, the override must + // still be torn down or the next test captures the wrong `originalKill`. + (process as unknown as { kill: typeof process.kill }).kill = originalKill; + if (testManifestRoot) { + fs.rmSync(testManifestRoot, { recursive: true, force: true }); + testManifestRoot = ""; + } + }); + + test("adopts when manifest is healthy", async () => { + manifestStore.current = baseManifest(1234); + pollHealthCheckMock.mockImplementationOnce(() => Promise.resolve(true)); + + const conn = await coordinator.start("org-1", spawnConfig); + + expect(conn.port).toBe(55555); + expect(conn.secret).toBe("manifest-secret"); + expect(pollHealthCheckMock).toHaveBeenCalledTimes(1); + expect(spawnMock).not.toHaveBeenCalled(); + expect(removeManifestMock).not.toHaveBeenCalled(); + expect(coordinator.getProcessStatus("org-1")).toBe("running"); + }); + + test("kills the adopted pid with SIGKILL and falls through to spawn when health check fails", async () => { + manifestStore.current = baseManifest(4321); + pollHealthCheckMock.mockImplementationOnce(() => Promise.resolve(false)); + + const conn = await coordinator.start("org-1", spawnConfig); + + expect(pollHealthCheckMock).toHaveBeenCalledTimes(1); + expect(killedPids).toContainEqual({ pid: 4321, signal: "SIGKILL" }); + expect(removeManifestMock).toHaveBeenCalledTimes(1); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + expect(conn.secret).toBe("fresh-secret"); + }); + + test("swallows SIGKILL ESRCH (pid already gone) and still respawns", async () => { + manifestStore.current = baseManifest(7777); + pollHealthCheckMock.mockImplementationOnce(() => Promise.resolve(false)); + (process as unknown as { kill: typeof process.kill }).kill = (() => { + const err: NodeJS.ErrnoException = new Error("kill ESRCH"); + err.code = "ESRCH"; + throw err; + }) as typeof process.kill; + + const conn = await coordinator.start("org-1", spawnConfig); + + expect(removeManifestMock).toHaveBeenCalledTimes(1); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + }); + + test("kills with SIGTERM (existing behavior) on app-version mismatch, before health check", async () => { + manifestStore.current = { + ...baseManifest(5555), + spawnedByAppVersion: "0.9.0", + }; + + const conn = await coordinator.start("org-1", spawnConfig); + + // App-version gate runs before the new health check. + expect(pollHealthCheckMock).not.toHaveBeenCalled(); + expect(killedPids).toContainEqual({ pid: 5555, signal: "SIGTERM" }); + expect(removeManifestMock).toHaveBeenCalledTimes(1); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + }); +}); + +describe("HostServiceCoordinator.reset", () => { + let coordinator: InstanceType; + let killedPids: Array<{ pid: number; signal: NodeJS.Signals | number }>; + let originalKill: typeof process.kill; + let spawnMock: ReturnType; + + beforeEach(() => { + manifestStore.current = null; + readManifestMock.mockClear(); + removeManifestMock.mockClear(); + isProcessAliveMock.mockClear(); + pollHealthCheckMock.mockClear(); + + testManifestRoot = fs.mkdtempSync(path.join(os.tmpdir(), "hsc-test-")); + + killedPids = []; + originalKill = process.kill; + (process as unknown as { kill: typeof process.kill }).kill = (( + pid: number, + signal?: NodeJS.Signals | number, + ) => { + killedPids.push({ pid, signal: signal ?? "SIGTERM" }); + return true; + }) as typeof process.kill; + + coordinator = new HostServiceCoordinator(); + spawnMock = mock(async () => ({ + port: 60000, + secret: "fresh-secret", + machineId: "host-1", + })); + (coordinator as unknown as { spawn: typeof spawnMock }).spawn = spawnMock; + }); + + afterEach(() => { + (process as unknown as { kill: typeof process.kill }).kill = originalKill; + if (testManifestRoot) { + fs.rmSync(testManifestRoot, { recursive: true, force: true }); + testManifestRoot = ""; + } + }); + + test("removes manifest, SIGKILLs live pid, then spawns fresh", async () => { + manifestStore.current = baseManifest(8888); + + const conn = await coordinator.reset("org-1", spawnConfig); + + expect(killedPids).toContainEqual({ pid: 8888, signal: "SIGKILL" }); + expect(removeManifestMock).toHaveBeenCalledTimes(1); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + expect(conn.secret).toBe("fresh-secret"); + }); + + test("SIGKILLs the manifest pid even when an instance is tracked (stop's SIGTERM may not be enough)", async () => { + // First adopt a healthy instance so it's tracked in `this.instances`. + manifestStore.current = baseManifest(2468); + pollHealthCheckMock.mockImplementationOnce(() => Promise.resolve(true)); + await coordinator.start("org-1", spawnConfig); + expect(coordinator.getProcessStatus("org-1")).toBe("running"); + killedPids.length = 0; + + // Adoption leaves the manifest in place; reset must read its pid before + // stop() removes it, then escalate SIGTERM → SIGKILL on a wedged process. + const conn = await coordinator.reset("org-1", spawnConfig); + + expect(killedPids).toContainEqual({ pid: 2468, signal: "SIGTERM" }); + expect(killedPids).toContainEqual({ pid: 2468, signal: "SIGKILL" }); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + }); + + test("is safe when no manifest exists — no kill, still spawns", async () => { + manifestStore.current = null; + + const conn = await coordinator.reset("org-1", spawnConfig); + + expect(killedPids).toHaveLength(0); + // `removeManifest` is called unconditionally — that's fine, the impl + // in host-service-manifest treats a missing file as a no-op. + expect(removeManifestMock).toHaveBeenCalledTimes(1); + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.port).toBe(60000); + }); +}); diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index e32c75759f5..91eeae74ddc 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -6,6 +6,7 @@ import path from "node:path"; import { settings } from "@superset/local-db"; import { getHostId, getHostName } from "@superset/shared/host-info"; import { app } from "electron"; +import log from "electron-log/main"; import { env } from "main/env.main"; import { env as sharedEnv } from "shared/env.shared"; import { getProcessEnvWithShellPath } from "../../lib/trpc/routers/workspaces/utils/shell-env"; @@ -56,6 +57,13 @@ interface HostServiceProcess { const ADOPTED_LIVENESS_INTERVAL = 5_000; +/** + * Cap how long an adoption health check can take before we decide the adopted + * process is dead and respawn. Short enough that a Cmd+R into a wedged + * host-service heals quickly; long enough to ride out brief startup blips. + */ +const ADOPT_HEALTH_CHECK_TIMEOUT_MS = 2_000; + export class HostServiceCoordinator extends EventEmitter { private instances = new Map(); private pendingStarts = new Map>(); @@ -152,6 +160,41 @@ export class HostServiceCoordinator extends EventEmitter { return this.start(organizationId, config); } + /** + * Forcefully reset host-service state for an org. Unlike `restart`, this + * SIGKILLs whatever pid the manifest names — even when no instance is + * tracked in this process (e.g. a manifest left by a previous app session) + * — then removes the manifest so adoption can't pick up the stale entry, + * and respawns. Used by the recovery path for superset-sh/superset#4299 + * where a live-but-wedged host-service keeps getting re-adopted. + */ + async reset( + organizationId: string, + config: SpawnConfig, + ): Promise { + // Capture the manifest pid *before* stop() — stop() removes the manifest + // for tracked instances and only sends SIGTERM, which a wedged process + // can ignore. We escalate to SIGKILL on whatever pid the manifest named. + const manifestPid = readManifest(organizationId)?.pid; + + this.stop(organizationId); + + if (manifestPid != null && isProcessAlive(manifestPid)) { + try { + process.kill(manifestPid, "SIGKILL"); + } catch (error) { + log.warn( + `[host-service:${organizationId}] reset: SIGKILL of pid=${manifestPid} failed`, + error, + ); + } + } + + removeManifest(organizationId); + + return this.start(organizationId, config); + } + getConnection(organizationId: string): Connection | null { const instance = this.instances.get(organizationId); if (!instance || instance.status !== "running") return null; @@ -238,19 +281,19 @@ export class HostServiceCoordinator extends EventEmitter { try { const ready = await waitForStableBundle(); if (!ready) { - console.warn( + log.warn( "[host-service] bundle did not stabilize, skipping reload", ); return; } const config = await configProvider(); if (!config) return; - console.log( + log.info( "[host-service] bundle changed, restarting running instances", ); await this.restartAll(config); } catch (error) { - console.error("[host-service] dev reload failed:", error); + log.error("[host-service] dev reload failed:", error); } finally { reloading = false; } @@ -264,7 +307,7 @@ export class HostServiceCoordinator extends EventEmitter { trigger(); }); } catch (error) { - console.error("[host-service] failed to enable dev reload:", error); + log.error("[host-service] failed to enable dev reload:", error); return () => {}; } @@ -289,7 +332,7 @@ export class HostServiceCoordinator extends EventEmitter { const reason = manifest.spawnedByAppVersion ? `spawned by app ${manifest.spawnedByAppVersion} != current ${currentAppVersion}` : "no recorded app version (pre-upgrade manifest)"; - console.log( + log.info( `[host-service:${organizationId}] Adopted service ${reason}, killing`, ); try { @@ -299,6 +342,34 @@ export class HostServiceCoordinator extends EventEmitter { return null; } + // A live pid is not the same as a serving host-service — the process can + // be hung on migrations, deadlocked, or no longer bound to the recorded + // port. Without this check the renderer's `getConnection` keeps handing + // out a dead port forever, which is the failure mode in superset-sh/superset#4299. + const healthy = await pollHealthCheck( + manifest.endpoint, + manifest.authToken, + ADOPT_HEALTH_CHECK_TIMEOUT_MS, + ); + if (!healthy) { + log.info( + `[host-service:${organizationId}] Adopted pid=${manifest.pid} did not respond on ${manifest.endpoint}, killing and respawning`, + ); + try { + process.kill(manifest.pid, "SIGKILL"); + } catch (error) { + // ESRCH (already gone) is fine; anything else (EPERM) we want to see. + if ((error as NodeJS.ErrnoException)?.code !== "ESRCH") { + log.warn( + `[host-service:${organizationId}] SIGKILL of stale pid=${manifest.pid} failed`, + error, + ); + } + } + removeManifest(organizationId); + return null; + } + this.instances.set(organizationId, { pid: manifest.pid, port, @@ -307,7 +378,7 @@ export class HostServiceCoordinator extends EventEmitter { }); this.startAdoptedLivenessCheck(organizationId, manifest.pid); - console.log( + log.info( `[host-service:${organizationId}] Adopted pid=${manifest.pid} port=${port}`, ); this.emitStatus(organizationId, "running", null); @@ -408,7 +479,7 @@ export class HostServiceCoordinator extends EventEmitter { instance.pid = childPid; child.on("exit", (code) => { - console.log(`[host-service:${organizationId}] exited with code ${code}`); + log.info(`[host-service:${organizationId}] exited with code ${code}`); const current = this.instances.get(organizationId); if (!current || current.pid !== childPid || current.status === "stopped") return; @@ -431,7 +502,7 @@ export class HostServiceCoordinator extends EventEmitter { instance.status = "running"; - console.log(`[host-service:${organizationId}] listening on port ${port}`); + log.info(`[host-service:${organizationId}] listening on port ${port}`); this.emitStatus(organizationId, "running", "starting"); return { port, secret, machineId: this.machineId }; } @@ -490,7 +561,7 @@ export class HostServiceCoordinator extends EventEmitter { this.adoptedLivenessTimers.delete(organizationId); const instance = this.instances.get(organizationId); if (instance && instance.status !== "stopped") { - console.log( + log.info( `[host-service:${organizationId}] Adopted process ${pid} died`, ); this.instances.delete(organizationId); diff --git a/apps/desktop/src/main/lib/tray/index.ts b/apps/desktop/src/main/lib/tray/index.ts index 81948961fb1..4717a1cbd42 100644 --- a/apps/desktop/src/main/lib/tray/index.ts +++ b/apps/desktop/src/main/lib/tray/index.ts @@ -169,8 +169,11 @@ function buildHostServiceSubmenu( enabled: false, }); menuItems.push({ + // Enabled in "stopped" too — that's the state where users most need + // restart to work (host-service crashed or never came up). Disabled + // only while a start is in flight, to avoid racing the pending start. label: " Restart", - enabled: isRunning, + enabled: status !== "starting", click: () => { void (async () => { try {