From 9ba43d7ca103a40f8b3965899a3f8c7aad0c3f56 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 20 May 2026 15:32:15 -0700 Subject: [PATCH 1/2] Preserve PTYs on daemon auto-update failure --- .../DaemonAutoUpdateFailureDialog.tsx | 139 ++++++++++++ .../DaemonAutoUpdateFailureDialog/index.ts | 1 + .../renderer/routes/_authenticated/layout.tsx | 2 + packages/host-service/DAEMON_SUPERVISION.md | 13 +- .../src/daemon/DaemonSupervisor.test.ts | 173 +++++---------- .../src/daemon/DaemonSupervisor.ts | 204 +++++------------- .../src/daemon/expected-version.ts | 9 +- .../router/terminal/terminal.daemon.test.ts | 2 + 8 files changed, 261 insertions(+), 282 deletions(-) create mode 100644 apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx create mode 100644 apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/index.ts diff --git a/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx new file mode 100644 index 00000000000..feb3ebd8161 --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx @@ -0,0 +1,139 @@ +import { + AlertDialog, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from "@superset/ui/alert-dialog"; +import { Button } from "@superset/ui/button"; +import { toast } from "@superset/ui/sonner"; +import { + WorkspaceClientProvider, + workspaceTrpc, +} from "@superset/workspace-client"; +import { useEffect, useState } from "react"; +import { + getHostServiceHeaders, + getHostServiceWsToken, +} from "renderer/lib/host-service-auth"; +import { useLocalHostService } from "renderer/routes/_authenticated/providers/LocalHostServiceProvider"; + +const STATUS_REFETCH_MS = 5_000; + +export function DaemonAutoUpdateFailureDialog() { + const { activeHostUrl } = useLocalHostService(); + if (!activeHostUrl) return null; + return ( + getHostServiceHeaders(activeHostUrl)} + wsToken={() => getHostServiceWsToken(activeHostUrl)} + > + + + ); +} + +function DaemonAutoUpdateFailureDialogInner() { + const [activeFailureId, setActiveFailureId] = useState(null); + const [dismissedFailureId, setDismissedFailureId] = useState( + null, + ); + + const updateStatusQuery = + workspaceTrpc.terminal.daemon.getUpdateStatus.useQuery(undefined, { + refetchInterval: STATUS_REFETCH_MS, + refetchOnWindowFocus: true, + }); + const sessionsQuery = workspaceTrpc.terminal.daemon.listSessions.useQuery( + undefined, + { + enabled: activeFailureId !== null, + refetchOnWindowFocus: true, + }, + ); + const restartDaemon = workspaceTrpc.terminal.daemon.restart.useMutation({ + onSuccess: () => { + toast.success("Daemon restarted", { + description: "All sessions were closed and a fresh daemon is running.", + }); + void updateStatusQuery.refetch(); + }, + onError: (error) => { + toast.error("Failed to restart daemon", { description: error.message }); + }, + }); + + const failure = updateStatusQuery.data?.autoUpdateFailure ?? null; + useEffect(() => { + if (!failure) { + setActiveFailureId(null); + return; + } + if (failure.id === dismissedFailureId) return; + setActiveFailureId(failure.id); + }, [failure, dismissedFailureId]); + + const closeDialog = () => { + if (activeFailureId) setDismissedFailureId(activeFailureId); + setActiveFailureId(null); + }; + + const sessions = sessionsQuery.data ?? null; + const aliveCount = + sessions === null + ? null + : sessions.filter((session) => session.alive).length; + + return ( + { + if (!open) closeDialog(); + }} + > + + + + Daemon update needs confirmation + + +
+ + Superset tried to update the terminal daemon without closing + sessions, but the handoff did not finish. Reason: + + + {failure?.reason ?? ""} + + + Force update will close every terminal session + {aliveCount && aliveCount > 0 ? ` (${aliveCount} running)` : ""}{" "} + and start a fresh daemon. + +
+
+
+ + + + +
+
+ ); +} diff --git a/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/index.ts b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/index.ts new file mode 100644 index 00000000000..4423c87ddba --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/index.ts @@ -0,0 +1 @@ +export { DaemonAutoUpdateFailureDialog } from "./DaemonAutoUpdateFailureDialog"; diff --git a/apps/desktop/src/renderer/routes/_authenticated/layout.tsx b/apps/desktop/src/renderer/routes/_authenticated/layout.tsx index dbc28401f70..0ce1429fcd9 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/layout.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/layout.tsx @@ -22,6 +22,7 @@ import { dragDropManager } from "renderer/lib/dnd"; import { electronTrpc } from "renderer/lib/electron-trpc"; import { showWorkspaceAutoNameWarningToast } from "renderer/lib/workspaces/showWorkspaceAutoNameWarningToast"; import { InitGitDialog } from "renderer/react-query/projects/InitGitDialog"; +import { DaemonAutoUpdateFailureDialog } from "renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog"; import { DashboardNewWorkspaceModal } from "renderer/routes/_authenticated/components/DashboardNewWorkspaceModal"; import { V1ImportModal } from "renderer/routes/_authenticated/components/V1ImportModal"; import { WorkspaceInitEffects } from "renderer/screens/main/components/WorkspaceInitEffects"; @@ -210,6 +211,7 @@ function AuthenticatedLayout() { + diff --git a/packages/host-service/DAEMON_SUPERVISION.md b/packages/host-service/DAEMON_SUPERVISION.md index 53a4ed9054b..3a5aff3ecac 100644 --- a/packages/host-service/DAEMON_SUPERVISION.md +++ b/packages/host-service/DAEMON_SUPERVISION.md @@ -65,8 +65,11 @@ read the running daemon's `daemonVersion`, compares against `updatePending: true` on the instance — the renderer surfaces a "restart to update" affordance. Manual updates try fd-handoff first and only force-restart after the user confirms. Automatic adoption updates -also try fd-handoff first, then force-restart on failure because there is -no foreground UI to ask for the destructive fallback. +also try fd-handoff first, but they never force-restart in the background; +on failure, the predecessor keeps running and `updatePending` remains +visible for an explicit user action. The failure reason is exposed through +`getUpdateStatus().autoUpdateFailure` so the desktop can show a global +force-update dialog without the supervisor taking the destructive path itself. Probe failure ≠ stale: a transient socket issue produces `runningVersion: "unknown", updatePending: false` rather than a @@ -137,11 +140,7 @@ The supervisor emits structured `console.log` lines with `pty_daemon_spawn`, `pty_daemon_adopt`, `pty_daemon_user_restart`, `pty_daemon_update_pending`, `pty_daemon_update`, `pty_daemon_auto_update_attempt`, `pty_daemon_auto_update_ok`, -`pty_daemon_auto_update_failed`, -`pty_daemon_auto_update_force_restart`, -`pty_daemon_auto_update_force_restart_ok`, -`pty_daemon_auto_update_force_restart_failed`, -`pty_daemon_auto_update_force_restart_skipped`, `pty_daemon_crash`, +`pty_daemon_auto_update_failed`, `pty_daemon_crash`, `pty_daemon_circuit_open`, `pty_daemon_spawn_failed`. No PostHog plumbing on host-service yet — promote to real telemetry when the path is needed. diff --git a/packages/host-service/src/daemon/DaemonSupervisor.test.ts b/packages/host-service/src/daemon/DaemonSupervisor.test.ts index 30464bc4f27..7fce703d867 100644 --- a/packages/host-service/src/daemon/DaemonSupervisor.test.ts +++ b/packages/host-service/src/daemon/DaemonSupervisor.test.ts @@ -368,6 +368,7 @@ describe("DaemonSupervisor.getUpdateStatus", () => { pending: false, running: "0.1.0", expected: "0.1.0", + autoUpdateFailure: null, }); }); @@ -381,6 +382,7 @@ describe("DaemonSupervisor.getUpdateStatus", () => { pending: true, running: "0.0.9", expected: "0.1.0", + autoUpdateFailure: null, }); }); @@ -393,6 +395,7 @@ describe("DaemonSupervisor.getUpdateStatus", () => { const status = sup.getUpdateStatus("org-probe-failed"); expect(status?.pending).toBe(false); expect(status?.running).toBe("unknown"); + expect(status?.autoUpdateFailure).toBeNull(); }); }); @@ -665,105 +668,88 @@ describe("DaemonSupervisor.update failure mode", () => { }); }); -describe("DaemonSupervisor auto-update fallback", () => { +describe("DaemonSupervisor auto-update best effort", () => { let sup: DaemonSupervisor; beforeEach(() => { sup = new DaemonSupervisor({ scriptPath: "/nonexistent" }); }); - test("force-restarts when the background smooth update returns ok:false", async () => { + test("leaves the predecessor running when the background smooth update returns ok:false", async () => { const instance = staleInstance("0.0.9"); - seedDaemonInstance(sup, "org-auto-fallback", instance); + seedDaemonInstance(sup, "org-auto-best-effort", instance); mockListSessions(sup, []); const runUpdateMock = mock(async () => ({ ok: false as const, reason: "snapshot write failed: ENOSPC", })); - const forceRestartCalls: { - organizationId: string; - log: ForceRestartLog; - }[] = []; - const forceRestartMock = mock( - async (organizationId: string, log: ForceRestartLog) => { - forceRestartCalls.push({ organizationId, log }); - return { success: true as const }; - }, - ); + const forceRestartMock = mock(async () => ({ success: true as const })); (sup as unknown as { runUpdate: typeof runUpdateMock }).runUpdate = runUpdateMock; ( sup as unknown as { - forceRestart: ( - organizationId: string, - log: ForceRestartLog, - ) => Promise<{ success: true }>; + forceRestart: () => Promise<{ success: true }>; } ).forceRestart = forceRestartMock; - invokeKickoffAutoUpdate(sup, "org-auto-fallback", instance); + invokeKickoffAutoUpdate(sup, "org-auto-best-effort", instance); await flushAutoUpdate(); - expect(runUpdateMock).toHaveBeenCalledWith("org-auto-fallback"); - expect(forceRestartMock).toHaveBeenCalledTimes(1); - expect(forceRestartCalls[0]?.organizationId).toBe("org-auto-fallback"); - expect(forceRestartCalls[0]?.log).toMatchObject({ - event: "pty_daemon_auto_update_force_restart", - props: { - smoothUpdateFailureReason: "snapshot write failed: ENOSPC", - smoothUpdatePid: instance.pid, - smoothUpdateRunningVersion: instance.runningVersion, - smoothUpdateExpectedVersion: instance.expectedVersion, - }, - }); + expect(runUpdateMock).toHaveBeenCalledWith("org-auto-best-effort"); + expect(forceRestartMock).not.toHaveBeenCalled(); + expect(getInstancePid(sup, "org-auto-best-effort")).toBe(instance.pid); + const status = sup.getUpdateStatus("org-auto-best-effort"); + expect(status?.pending).toBe(true); + expect(status?.autoUpdateFailure?.reason).toBe( + "snapshot write failed: ENOSPC", + ); expect( loggedEvents.some( - (e) => e.event === "pty_daemon_auto_update_force_restart_ok", + (e) => + e.event === "pty_daemon_auto_update_failed" && + e.props.reason === "snapshot write failed: ENOSPC" && + e.props.leftPending === true, ), ).toBe(true); }); - test("force-restarts when the background smooth update throws", async () => { + test("leaves the predecessor running when the background smooth update throws", async () => { const instance = staleInstance("0.0.8"); seedDaemonInstance(sup, "org-auto-throw", instance); mockListSessions(sup, []); const runUpdateMock = mock(async () => { throw new Error("transport: ECONNRESET"); }); - const forceRestartCalls: { - organizationId: string; - log: ForceRestartLog; - }[] = []; - const forceRestartMock = mock( - async (organizationId: string, log: ForceRestartLog) => { - forceRestartCalls.push({ organizationId, log }); - return { success: true as const }; - }, - ); + const forceRestartMock = mock(async () => ({ success: true as const })); (sup as unknown as { runUpdate: typeof runUpdateMock }).runUpdate = runUpdateMock; ( sup as unknown as { - forceRestart: ( - organizationId: string, - log: ForceRestartLog, - ) => Promise<{ success: true }>; + forceRestart: () => Promise<{ success: true }>; } ).forceRestart = forceRestartMock; invokeKickoffAutoUpdate(sup, "org-auto-throw", instance); await flushAutoUpdate(); - expect(forceRestartMock).toHaveBeenCalledTimes(1); - expect(forceRestartCalls[0]?.log).toMatchObject({ - event: "pty_daemon_auto_update_force_restart", - props: { - smoothUpdateFailureReason: "threw: transport: ECONNRESET", - }, - }); + expect(forceRestartMock).not.toHaveBeenCalled(); + expect(getInstancePid(sup, "org-auto-throw")).toBe(instance.pid); + const status = sup.getUpdateStatus("org-auto-throw"); + expect(status?.pending).toBe(true); + expect(status?.autoUpdateFailure?.reason).toBe( + "threw: transport: ECONNRESET", + ); + expect( + loggedEvents.some( + (e) => + e.event === "pty_daemon_auto_update_failed" && + e.props.reason === "threw: transport: ECONNRESET" && + e.props.leftPending === true, + ), + ).toBe(true); }); - test("skips force-restart when the failed stale daemon is no longer current", async () => { + test("does not overwrite the current daemon if the failed update changed it", async () => { const instance = staleInstance("0.0.7"); seedDaemonInstance(sup, "org-auto-changed", instance); mockListSessions(sup, []); @@ -784,10 +770,7 @@ describe("DaemonSupervisor auto-update fallback", () => { runUpdateMock; ( sup as unknown as { - forceRestart: ( - organizationId: string, - log: ForceRestartLog, - ) => Promise<{ success: true }>; + forceRestart: () => Promise<{ success: true }>; } ).forceRestart = forceRestartMock; @@ -795,13 +778,16 @@ describe("DaemonSupervisor auto-update fallback", () => { await flushAutoUpdate(); expect(forceRestartMock).not.toHaveBeenCalled(); + expect(getInstancePid(sup, "org-auto-changed")).toBe(4321); + expect( + sup.getUpdateStatus("org-auto-changed")?.autoUpdateFailure, + ).toBeNull(); expect( loggedEvents.some( (e) => - e.event === "pty_daemon_auto_update_force_restart_skipped" && - e.props.reason === "daemon_changed" && - e.props.smoothUpdatePid === instance.pid && - e.props.currentPid === 4321, + e.event === "pty_daemon_auto_update_failed" && + e.props.reason === "successor ack timed out after 5000ms" && + e.props.leftPending === true, ), ).toBe(true); }); @@ -832,49 +818,7 @@ describe("DaemonSupervisor auto-update fallback", () => { ).toBe(true); }); - test("skips force-restart when live sessions appear after a failed smooth update", async () => { - const instance = staleInstance("0.0.5"); - seedDaemonInstance(sup, "org-auto-race", instance); - let listCalls = 0; - const listSessionsMock = mock(async () => { - listCalls += 1; - return listCalls === 1 ? [] : [aliveSession("late-live")]; - }); - (sup as unknown as { listSessions: typeof sup.listSessions }).listSessions = - listSessionsMock as typeof sup.listSessions; - const runUpdateMock = mock(async () => ({ - ok: false as const, - reason: "successor ack timed out after 5000ms", - })); - const forceRestartMock = mock(async () => ({ success: true as const })); - (sup as unknown as { runUpdate: typeof runUpdateMock }).runUpdate = - runUpdateMock; - ( - sup as unknown as { - forceRestart: ( - organizationId: string, - log: ForceRestartLog, - ) => Promise<{ success: true }>; - } - ).forceRestart = forceRestartMock; - - invokeKickoffAutoUpdate(sup, "org-auto-race", instance); - await flushAutoUpdate(); - - expect(runUpdateMock).toHaveBeenCalledTimes(1); - expect(forceRestartMock).not.toHaveBeenCalled(); - expect( - loggedEvents.some( - (e) => - e.event === "pty_daemon_auto_update_force_restart_skipped" && - e.props.reason === "live_sessions_present" && - e.props.aliveSessionCount === 1 && - e.props.smoothUpdatePid === instance.pid, - ), - ).toBe(true); - }); - - test("skips force-restart when auto-update joins an existing manual update", async () => { + test("joins an existing manual update without adding a destructive fallback", async () => { const instance = staleInstance("0.0.6"); seedDaemonInstance(sup, "org-auto-coalesced", instance); mockListSessions(sup, []); @@ -885,10 +829,7 @@ describe("DaemonSupervisor auto-update fallback", () => { runUpdateMock; ( sup as unknown as { - forceRestart: ( - organizationId: string, - log: ForceRestartLog, - ) => Promise<{ success: true }>; + forceRestart: () => Promise<{ success: true }>; } ).forceRestart = forceRestartMock; @@ -904,12 +845,15 @@ describe("DaemonSupervisor auto-update fallback", () => { expect(runUpdateMock).toHaveBeenCalledTimes(1); expect(forceRestartMock).not.toHaveBeenCalled(); + expect( + sup.getUpdateStatus("org-auto-coalesced")?.autoUpdateFailure, + ).toBeNull(); expect( loggedEvents.some( (e) => - e.event === "pty_daemon_auto_update_force_restart_skipped" && - e.props.reason === "update_already_in_flight" && - e.props.smoothUpdatePid === instance.pid, + e.event === "pty_daemon_auto_update_failed" && + e.props.reason === "manual smooth update failed" && + e.props.leftPending === true, ), ).toBe(true); }); @@ -917,11 +861,6 @@ describe("DaemonSupervisor auto-update fallback", () => { // ---------------- helpers ---------------- -interface ForceRestartLog { - event: string; - props: Record; -} - interface Deferred { promise: Promise; resolve: (v: T) => void; diff --git a/packages/host-service/src/daemon/DaemonSupervisor.ts b/packages/host-service/src/daemon/DaemonSupervisor.ts index e51bdb41faf..42dd8fd85c1 100644 --- a/packages/host-service/src/daemon/DaemonSupervisor.ts +++ b/packages/host-service/src/daemon/DaemonSupervisor.ts @@ -48,6 +48,8 @@ interface DaemonInstance { expectedVersion: string; /** True when running < expected. Probe failure does NOT set this. */ updatePending: boolean; + /** Last failed background update attempt for this still-running daemon. */ + autoUpdateFailure?: DaemonAutoUpdateFailure; } interface DaemonProbeResult { @@ -55,6 +57,19 @@ interface DaemonProbeResult { daemonPid?: number; } +interface DaemonAutoUpdateFailure { + id: string; + reason: string; + failedAt: number; +} + +interface DaemonUpdateStatus { + pending: boolean; + running: string; + expected: string; + autoUpdateFailure: DaemonAutoUpdateFailure | null; +} + const SOCKET_READY_TIMEOUT_MS = 5_000; const VERSION_PROBE_TIMEOUT_MS = 1_500; const HANDOFF_PREDECESSOR_EXIT_TIMEOUT_MS = 3_000; @@ -116,8 +131,9 @@ export interface DaemonSupervisorOptions { /** * When true (default), opportunistically calls `update()` after * adopting a daemon whose `runningVersion < EXPECTED_DAEMON_VERSION`. - * If that smooth handoff fails, auto-update falls back to a force - * restart so the app does not keep running an incompatible daemon. + * Background auto-update is best-effort only: if fd-handoff cannot + * preserve the daemon's sessions, the predecessor keeps running and + * the desktop UI remains the explicit place for a destructive force restart. * Set to false in integration tests that intentionally adopt a stale * daemon and assert the version-drift flag without the test racing a * real handoff. @@ -189,15 +205,14 @@ export class DaemonSupervisor { * means the version probe failed during adoption — treat as not-pending * (probe failure ≠ stale). */ - getUpdateStatus( - organizationId: string, - ): { pending: boolean; running: string; expected: string } | null { + getUpdateStatus(organizationId: string): DaemonUpdateStatus | null { const instance = this.instances.get(organizationId); if (!instance) return null; return { pending: instance.updatePending, running: instance.runningVersion, expected: instance.expectedVersion, + autoUpdateFailure: instance.autoUpdateFailure ?? null, }; } @@ -526,7 +541,13 @@ export class DaemonSupervisor { runningVersion: instance.runningVersion, expectedVersion: instance.expectedVersion, reason: `threw: ${(err as Error).message}`, + leftPending: true, }); + this.recordAutoUpdateFailure( + organizationId, + instance, + `threw: ${(err as Error).message}`, + ); }); } @@ -554,8 +575,6 @@ export class DaemonSupervisor { return; } - // If a manual Update click already owns the in-flight handoff, leave - // any destructive fallback to that foreground UI path. const update = this.startUpdate(organizationId); try { const result = await update.promise; @@ -575,21 +594,11 @@ export class DaemonSupervisor { runningVersion: instance.runningVersion, expectedVersion: instance.expectedVersion, reason: result.reason, + leftPending: true, }); - if (!update.started) { - this.skipAutoUpdateForceRestart( - organizationId, - instance, - result.reason, - "update_already_in_flight", - ); - return; + if (update.started) { + this.recordAutoUpdateFailure(organizationId, instance, result.reason); } - await this.forceRestartAfterAutoUpdateFailure( - organizationId, - instance, - result.reason, - ); } catch (err) { const reason = `threw: ${(err as Error).message}`; logEvent("pty_daemon_auto_update_failed", { @@ -598,158 +607,47 @@ export class DaemonSupervisor { runningVersion: instance.runningVersion, expectedVersion: instance.expectedVersion, reason, + leftPending: true, }); - if (!update.started) { - this.skipAutoUpdateForceRestart( - organizationId, - instance, - reason, - "update_already_in_flight", - ); - return; + if (update.started) { + this.recordAutoUpdateFailure(organizationId, instance, reason); } - await this.forceRestartAfterAutoUpdateFailure( - organizationId, - instance, - reason, - ); } } - private deferAutoUpdate( + private recordAutoUpdateFailure( organizationId: string, instance: DaemonInstance, reason: string, - extra: Record = {}, ): void { - logEvent("pty_daemon_auto_update_deferred", { - organizationId, - pid: instance.pid, - runningVersion: instance.runningVersion, - expectedVersion: instance.expectedVersion, - reason, - ...extra, - }); - } - - private async canAutoUpdateForceRestart( - organizationId: string, - instance: DaemonInstance, - failureReason: string, - ): Promise { - const sessions = await this.listSessions( - organizationId, - AUTO_UPDATE_SESSION_LIST_TIMEOUT_MS, - ); - if (sessions === null) { - this.skipAutoUpdateForceRestart( - organizationId, - instance, - failureReason, - "session_list_unavailable", - ); - return false; - } - const aliveSessionCount = countAliveSessions(sessions); - if (aliveSessionCount > 0) { - this.skipAutoUpdateForceRestart( - organizationId, - instance, - failureReason, - "live_sessions_present", - { aliveSessionCount }, - ); - return false; + const current = this.instances.get(organizationId); + if (!current || current.pid !== instance.pid || !current.updatePending) { + return; } - return true; + const failedAt = Date.now(); + current.autoUpdateFailure = { + id: `${current.pid}:${current.runningVersion}:${current.expectedVersion}:${failedAt}`, + reason, + failedAt, + }; } - private skipAutoUpdateForceRestart( + private deferAutoUpdate( organizationId: string, instance: DaemonInstance, - failureReason: string, reason: string, extra: Record = {}, ): void { - logEvent("pty_daemon_auto_update_force_restart_skipped", { + logEvent("pty_daemon_auto_update_deferred", { organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: failureReason, + pid: instance.pid, + runningVersion: instance.runningVersion, + expectedVersion: instance.expectedVersion, reason, ...extra, }); } - private async forceRestartAfterAutoUpdateFailure( - organizationId: string, - instance: DaemonInstance, - reason: string, - ): Promise { - const current = this.instances.get(organizationId); - if (!current) { - logEvent("pty_daemon_auto_update_force_restart_skipped", { - organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: reason, - reason: "no_current_daemon", - }); - return; - } - if (current.pid !== instance.pid) { - logEvent("pty_daemon_auto_update_force_restart_skipped", { - organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: reason, - reason: "daemon_changed", - currentPid: current.pid, - currentRunningVersion: current.runningVersion, - currentExpectedVersion: current.expectedVersion, - currentUpdatePending: current.updatePending, - }); - return; - } - if (!current.updatePending) { - logEvent("pty_daemon_auto_update_force_restart_skipped", { - organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: reason, - reason: "daemon_no_longer_pending", - currentRunningVersion: current.runningVersion, - currentExpectedVersion: current.expectedVersion, - }); - return; - } - if ( - !(await this.canAutoUpdateForceRestart(organizationId, current, reason)) - ) { - return; - } - - try { - await this.forceRestart(organizationId, { - event: "pty_daemon_auto_update_force_restart", - props: { - smoothUpdateFailureReason: reason, - smoothUpdatePid: instance.pid, - smoothUpdateRunningVersion: instance.runningVersion, - smoothUpdateExpectedVersion: instance.expectedVersion, - }, - }); - logEvent("pty_daemon_auto_update_force_restart_ok", { - organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: reason, - }); - } catch (err) { - logEvent("pty_daemon_auto_update_force_restart_failed", { - organizationId, - smoothUpdatePid: instance.pid, - smoothUpdateFailureReason: reason, - reason: (err as Error).message, - }); - } - } - /** * Dev-only: SIGTERM any existing daemon for this org so the next * adopt-or-spawn always lands on a fresh daemon process. Reads @@ -802,11 +700,9 @@ export class DaemonSupervisor { this.maybeFireUpdatePending(organizationId, adopted); this.startAdoptedLivenessCheck(organizationId, adopted.pid); // Auto-update opportunistically: if the adopted daemon is older - // than the bundled binary, kick off a handoff in the background. - // Sessions survive on smooth success; on smooth failure the - // background path force-restarts because there is no foreground UI - // to ask the user for the destructive fallback. Fire-and-track so - // bootstrap returns immediately. + // than the bundled binary, try a smooth handoff in the background. + // This path never force-restarts; if handoff fails, the desktop UI + // exposes the explicit destructive fallback. if (adopted.updatePending && this.opts.autoUpdate !== false) { this.kickoffAutoUpdate(organizationId, adopted); } diff --git a/packages/host-service/src/daemon/expected-version.ts b/packages/host-service/src/daemon/expected-version.ts index 3f5ede8a582..73ca8055272 100644 --- a/packages/host-service/src/daemon/expected-version.ts +++ b/packages/host-service/src/daemon/expected-version.ts @@ -1,12 +1,13 @@ // Drives the "update pending" UX: host-service marks updatePending=true // when an adopted daemon's version is below this. Derived from the -// daemon's own package.json so a daemon bump automatically forces the -// upgrade — the only valid daemon-version source of truth in the repo +// daemon's own package.json so a daemon bump automatically marks older +// daemons pending — the only valid daemon-version source of truth in the repo // is `pty-daemon/package.json#version`. // // We pass this to spawned daemons via SUPERSET_PTY_DAEMON_VERSION and -// probe it back on adoption. We do NOT auto-kill on mismatch — sessions -// live in the daemon; the user explicitly triggers restart. +// probe it back on adoption. We do NOT auto-kill on mismatch or failed +// background handoff — sessions live in the daemon; the user explicitly +// triggers restart. import ptyDaemonPackageJson from "@superset/pty-daemon/package.json" with { type: "json", diff --git a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts index 1420ee7bc3c..643039c03ba 100644 --- a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts +++ b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts @@ -52,6 +52,7 @@ describe("terminal.daemon tRPC procedures", () => { pending: true, running: "0.0.9", expected: "0.1.0", + autoUpdateFailure: null, })); ( sup as unknown as { getUpdateStatus: typeof sup.getUpdateStatus } @@ -68,6 +69,7 @@ describe("terminal.daemon tRPC procedures", () => { pending: true, running: "0.0.9", expected: "0.1.0", + autoUpdateFailure: null, }); }); From f2f1a517dd1b1f1bb35a464779fe8aca470801f6 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 20 May 2026 16:00:16 -0700 Subject: [PATCH 2/2] Address daemon update review feedback --- .../DaemonAutoUpdateFailureDialog.tsx | 63 +++++++++++++++---- packages/host-service/DAEMON_SUPERVISION.md | 7 ++- .../src/daemon/DaemonSupervisor.test.ts | 4 +- .../src/daemon/DaemonSupervisor.ts | 34 ++++++++-- 4 files changed, 84 insertions(+), 24 deletions(-) diff --git a/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx index feb3ebd8161..26adfed2623 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/components/DaemonAutoUpdateFailureDialog/DaemonAutoUpdateFailureDialog.tsx @@ -20,10 +20,29 @@ import { import { useLocalHostService } from "renderer/routes/_authenticated/providers/LocalHostServiceProvider"; const STATUS_REFETCH_MS = 5_000; +const DISMISSED_FAILURE_STORAGE_KEY_PREFIX = + "superset.daemon-auto-update-failure.dismissed."; + +function getDismissedFailureId(storageKey: string): string | null { + try { + return window.localStorage.getItem(storageKey); + } catch { + return null; + } +} + +function saveDismissedFailureId(storageKey: string, failureId: string): void { + try { + window.localStorage.setItem(storageKey, failureId); + } catch { + // Best effort; in-memory state still suppresses this failure for the session. + } +} export function DaemonAutoUpdateFailureDialog() { - const { activeHostUrl } = useLocalHostService(); + const { activeHostUrl, activeOrganizationId } = useLocalHostService(); if (!activeHostUrl) return null; + const dismissedFailureStorageKey = `${DISMISSED_FAILURE_STORAGE_KEY_PREFIX}${activeOrganizationId ?? activeHostUrl}`; return ( getHostServiceHeaders(activeHostUrl)} wsToken={() => getHostServiceWsToken(activeHostUrl)} > - + ); } -function DaemonAutoUpdateFailureDialogInner() { +function DaemonAutoUpdateFailureDialogInner({ + dismissedFailureStorageKey, +}: { + dismissedFailureStorageKey: string; +}) { const [activeFailureId, setActiveFailureId] = useState(null); const [dismissedFailureId, setDismissedFailureId] = useState( - null, + () => getDismissedFailureId(dismissedFailureStorageKey), ); const updateStatusQuery = @@ -48,15 +73,28 @@ function DaemonAutoUpdateFailureDialogInner() { refetchInterval: STATUS_REFETCH_MS, refetchOnWindowFocus: true, }); + const closeDialog = () => { + if (activeFailureId) { + setDismissedFailureId(activeFailureId); + saveDismissedFailureId(dismissedFailureStorageKey, activeFailureId); + } + setActiveFailureId(null); + }; + useEffect(() => { + setDismissedFailureId(getDismissedFailureId(dismissedFailureStorageKey)); + }, [dismissedFailureStorageKey]); + const sessionsQuery = workspaceTrpc.terminal.daemon.listSessions.useQuery( undefined, { enabled: activeFailureId !== null, + refetchInterval: activeFailureId !== null ? STATUS_REFETCH_MS : false, refetchOnWindowFocus: true, }, ); const restartDaemon = workspaceTrpc.terminal.daemon.restart.useMutation({ onSuccess: () => { + closeDialog(); toast.success("Daemon restarted", { description: "All sessions were closed and a fresh daemon is running.", }); @@ -77,11 +115,6 @@ function DaemonAutoUpdateFailureDialogInner() { setActiveFailureId(failure.id); }, [failure, dismissedFailureId]); - const closeDialog = () => { - if (activeFailureId) setDismissedFailureId(activeFailureId); - setActiveFailureId(null); - }; - const sessions = sessionsQuery.data ?? null; const aliveCount = sessions === null @@ -92,7 +125,7 @@ function DaemonAutoUpdateFailureDialogInner() { { - if (!open) closeDialog(); + if (!open && !restartDaemon.isPending) closeDialog(); }} > @@ -106,7 +139,7 @@ function DaemonAutoUpdateFailureDialogInner() { Superset tried to update the terminal daemon without closing sessions, but the handoff did not finish. Reason: - + {failure?.reason ?? ""} @@ -118,7 +151,12 @@ function DaemonAutoUpdateFailureDialogInner() { -