diff --git a/apps/desktop/electron.vite.config.ts b/apps/desktop/electron.vite.config.ts index 5f073935c45..f11759ceaa2 100644 --- a/apps/desktop/electron.vite.config.ts +++ b/apps/desktop/electron.vite.config.ts @@ -111,6 +111,9 @@ export default defineConfig({ "git-task-worker": resolve("src/main/git-task-worker.ts"), // Workspace service - local HTTP/tRPC server per org "host-service": resolve("src/main/host-service/index.ts"), + // pty-daemon - long-lived per-org Unix-socket server that owns PTYs. + // Spawned by host-service's DaemonSupervisor; survives host-service restarts. + "pty-daemon": resolve("src/main/pty-daemon/index.ts"), }, output: { dir: resolve(devPath, "main"), diff --git a/apps/desktop/package.json b/apps/desktop/package.json index 43eeeb16a4d..ee1052a1530 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -90,6 +90,7 @@ "@superset/macos-process-metrics": "workspace:*", "@superset/panes": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/ui": "workspace:*", @@ -213,7 +214,7 @@ "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "remark-gfm": "^4.0.1", - "semver": "^7.7.3", + "semver": "^7.7.4", "shell-env": "^4.0.3", "shell-quote": "^1.8.3", "shiki": "^3.21.0", diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index a34826d516f..3b745dbae1d 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -41,8 +41,18 @@ import { HOOK_PROTOCOL_VERSION } from "./terminal/env"; * `device.ensureV2Host`); v2_hosts/v2_users_hosts/v2_workspaces use * machineId text instead of uuid surrogates. * 0.2.0: `workspaceCreation.adopt` gained optional `worktreePath`. + * + * 0.5.0 — pty-daemon supervision migrated into host-service.
New + * `terminal.daemon` tRPC namespace; older 0.4.x host-services don't + * expose it. Adopting one in place would leave the new desktop + * talking to old code: Settings → Manage daemon would silently + * fail, and the v2 PTY survival promise is broken. Bumping the + * floor forces the coordinator's `tryAdopt` (host-service-coordinator + * line ~308) to SIGTERM old host-services on first launch and + * respawn with the new bundle. One-time terminal-session loss for + * users on upgrade — accepted per release-notes guidance. */ -const MIN_HOST_SERVICE_VERSION = "0.4.0"; +const MIN_HOST_SERVICE_VERSION = "0.5.0"; export type HostServiceStatus = "starting" | "running" | "stopped"; @@ -82,6 +92,10 @@ export class HostServiceCoordinator extends EventEmitter { private scriptPath = path.join(__dirname, "host-service.js"); private machineId = getHostId(); private devReloadWatcher: fs.FSWatcher | null = null; + // Note: pty-daemon supervision moved into host-service itself — + // see packages/host-service/src/daemon. Host-service spawns and adopts + // the daemon when it boots, so the desktop coordinator no longer needs + // to know about it. async start( organizationId: string, @@ -385,6 +399,9 @@ export class HostServiceCoordinator extends EventEmitter { this.instances.set(organizationId, instance); this.emitStatus(organizationId, "starting", null); + // pty-daemon is supervised by host-service itself; this coordinator + // only spawns host-service and steps out. See + // packages/host-service/src/daemon for the supervisor lifecycle. const childEnv = await this.buildEnv(organizationId, port, secret, config); // Host-service owns v2 PTYs, so it must survive Electron restarts in // every environment. This mirrors the terminal-host daemon: detach the @@ -394,8 +411,16 @@ export class HostServiceCoordinator extends EventEmitter { path.join(manifestDir(organizationId), "host-service.log"), MAX_HOST_LOG_BYTES, ); - const stdio: childProcess.StdioOptions = - logFd >= 0 ? 
["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; + // Dev: pipe child stdout/stderr through this process so log lines + // land in the developer's `bun dev` terminal. Production: hard-back + // stdio with the rotating log file so the detached child survives + // parent teardown without losing logs. + const isDev = !app.isPackaged; + const stdio: childProcess.StdioOptions = isDev + ? ["ignore", "pipe", "pipe"] + : logFd >= 0 + ? ["ignore", logFd, logFd] + : ["ignore", "ignore", "ignore"]; let child: ReturnType; try { @@ -416,6 +441,15 @@ export class HostServiceCoordinator extends EventEmitter { } } + // In dev, fan child output through to parent stdout/stderr with a + // prefix so it's identifiable in `bun dev`. The detached child has + // its own session, so closing pipes won't kill it on parent exit. + if (isDev && child.stdout && child.stderr) { + const tag = `[hs:${organizationId.slice(0, 8)}]`; + pipeWithPrefix(child.stdout, process.stdout, tag); + pipeWithPrefix(child.stderr, process.stderr, tag); + } + const childPid = child.pid; if (!childPid) { this.instances.delete(organizationId); @@ -540,6 +574,33 @@ export class HostServiceCoordinator extends EventEmitter { } } +/** + * Forward child stdout/stderr to a parent stream with a per-line prefix. + * Plain `chunk => parent.write(`${tag} ${chunk}`)` only prefixes the first + * line in a chunk and breaks visual scanning when child output bursts. + */ +function pipeWithPrefix( + source: NodeJS.ReadableStream, + target: NodeJS.WritableStream, + tag: string, +): void { + let pending = ""; + source.on("data", (chunk: Buffer) => { + const text = pending + chunk.toString("utf8"); + const lines = text.split("\n"); + // Last element is a partial line if input doesn't end with \n; + // stash it for the next chunk. + pending = lines.pop() ?? 
""; + for (const line of lines) { + target.write(`${tag} ${line}\n`); + } + }); + source.on("end", () => { + if (pending) target.write(`${tag} ${pending}\n`); + pending = ""; + }); +} + let coordinator: HostServiceCoordinator | null = null; export function getHostServiceCoordinator(): HostServiceCoordinator { diff --git a/apps/desktop/src/main/pty-daemon/index.ts b/apps/desktop/src/main/pty-daemon/index.ts new file mode 100644 index 00000000000..330755139af --- /dev/null +++ b/apps/desktop/src/main/pty-daemon/index.ts @@ -0,0 +1,79 @@ +/** + * pty-daemon — Desktop bundle target + * + * The supervisor (in @superset/host-service) spawns this script as the + * daemon process. We need a desktop-side entry so electron-vite emits + * `apps/desktop/dist/main/pty-daemon.js` alongside `host-service.js` — + * the supervisor's `sideBySide` script-path resolution looks for the + * daemon binary right next to its own bundle. + * + * The actual daemon implementation lives in `@superset/pty-daemon`. + * This file is a thin runtime shim: argv parsing, signal handling, + * and starting the Server. Mirrors the layout host-service uses + * (apps/desktop/src/main/host-service/index.ts). + * + * Headless deploy path: in a non-Electron build, this file is unused — + * the supervisor instead spawns the @superset/pty-daemon package's + * built-in main.ts directly. + */ + +import { Server } from "@superset/pty-daemon"; + +interface CliArgs { + socket: string; +} + +function parseArgs(argv: string[]): CliArgs { + const args: Partial = {}; + for (const arg of argv) { + if (arg.startsWith("--socket=")) { + args.socket = arg.slice("--socket=".length); + } + } + if (!args.socket) { + throw new Error("--socket=PATH is required"); + } + return args as CliArgs; +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + // Source of truth for daemon version — the supervisor sets this env + // var on spawn (matching its EXPECTED_DAEMON_VERSION). 
Falls back to + // a hardcoded default if launched without env, so the daemon still + // reports something sane on direct invocation. + const daemonVersion = process.env.SUPERSET_PTY_DAEMON_VERSION ?? "0.1.0"; + const server = new Server({ + socketPath: args.socket, + daemonVersion, + }); + await server.listen(); + process.stderr.write( + `[pty-daemon] listening on ${args.socket} (v${daemonVersion})\n`, + ); + + let shuttingDown = false; + const shutdown = async (signal: NodeJS.Signals) => { + if (shuttingDown) return; + shuttingDown = true; + process.stderr.write(`[pty-daemon] received ${signal}, shutting down\n`); + try { + await server.close(); + } catch (err) { + process.stderr.write( + `[pty-daemon] shutdown error: ${(err as Error).stack ?? err}\n`, + ); + } finally { + process.exit(0); + } + }; + process.on("SIGINT", () => void shutdown("SIGINT")); + process.on("SIGTERM", () => void shutdown("SIGTERM")); +} + +void main().catch((error) => { + process.stderr.write( + `[pty-daemon] failed to start: ${(error as Error).stack ?? 
error}\n`, + ); + process.exit(1); +}); diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx index 502ace48ab9..f2195247076 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx @@ -9,6 +9,7 @@ import { LinkBehaviorSetting } from "./components/LinkBehaviorSetting"; import { PresetsSection } from "./components/PresetsSection"; import { SessionsSection } from "./components/SessionsSection"; import { V2PresetsSection } from "./components/V2PresetsSection"; +import { V2SessionsSection } from "./components/V2SessionsSection"; interface TerminalSettingsProps { visibleItems?: SettingItemId[] | null; @@ -97,7 +98,12 @@ export function TerminalSettings({ /> ))} {showLinkBehavior && } - {showSessions && } + {showSessions && + (isV2CloudEnabled ? ( + + ) : ( + + ))} ); diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx new file mode 100644 index 00000000000..f46c6eed9bc --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx @@ -0,0 +1,305 @@ +// V2 Settings → Terminal → Manage daemon section. +// +// Talks to host-service's `terminal.daemon` namespace — the supervisor +// that owns pty-daemon's lifecycle lives there, not in desktop main. +// What's *not* duplicated from v1: kill-all-sessions, clear-history, +// per-row kill. 
Restart already achieves the kill-all effect for v2; +// scrollback is owned per-session by the daemon's ring buffer with no +// disk persistence; per-row kill belongs in the renderer's pane controls. +// +// Provider plumbing: workspaceTrpc needs a WorkspaceClientProvider with a +// real host URL. Settings routes are *outside* any per-workspace provider +// (they're org-level), so we mount our own here using the active org's +// host URL from LocalHostServiceProvider. Without this wrapping, hooks +// fall through to electron-trpc and fail with "no procedure on path +// terminal.daemon.*" — there's no such namespace on electron-trpc. + +import { + AlertDialog, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from "@superset/ui/alert-dialog"; +import { Button } from "@superset/ui/button"; +import { Label } from "@superset/ui/label"; +import { toast } from "@superset/ui/sonner"; +import { + WorkspaceClientProvider, + workspaceTrpc, +} from "@superset/workspace-client"; +import { useState } from "react"; +import { + getHostServiceHeaders, + getHostServiceWsToken, +} from "renderer/lib/host-service-auth"; +import { useLocalHostService } from "renderer/routes/_authenticated/providers/LocalHostServiceProvider"; + +const REFETCH_WHILE_OPEN_MS = 5_000; + +export function V2SessionsSection() { + const { activeHostUrl } = useLocalHostService(); + if (!activeHostUrl) { + return ( +
+ +

+ Host service is starting… +

+
+ ); + } + return ( + getHostServiceHeaders(activeHostUrl)} + wsToken={() => getHostServiceWsToken(activeHostUrl)} + > + + + ); +} + +function V2SessionsSectionInner() { + const [confirmRestartOpen, setConfirmRestartOpen] = useState(false); + const [showSessionList, setShowSessionList] = useState(false); + + const updateStatusQuery = + workspaceTrpc.terminal.daemon.getUpdateStatus.useQuery(undefined, { + refetchOnWindowFocus: true, + }); + const sessionsQuery = workspaceTrpc.terminal.daemon.listSessions.useQuery( + undefined, + { + // Poll while the user keeps the list expanded — sessions + // die/come up while they watch. Otherwise refetch on focus only. + refetchInterval: showSessionList ? REFETCH_WHILE_OPEN_MS : false, + refetchOnWindowFocus: true, + }, + ); + // Surface query errors so they're visible in renderer logs even when + // the section's UI gracefully degrades to "Daemon unavailable". + if (updateStatusQuery.error) { + console.error( + "[V2SessionsSection] getUpdateStatus error:", + updateStatusQuery.error, + ); + } + if (sessionsQuery.error) { + console.error( + "[V2SessionsSection] listSessions error:", + sessionsQuery.error, + ); + } + + const restartDaemon = workspaceTrpc.terminal.daemon.restart.useMutation({ + onSuccess: () => { + const versions = updateStatusQuery.data; + toast.success("Daemon restarted", { + description: + versions && versions.running !== versions.expected + ? `Now running ${versions.expected} (was ${versions.running}). All sessions were closed.` + : "All sessions were closed and a fresh daemon is running.", + }); + void updateStatusQuery.refetch(); + void sessionsQuery.refetch(); + }, + onError: (error) => { + toast.error("Failed to restart daemon", { description: error.message }); + }, + }); + + const sessions = sessionsQuery.data ?? null; + const aliveCount = + sessions === null ? 
null : sessions.filter((s) => s.alive).length; + const updatePending = updateStatusQuery.data?.pending === true; + const versions = updateStatusQuery.data; + + const sessionCountLabel = (() => { + if (sessions === null) return "Daemon unavailable"; + if (aliveCount === 0) return "No sessions running"; + return `${aliveCount} session${aliveCount === 1 ? "" : "s"} running`; + })(); + + const versionLabel = (() => { + if (!versions) return null; + if (versions.running === "unknown") { + return `bundled ${versions.expected}`; + } + if (updatePending) { + return `${versions.running} → ${versions.expected} pending`; + } + return versions.running; + })(); + + return ( + <> +
+
+
+ +

+ The terminal daemon owns all PTY sessions. It survives app + restarts so your shells, builds, and agents keep running. +

+
+ +
+ +
+ + 0 + ? "size-1.5 rounded-full bg-emerald-500" + : "size-1.5 rounded-full bg-muted-foreground/60" + } + /> + {sessionCountLabel} + + {versionLabel ? ( + + {versionLabel} + + ) : null} + {updatePending ? ( + + Update available + + ) : null} +
+ +
+ + +
+ + {showSessionList && sessions && sessions.length > 0 ? ( +
+
+ + + + + + + + + + + {sessions.map((s) => ( + + + + + + + ))} + +
SessionPIDSizeStatus
{s.id} + {s.pid || "—"} + + {s.cols}×{s.rows} + + + {s.alive ? "Alive" : "Exited"} + +
+
+
+ ) : null} +
+ + + + + + {updatePending + ? "Restart and apply update?" + : "Restart terminal daemon?"} + + +
+ + This closes every terminal session for your organization + {aliveCount && aliveCount > 0 + ? ` (${aliveCount} running)` + : ""}{" "} + and starts a fresh daemon. + + {updatePending && versions ? ( + + Restarting will load{" "} + {versions.expected}{" "} + (currently running{" "} + {versions.running}). + + ) : null} +
+
+
+ + + + +
+
+ + ); +} diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts new file mode 100644 index 00000000000..13f0812ce81 --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts @@ -0,0 +1 @@ +export { V2SessionsSection } from "./V2SessionsSection"; diff --git a/bun.lock b/bun.lock index 9e8a97c64f2..43042cdbfd8 100644 --- a/bun.lock +++ b/bun.lock @@ -168,6 +168,7 @@ "@superset/macos-process-metrics": "workspace:*", "@superset/panes": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/ui": "workspace:*", @@ -291,7 +292,7 @@ "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "remark-gfm": "^4.0.1", - "semver": "^7.7.3", + "semver": "^7.7.4", "shell-env": "^4.0.3", "shell-quote": "^1.8.3", "shiki": "^3.21.0", @@ -773,6 +774,7 @@ "@octokit/rest": "^22.0.1", "@superset/chat": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/workspace-fs": "workspace:*", @@ -785,6 +787,7 @@ "mastracode": "0.15.0-alpha.3", "mime-types": "^3.0.2", "node-pty": "1.1.0", + "semver": "^7.7.4", "simple-git": "^3.30.0", "superjson": "^2.2.5", "tree-kill": "^1.2.2", @@ -795,6 +798,7 @@ "@types/better-sqlite3": "^7.6.13", "@types/mime-types": "^3.0.1", "@types/node": "^24.9.1", + "@types/semver": "^7.7.1", "bun-types": "^1.3.1", "drizzle-kit": "0.31.8", "typescript": "^5.9.3", @@ -894,9 +898,25 @@ "typescript": "^5.9.3", }, }, + "packages/pty-daemon": { + "name": "@superset/pty-daemon", + "version": "0.1.0", + "bin": { + "pty-daemon": 
"./src/main.ts", + }, + "dependencies": { + "node-pty": "1.1.0", + }, + "devDependencies": { + "@superset/typescript": "workspace:*", + "@types/node": "^24.9.1", + "bun-types": "^1.3.1", + "typescript": "^5.9.3", + }, + }, "packages/sdk": { "name": "@superset/sdk", - "version": "0.0.1-alpha.0", + "version": "0.0.1-alpha.7", "devDependencies": { "@superset/typescript": "workspace:*", "bun-types": "^1.3.1", @@ -2610,6 +2630,8 @@ "@superset/port-scanner": ["@superset/port-scanner@workspace:packages/port-scanner"], + "@superset/pty-daemon": ["@superset/pty-daemon@workspace:packages/pty-daemon"], + "@superset/relay": ["@superset/relay@workspace:apps/relay"], "@superset/sdk": ["@superset/sdk@workspace:packages/sdk"], diff --git a/packages/host-service/DAEMON_SUPERVISION.md b/packages/host-service/DAEMON_SUPERVISION.md new file mode 100644 index 00000000000..f23164bde6a --- /dev/null +++ b/packages/host-service/DAEMON_SUPERVISION.md @@ -0,0 +1,216 @@ +# Daemon Supervision + +Host-service owns the lifecycle of `@superset/pty-daemon` — the long-lived +PTY process. Supervision lives here (not in the desktop app) so +host-service can be deployed independently of Electron. The daemon +outlives host-service crashes via detached spawn + manifest adoption. + +## Where it lives + +- **Supervisor**: `src/daemon/DaemonSupervisor.ts` — spawn / adopt / + restart / crash-circuit. One supervisor per host-service process, + managing one daemon (per the org host-service was started for). +- **Singleton + bootstrap**: `src/daemon/singleton.ts` — process-level + cache + `startDaemonBootstrap` / `waitForDaemonReady` for the boot + pattern below. +- **Manifest**: `src/daemon/manifest.ts` — `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon-manifest.json`. + Read by `tryAdopt` on startup to find a still-running daemon from a + previous host-service incarnation. 
+- **Expected version**: `src/daemon/expected-version.ts` — hand-edited + `EXPECTED_DAEMON_VERSION`, kept in lockstep with + `packages/pty-daemon/package.json#version`. Drives the + "update available, restart terminals" UX. +- **Renderer surface**: `terminal.daemon.{getUpdateStatus, listSessions, restart}` + on the host-service tRPC. + +## Boot pattern (fire-and-track) + +`serve.ts` calls `startDaemonBootstrap(env.ORGANIZATION_ID)` during +startup but does **not** await it. tRPC accepts connections immediately; +non-terminal ops (workspaces, git, chat) work without waiting for the +daemon. Terminal request handlers `await waitForDaemonReady(orgId)` +before using the supervisor's socket path, so an in-flight bootstrap +doesn't race with the first terminal launch. + +## Detached spawn + adoption + +The daemon is spawned `detached: true` so it survives host-service +exit. On next host-service start, `tryAdopt` reads the manifest, checks +the PID is alive and the socket is reachable, and reuses the running +daemon. PTY sessions therefore survive host-service restarts. + +The socket path lives in `os.tmpdir()/superset-ptyd-<id>.sock` +— short enough to fit Darwin's 104-byte `sun_path` limit. Owner-only +file mode (0600) is the auth boundary. + +### Adopted-daemon liveness check + +`child.on("exit")` only fires for daemons we *spawned* — adopted +daemons (PIDs from a manifest) have no child handle. Without a +liveness check, the supervisor's `instances` map carries a stale +entry forever when an adopted daemon dies externally (kill -9, OOM): +`getSocketPath` returns a socket nobody's listening on, terminal ops +fail with ECONNREFUSED until something forces a restart. + +We poll `process.kill(pid, 0)` every 2s for adopted PIDs +(`ADOPTED_LIVENESS_INTERVAL_MS`). On detected death we clear the +instance + manifest so the next `ensure()` respawns. Spawned daemons +keep using the cheaper `child.on("exit")` path.
+ +## Version detection + +On adoption, `probeDaemonVersion` does a one-shot `hello`/`hello-ack` to +read the running daemon's `daemonVersion`, compares against +`EXPECTED_DAEMON_VERSION` via `semver.satisfies(>=)`. Mismatch sets +`updatePending: true` on the instance — the renderer surfaces a +"restart to update" affordance. We do **not** auto-kill on mismatch +because PTY sessions live in the daemon; the user opts in via Restart. + +Probe failure ≠ stale: a transient socket issue produces +`runningVersion: "unknown", updatePending: false` rather than a +false-positive update flag. + +## Crash circuit breaker + +Auto-respawn unexpected exits, but only up to `CRASH_BUDGET = 3` within +`CRASH_WINDOW_MS = 60_000`. Past that, the circuit opens and `ensure` +fails fast with a clear error until something calls +`clearCrashCircuit(orgId)` — which the user-triggered `restart()` +implicitly does, so the user can always recover. + +## User-triggered restart + +`restart(orgId)` awaits any in-flight pending spawn, calls `stop`, +clears the crash circuit, logs `pty_daemon_user_restart`, then `ensure`s +fresh. Sessions die in the gap — that's the cost the user accepted via +the confirmation dialog. + +### Default close signal: SIGHUP, not SIGTERM + +The kill chain (`DaemonClient.close`, daemon `handleClose`, +`DaemonPty.kill`) defaults to **SIGHUP**, not SIGTERM. Interactive +shells — especially `zsh -l`, the default macOS login shell — trap +SIGTERM and stay alive. SIGTERM defaults silently leaked PTY processes +on every closed pane until the daemon was respawned. SIGHUP is what +the kernel sends when a real TTY closes, and shells honor it. + +Explicit `SIGKILL` still passes through for hung shells (e.g. the +"force kill" path). + +## Session deletion on PTY exit + +The daemon's `Server.onExit` handler deletes the session row from +the store immediately after fanning out the exit event. 
**Late +subscribers that connect after exit get ENOENT**, not the buffered +output and exit event. + +Tradeoff: a host-service that restarts during the small window when +a shell is exiting will not be able to fetch the final output via +`subscribe(replay: true)` — the renderer falls back to a generic +"session unavailable" footer instead of "Process exited with code N". +Without this delete, every closed terminal pane left a row in the +store forever (every "Show sessions" entry would have been an Exited +zombie). + +## Dev-mode log piping + +In dev (`NODE_ENV !== "production"`), both host-service and +pty-daemon stdio is **piped through to the parent process** with +per-line prefixes: + +- `[hs:<8-char-orgId>] ...` — host-service stdout in `bun dev` +- `[ptyd:<8-char-orgId>] ...` — daemon stdout, fanned through host-service + +Production stdio backs to per-org rotating log files +(`$SUPERSET_HOME_DIR/host/{orgId}/{host-service,pty-daemon}.log`) +because the detached children must outlive parent teardown. + +The `pipeWithPrefix` helper splits incoming chunks on `\n` so +multi-line bursts keep the prefix on every line. + +## Telemetry + +The supervisor emits structured `console.log` lines with +`{ component: "pty-daemon-supervisor", event, ...props }`. Events: +`pty_daemon_spawn`, `pty_daemon_adopt`, `pty_daemon_user_restart`, +`pty_daemon_update_pending`, `pty_daemon_crash`, +`pty_daemon_circuit_open`, `pty_daemon_spawn_failed`. No PostHog +plumbing on host-service yet — promote to real telemetry when the path +is needed. + +## Tests + +- `src/daemon/DaemonSupervisor.test.ts` — probe edge cases, debounce + semantics, restart race-await + circuit clear. +- `src/daemon/DaemonSupervisor.node-test.ts` — real-spawn integration: + fresh spawn, cross-instance adoption, version drift via env override, + user-restart kills + respawns, auto-respawn after SIGKILL, **adopted + daemon dies externally → supervisor detects and respawns**. 
+- `src/daemon/singleton.test.ts` — fire-and-track bootstrap, idempotent + startDaemonBootstrap, retryable failure path. +- `src/trpc/router/terminal/terminal.daemon.test.ts` — tRPC procedure + wiring (UNAUTHORIZED gating, getUpdateStatus delegation, listSessions + awaits bootstrap, restart wiring). +- `src/no-electron-coupling.test.ts` — asserts host-service source has + zero Electron imports/globals/APIs (substitute for a real headless + smoke test until native-addon distribution is solved). +- Daemon wire protocol coverage lives in `packages/pty-daemon/test/` + (handshake, adoption, SIGKILL recovery, **default-close terminates + an interactive login shell** — SIGHUP regression test). + +## Test escape hatch + +Setting the `SUPERSET_PTY_DAEMON_SOCKET` env var bypasses the supervisor in +`daemon-client-singleton.ts` and connects directly to the given socket. +Used by `terminal.adoption.node-test.ts` to test host-service against an +in-process Server instance. Production paths leave this env var unset. + +## Extension points + +Adding a daemon op the renderer needs: + +1. Add a method on `DaemonSupervisor` (or use `getDaemonClient()` from + `terminal/daemon-client-singleton.ts` if it's a wire-protocol op). +2. Expose via `terminal.daemon` in `src/trpc/router/terminal/terminal.ts`. +3. Call from the renderer via `workspaceTrpc.terminal.daemon.*`. + +Bumping the daemon version: edit `EXPECTED_DAEMON_VERSION` in +`expected-version.ts` to match the new `packages/pty-daemon/package.json#version`. +The supervisor's adoption probe will surface the "update available" flag +on existing installs until they restart. + +Shipping a host-service change that requires the desktop coordinator +to refuse adopting older binaries: bump `HOST_SERVICE_VERSION` +in `src/trpc/router/host/host.ts` and `MIN_HOST_SERVICE_VERSION` in +`apps/desktop/src/main/lib/host-service-coordinator.ts` together.
+The coordinator's `tryAdopt` does a `semver.satisfies(>=)` check and +SIGTERMs+respawns anything older. + +## Phase 2 deferred — daemon upgrades currently kill sessions + +The original Architecture E plan called for **daemon-upgrade fd-handoff** +so even daemon-binary changes preserve PTYs. Phase 0 (the Go and +node-pty harnesses in the design-doc branch) proved the primitive +works. **Phase 2 is not built in this codebase yet.** + +Today: clicking "Restart and update" in Settings → Manage daemon +SIGTERMs the running daemon and spawns the new bundle. All sessions +die in the gap. The confirmation dialog tells the user this. + +When Phase 2 lands: the supervisor will spawn the new daemon with +existing PTY master FDs in its `stdio` array (kernel-level dup, +refcount preserved across the swap). New daemon adopts the FDs, +takes over the socket, old daemon exits without closing them. +Sessions survive the upgrade. + +Hooks already in place that Phase 2 will use: +- Adopted-liveness check (it'll detect the old daemon's exit at + the supervisor level if anything goes wrong mid-handoff). +- Manifest-based daemon discovery (the supervisor's current + `tryAdopt` is what Phase 2's "fall back if handoff fails" path + reuses). +- Existing wire protocol (we'd add an `upgrade` message; the + protocol is versioned). + +See `apps/desktop/plans/20260430-pty-daemon-host-service-migration.md` +in the design-doc branch for the migration journey and Phase 2 sketch. 
diff --git a/packages/host-service/bunfig.toml b/packages/host-service/bunfig.toml new file mode 100644 index 00000000000..a22eda2e883 --- /dev/null +++ b/packages/host-service/bunfig.toml @@ -0,0 +1,2 @@ +[test] +preload = ["./test/setup-env.ts"] diff --git a/packages/host-service/package.json b/packages/host-service/package.json index 64fe9a2c348..79ae60d18ec 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -48,6 +48,8 @@ "generate": "drizzle-kit generate", "test": "bun test --pass-with-no-tests", "test:integration": "bun test --pass-with-no-tests test/integration", + "test:integration:daemon": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts src/daemon/DaemonSupervisor.node-test.ts", + "test:e2e": "bun run scripts/test-e2e.ts", "typecheck": "tsc --noEmit --emitDeclarationOnly false" }, "dependencies": { @@ -58,6 +60,7 @@ "@octokit/rest": "^22.0.1", "@superset/chat": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/workspace-fs": "workspace:*", @@ -70,6 +73,7 @@ "mastracode": "0.15.0-alpha.3", "mime-types": "^3.0.2", "node-pty": "1.1.0", + "semver": "^7.7.4", "simple-git": "^3.30.0", "superjson": "^2.2.5", "tree-kill": "^1.2.2", @@ -80,6 +84,7 @@ "@types/better-sqlite3": "^7.6.13", "@types/mime-types": "^3.0.1", "@types/node": "^24.9.1", + "@types/semver": "^7.7.1", "bun-types": "^1.3.1", "drizzle-kit": "0.31.8", "typescript": "^5.9.3" diff --git a/packages/host-service/scripts/test-e2e.ts b/packages/host-service/scripts/test-e2e.ts new file mode 100644 index 00000000000..7275d0367ce --- /dev/null +++ b/packages/host-service/scripts/test-e2e.ts @@ -0,0 +1,59 @@ +// Runs the host-service end-to-end adoption test under Electron-as-Node. 
+// +// Why Electron and not raw `node`: host-service uses better-sqlite3, whose +// native module is compiled against the Electron bundled Node ABI for +// production. Running the test under Electron-as-Node ensures the same +// native-module ABI as production. Raw `node` would fail with +// NODE_MODULE_VERSION mismatch. +// +// Usage: bun run test:e2e + +import * as childProcess from "node:child_process"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(__dirname, "../../.."); + +// Resolve the Electron binary from the workspace's node_modules. Bun's flat +// .bun/<pkg>@<version>/node_modules/<pkg> layout makes this a glob. +function findElectronBinary(): string { + const candidates = childProcess + .execSync("find . -path '*/electron/dist/*.app/Contents/MacOS/Electron'", { + cwd: repoRoot, + encoding: "utf-8", + }) + .split("\n") + .filter(Boolean); + const first = candidates[0]; + if (!first) { + throw new Error( + "Electron binary not found. Run `bun install` from the repo root first.", + ); + } + return path.join(repoRoot, first); +} + +const electronBin = findElectronBinary(); +const testFile = path.resolve( + __dirname, + "..", + "src/terminal/terminal.adoption.node-test.ts", +); + +if (!fs.existsSync(testFile)) { + console.error(`Test file missing: ${testFile}`); + process.exit(1); +} + +const result = childProcess.spawnSync( + electronBin, + ["--experimental-strip-types", "--test", "--test-reporter=spec", testFile], + { + stdio: "inherit", + env: { ...process.env, ELECTRON_RUN_AS_NODE: "1" }, + }, +); + +process.exit(result.status ??
1); diff --git a/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts new file mode 100644 index 00000000000..9e954e541a3 --- /dev/null +++ b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts @@ -0,0 +1,316 @@ +// Real-spawn integration tests for DaemonSupervisor. +// Runs under Node (`node --experimental-strip-types --test`) because the +// supervisor uses `process.execPath` to spawn the daemon, and the daemon +// imports node-pty (a native addon that needs Node ABI). +// +// Unit-level coverage for the same surface lives in DaemonSupervisor.test.ts +// (under bun test). These integration tests catch process-lifecycle bugs +// that mocks don't (PID liveness, manifest IO across supervisor instances, +// real socket connectivity). + +import { strict as assert } from "node:assert"; +import * as childProcess from "node:child_process"; +import * as crypto from "node:crypto"; +import * as fs from "node:fs"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { afterEach, beforeEach, describe, test } from "node:test"; +import { fileURLToPath } from "node:url"; +import { DaemonSupervisor } from "./DaemonSupervisor.ts"; +import { + type PtyDaemonManifest, + ptyDaemonManifestDir, + writePtyDaemonManifest, +} from "./manifest.ts"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +// packages/host-service/src/daemon → packages/pty-daemon/dist/pty-daemon.js +const DAEMON_BUNDLE = path.resolve( + __dirname, + "../../../pty-daemon/dist/pty-daemon.js", +); + +if (!fs.existsSync(DAEMON_BUNDLE)) { + throw new Error( + `Daemon bundle missing at ${DAEMON_BUNDLE}. 
Run \`bun run build:daemon\` in packages/pty-daemon first.`, + ); +} + +let tmpHome: string; +let originalHome: string | undefined; +const supervisorsToCleanup: { sup: DaemonSupervisor; orgId: string }[] = []; + +beforeEach(() => { + tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), "pty-daemon-it-")); + originalHome = process.env.SUPERSET_HOME_DIR; + process.env.SUPERSET_HOME_DIR = tmpHome; +}); + +afterEach(async () => { + // Detached daemons survive the test process by design — kill any we + // spawned so they don't leak across test runs. + for (const { sup, orgId } of supervisorsToCleanup.splice(0)) { + try { + await sup.stop(orgId); + } catch { + // best-effort + } + } + if (originalHome !== undefined) { + process.env.SUPERSET_HOME_DIR = originalHome; + } else { + delete process.env.SUPERSET_HOME_DIR; + } + try { + fs.rmSync(tmpHome, { recursive: true, force: true }); + } catch { + // best-effort + } +}); + +describe("DaemonSupervisor.ensure (real spawn)", () => { + test("spawns a fresh daemon and reports running == expected", async () => { + const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup, orgId: "org-spawn" }); + const inst = await sup.ensure("org-spawn"); + assert.ok(inst.pid > 0, "expected a positive pid"); + assert.equal(inst.runningVersion, inst.expectedVersion); + assert.equal(inst.updatePending, false); + assert.equal(await isReachable(inst.socketPath), true); + }); + + test("adopts a running daemon across supervisor instances", async () => { + const supA = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + const a = await supA.ensure("org-adopt"); + assert.ok(a.pid > 0); + + // Track the daemon for cleanup; we'll stop via supervisor B since + // that's the live owner by the end of the test. + try { + // Supervisor B simulates a host-service restart — fresh state, + // but the manifest + running daemon are still on disk/live. 
+ const supB = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup: supB, orgId: "org-adopt" }); + const b = await supB.ensure("org-adopt"); + assert.equal(b.pid, a.pid, "B should adopt A's daemon"); + assert.equal(b.socketPath, a.socketPath); + assert.equal(b.runningVersion, a.expectedVersion); + assert.equal(b.updatePending, false); + } catch (err) { + // On failure, make sure A still cleans up. + await supA.stop("org-adopt").catch(() => {}); + throw err; + } + }); + + test("flags updatePending when running daemon is older than expected", async () => { + // We spawn the daemon DIRECTLY (not via supervisor.ensure), pinning + // its version to "0.0.1" via env. Then we write the manifest and + // hand the supervisor a fresh instance that adopts via tryAdopt. + // Going through supervisor.ensure for the spawn would inject + // EXPECTED_DAEMON_VERSION (0.1.0) into childEnv, defeating the + // older-version setup. + const orgId = "org-stale"; + const socketPath = path.join( + os.tmpdir(), + `superset-ptyd-${crypto + .createHash("sha256") + .update(orgId) + .digest("hex") + .slice(0, 12)}.sock`, + ); + // Clean up any leftover socket from prior runs. + try { + fs.unlinkSync(socketPath); + } catch {} + + const child = childProcess.spawn( + process.execPath, + [DAEMON_BUNDLE, `--socket=${socketPath}`], + { + detached: true, + stdio: "ignore", + env: { ...process.env, SUPERSET_PTY_DAEMON_VERSION: "0.0.1" }, + }, + ); + child.unref(); + // Wait for the socket to come up. + const ready = await waitForSocket(socketPath, 5000); + assert.equal(ready, true, "daemon socket did not become ready"); + + try { + // Write the manifest the supervisor needs to find the daemon. 
test("auto-respawns after the running daemon dies unexpectedly", async () => {
  // SIGKILL the running daemon, wait for the supervisor's on-exit
  // handler to fire, and verify a new daemon comes up. Crash-budget
  // behavior past this point is covered by the unit tests in
  // DaemonSupervisor.test.ts (mocked stop/ensure for determinism —
  // killing 4 daemons in a row from this test would race with the
  // auto-respawn loop).
  const orgId = "org-respawn";
  const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE });
  supervisorsToCleanup.push({ sup, orgId });
  const original = await sup.ensure(orgId);
  const originalPid = original.pid;

  // `instances` is private; peek at it through a structural cast so the
  // test can observe the respawn without widening the supervisor's API.
  const currentInstance = (): { pid: number } | undefined =>
    (
      sup as unknown as { instances: Map<string, { pid: number }> }
    ).instances.get(orgId);

  process.kill(originalPid, "SIGKILL");

  // Poll until the supervisor registers an instance with a different pid,
  // or give up after 8s and let the assertions below report the failure.
  const deadline = Date.now() + 8000;
  while (Date.now() < deadline) {
    const inst = currentInstance();
    if (inst && inst.pid !== originalPid) break;
    await new Promise((r) => setTimeout(r, 100));
  }

  const after = currentInstance();
  assert.ok(after, "expected a respawned instance");
  assert.notEqual(after.pid, originalPid);
});
/**
 * Report whether `pid` names a live process.
 *
 * Sends signal 0, which performs the existence/permission check without
 * delivering a signal. `EPERM` means the process exists but belongs to
 * someone else, so it still counts as alive; any other failure is treated
 * as "not running".
 */
function isAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return (err as NodeJS.ErrnoException).code === "EPERM";
  }
}

/**
 * Try to connect to a Unix socket once.
 *
 * @returns `true` if the connection is accepted, `false` on a connection
 *   error or if nothing happens within 500 ms. Never rejects.
 */
function isReachable(socketPath: string): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    const sock = net.createConnection({ path: socketPath });
    const timer = setTimeout(() => {
      sock.destroy();
      resolve(false);
    }, 500);
    sock.once("connect", () => {
      clearTimeout(timer);
      sock.end();
      resolve(true);
    });
    sock.once("error", () => {
      clearTimeout(timer);
      // Release the handle explicitly rather than relying on auto-destroy.
      sock.destroy();
      resolve(false);
    });
  });
}

/**
 * Poll every 50 ms until `socketPath` exists and accepts a connection.
 *
 * @returns `true` as soon as the socket is reachable, `false` once
 *   `timeoutMs` has elapsed without a successful connection.
 */
async function waitForSocket(
  socketPath: string,
  timeoutMs: number,
): Promise<boolean> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    // Cheap existence check first so we don't open a connection per tick
    // while the daemon has not even created the socket file yet.
    if (fs.existsSync(socketPath) && (await isReachable(socketPath))) {
      return true;
    }
    await new Promise((r) => setTimeout(r, 50));
  }
  return false;
}
b/packages/host-service/src/daemon/DaemonSupervisor.test.ts @@ -0,0 +1,450 @@ +// Tests for the DaemonSupervisor: +// - probeDaemonVersion (one-shot hello/hello-ack against an in-process +// fake daemon — exercises the *real* probe code, not a parallel impl) +// - update-pending event debouncing on adoption +// - getUpdateStatus semantics +// - restart() race-await + circuit-clear semantics +// +// Telemetry events are emitted as structured `console.log` lines (per the +// host-service-migration plan, decision D2). We spy on console.log and +// filter for our component prefix. + +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { + type ClientMessage, + encodeFrame, + FrameDecoder, +} from "@superset/pty-daemon/protocol"; +import { DaemonSupervisor, probeDaemonVersion } from "./DaemonSupervisor.ts"; + +// Capture supervisor-emitted log events. We replace console.log for the +// duration of the test, then filter for our supervisor's component prefix. +const loggedEvents: { event: string; props: Record }[] = []; +const realConsoleLog = console.log; + +beforeEach(() => { + loggedEvents.length = 0; + console.log = (...args: unknown[]) => { + // Try to parse the first arg as JSON — supervisor logs in JSON; + // non-JSON lines (e.g. plain "[pty-daemon:...] adopted ...") fall + // through silently. 
/** Knobs selecting how the fake daemon answers a client `hello`. */
interface FakeDaemonOptions {
  /** Reply with a well-formed `hello-ack` carrying this daemon version. */
  respondWithVersion?: string;
  /** Reply with these bytes verbatim (used to send garbage). */
  respondRaw?: Buffer;
  /** Close the connection instead of replying. */
  hangUpAfterHello?: boolean;
  /** Reply with an `error` frame instead of `hello-ack`. */
  respondWithWrongMessageFirst?: boolean;
  /** Accept the `hello` and never reply. */
  silent?: boolean;
}

/**
 * Start an in-process Unix-socket server that imitates the pty-daemon
 * handshake, so the real probe code can be exercised against it.
 *
 * Option precedence when a `hello` arrives: `silent`, then
 * `hangUpAfterHello`, `respondRaw`, `respondWithWrongMessageFirst`,
 * `respondWithVersion`. Non-`hello` frames are ignored.
 *
 * @returns The socket path to connect to and a `close()` that resolves once
 *   the server has shut down.
 * @throws Rejects if the server cannot start listening, instead of leaving
 *   the startup promise pending forever.
 */
async function startFakeDaemon(opts: FakeDaemonOptions): Promise<{
  socketPath: string;
  close: () => Promise<void>;
}> {
  const socketPath = path.join(
    os.tmpdir(),
    `fake-pty-daemon-${process.pid}-${Math.random().toString(36).slice(2, 8)}.sock`,
  );
  const server = net.createServer((sock) => {
    const decoder = new FrameDecoder();
    sock.on("data", (chunk: Buffer) => {
      decoder.push(chunk);
      for (const raw of decoder.drain()) {
        const msg = raw as ClientMessage;
        if (msg.type !== "hello") continue;
        if (opts.silent) return;
        if (opts.hangUpAfterHello) {
          sock.end();
          return;
        }
        if (opts.respondRaw) {
          sock.write(opts.respondRaw);
          return;
        }
        if (opts.respondWithWrongMessageFirst) {
          sock.write(
            encodeFrame({
              type: "error",
              code: "EBOGUS",
              message: "test",
            }),
          );
          return;
        }
        if (opts.respondWithVersion) {
          sock.write(
            encodeFrame({
              type: "hello-ack",
              protocol: 1,
              daemonVersion: opts.respondWithVersion,
            }),
          );
          return;
        }
      }
    });
    // Clients under test may disconnect abruptly; don't crash the fake.
    sock.on("error", () => {});
  });
  await new Promise<void>((resolve, reject) => {
    // A failed listen() emits "error" and never invokes the callback.
    server.once("error", reject);
    server.listen(socketPath, () => {
      server.off("error", reject);
      resolve();
    });
  });
  return {
    socketPath,
    close: () =>
      new Promise<void>((resolve) => {
        server.close(() => resolve());
      }),
  };
}
await startFakeDaemon({ respondWithVersion: "0.1.0" }); + try { + expect(await probeDaemonVersion(fake.socketPath, 1500)).toBe("0.1.0"); + } finally { + await fake.close(); + } + }); + + test("returns null when there is no listener on the socket path", async () => { + const dead = path.join( + os.tmpdir(), + `nonexistent-${process.pid}-${Math.random().toString(36).slice(2, 8)}.sock`, + ); + expect(await probeDaemonVersion(dead, 500)).toBeNull(); + }); + + test("returns null on probe timeout (silent daemon)", async () => { + const fake = await startFakeDaemon({ silent: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 200)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null when daemon hangs up before hello-ack", async () => { + const fake = await startFakeDaemon({ hangUpAfterHello: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 1500)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null on malformed/garbage response", async () => { + const fake = await startFakeDaemon({ + respondRaw: Buffer.from([0x00, 0xff, 0xab, 0xcd]), + }); + try { + expect(await probeDaemonVersion(fake.socketPath, 800)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null when daemon sends a non-hello-ack message first", async () => { + const fake = await startFakeDaemon({ respondWithWrongMessageFirst: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 800)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("does not leak sockets across many invocations", async () => { + const fake = await startFakeDaemon({ respondWithVersion: "0.1.0" }); + try { + for (let i = 0; i < 50; i++) { + expect(await probeDaemonVersion(fake.socketPath, 1000)).toBe("0.1.0"); + } + } finally { + await fake.close(); + } + }); +}); + +describe("DaemonSupervisor.getUpdateStatus", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new 
DaemonSupervisor({ scriptPath: "/nonexistent" }); + }); + + test("returns null when no instance is registered", () => { + expect(sup.getUpdateStatus("org-no-such")).toBeNull(); + }); + + test("reflects updatePending=false for fresh-spawned instances", () => { + seedInstance(sup, "org-fresh", { + runningVersion: "0.1.0", + expectedVersion: "0.1.0", + updatePending: false, + }); + expect(sup.getUpdateStatus("org-fresh")).toEqual({ + pending: false, + running: "0.1.0", + expected: "0.1.0", + }); + }); + + test("reflects updatePending=true for stale-adopted instances", () => { + seedInstance(sup, "org-stale", { + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + updatePending: true, + }); + expect(sup.getUpdateStatus("org-stale")).toEqual({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + }); + }); + + test("'unknown' running version surfaces but is never pending", () => { + seedInstance(sup, "org-probe-failed", { + runningVersion: "unknown", + expectedVersion: "0.1.0", + updatePending: false, + }); + const status = sup.getUpdateStatus("org-probe-failed"); + expect(status?.pending).toBe(false); + expect(status?.running).toBe("unknown"); + }); +}); + +describe("update-pending event debounce", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new DaemonSupervisor({ scriptPath: "/nonexistent" }); + }); + + test("logs once per (running,expected) pair", () => { + const adopted = staleInstance("0.0.9"); + invokeMaybeFire(sup, "org", adopted); + invokeMaybeFire(sup, "org", adopted); + invokeMaybeFire(sup, "org", adopted); + const updateLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_update_pending", + ); + expect(updateLogs).toHaveLength(1); + expect(updateLogs[0]?.props).toMatchObject({ + organizationId: "org", + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + }); + }); + + test("re-fires when the running version changes", () => { + invokeMaybeFire(sup, "org", staleInstance("0.0.8")); + invokeMaybeFire(sup, "org", 
staleInstance("0.0.9")); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); + + test("clears debounce when an instance becomes non-pending", () => { + invokeMaybeFire(sup, "org", staleInstance("0.0.9")); + invokeMaybeFire(sup, "org", freshInstance()); + invokeMaybeFire(sup, "org", staleInstance("0.0.9")); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); + + test("does not fire when updatePending is false", () => { + invokeMaybeFire(sup, "org", freshInstance()); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(0); + }); + + test("debounce is per-organization", () => { + const stale = staleInstance("0.0.9"); + invokeMaybeFire(sup, "org-a", stale); + invokeMaybeFire(sup, "org-b", stale); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); +}); + +describe("DaemonSupervisor.restart", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new DaemonSupervisor({ scriptPath: "/nonexistent" }); + (sup as unknown as { stop: typeof sup.stop }).stop = mock( + async () => {}, + ) as typeof sup.stop; + (sup as unknown as { ensure: typeof sup.ensure }).ensure = mock(async () => + freshInstance(), + ) as typeof sup.ensure; + }); + + test("logs pty_daemon_user_restart with previous-version context", async () => { + seedInstance(sup, "org-restart", { + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + updatePending: true, + }); + await sup.restart("org-restart"); + const restartLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_user_restart", + ); + expect(restartLogs).toHaveLength(1); + expect(restartLogs[0]?.props).toMatchObject({ + organizationId: "org-restart", + previousRunningVersion: "0.0.9", + previousExpectedVersion: "0.1.0", + previousUpdatePending: true, + hadCircuitOpen: false, + }); + }); + + test("clears the crash circuit 
so the user can recover from a tripped breaker", async () => { + (sup as unknown as { circuitOpen: Set }).circuitOpen.add( + "org-tripped", + ); + (sup as unknown as { crashTimes: Map }).crashTimes.set( + "org-tripped", + [1, 2, 3, 4], + ); + + await sup.restart("org-tripped"); + + expect(sup.isCircuitOpen("org-tripped")).toBe(false); + expect( + (sup as unknown as { crashTimes: Map }).crashTimes.get( + "org-tripped", + ), + ).toBeUndefined(); + + const restartLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_user_restart", + ); + expect(restartLogs[0]?.props).toMatchObject({ hadCircuitOpen: true }); + }); + + test("awaits an in-flight pendingStart before stopping", async () => { + let resolvePending: (value: unknown) => void = () => {}; + const pendingPromise = new Promise((resolve) => { + resolvePending = resolve; + }); + ( + sup as unknown as { pendingStarts: Map> } + ).pendingStarts.set("org-racey", pendingPromise); + + const stopMock = (sup as unknown as { stop: ReturnType }).stop; + const restartPromise = sup.restart("org-racey"); + + await new Promise((r) => setTimeout(r, 10)); + expect(stopMock).not.toHaveBeenCalled(); + + resolvePending({}); + await restartPromise; + expect(stopMock).toHaveBeenCalledTimes(1); + }); + + test("falls through cleanly if the pendingStart rejects", async () => { + const failingPending = Promise.reject(new Error("spawn failed")); + failingPending.catch(() => {}); + ( + sup as unknown as { pendingStarts: Map> } + ).pendingStarts.set("org-failed-spawn", failingPending); + + await expect(sup.restart("org-failed-spawn")).resolves.toEqual({ + success: true, + }); + }); + + test("returns success only after ensure resolves", async () => { + const ensureMock = mock(async () => freshInstance()); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + const result = await sup.restart("org-ok"); + expect(result).toEqual({ success: true }); + expect(ensureMock).toHaveBeenCalledTimes(1); + 
// ---------------- helpers ----------------

/** The version-related fields a test chooses when seeding an instance. */
interface SeededFields {
  runningVersion: string;
  expectedVersion: string;
  updatePending: boolean;
}

/**
 * Insert a fake instance record straight into the supervisor's private
 * `instances` map, bypassing spawn/adopt. The pid and socket path are
 * placeholders; only `fields` vary between tests.
 */
function seedInstance(
  sup: DaemonSupervisor,
  organizationId: string,
  fields: SeededFields,
): void {
  const instances = (sup as unknown as { instances: Map<string, unknown> })
    .instances;
  instances.set(organizationId, {
    pid: 9999,
    socketPath: "/tmp/seeded.sock",
    startedAt: Date.now(),
    ...fields,
  });
}

/** Instance record whose running version equals the expected one. */
function freshInstance() {
  return {
    pid: 1234,
    socketPath: "/tmp/fresh.sock",
    startedAt: Date.now(),
    runningVersion: "0.1.0",
    expectedVersion: "0.1.0",
    updatePending: false,
  };
}

/** Instance record reporting `running` as its version, flagged update-pending. */
function staleInstance(running: string) {
  return {
    pid: 1234,
    socketPath: "/tmp/stale.sock",
    startedAt: Date.now(),
    runningVersion: running,
    expectedVersion: "0.1.0",
    updatePending: true,
  };
}

/**
 * Call the supervisor's private `maybeFireUpdatePending` with the given
 * instance record. Accepts the shape produced by `freshInstance()` /
 * `staleInstance()`.
 */
function invokeMaybeFire(
  sup: DaemonSupervisor,
  organizationId: string,
  instance: ReturnType<typeof freshInstance>,
): void {
  (
    sup as unknown as {
      maybeFireUpdatePending: (id: string, inst: typeof instance) => void;
    }
  ).maybeFireUpdatePending(organizationId, instance);
}
+ +import * as childProcess from "node:child_process"; +import { createHash } from "node:crypto"; +import * as fs from "node:fs"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { + CURRENT_PROTOCOL_VERSION, + encodeFrame, + FrameDecoder, + type ServerMessage, + type SessionInfo, +} from "@superset/pty-daemon/protocol"; +import semver from "semver"; +import { EXPECTED_DAEMON_VERSION } from "./expected-version.ts"; +import { MAX_DAEMON_LOG_BYTES, openRotatingLogFd } from "./log-fd.ts"; +import { + isProcessAlive, + type PtyDaemonManifest, + ptyDaemonManifestDir, + readPtyDaemonManifest, + removePtyDaemonManifest, + writePtyDaemonManifest, +} from "./manifest.ts"; + +interface DaemonInstance { + pid: number; + socketPath: string; + startedAt: number; + /** Version reported by the running daemon's hello-ack. "unknown" if probe failed. */ + runningVersion: string; + /** Bundled-binary version we expect — i.e. EXPECTED_DAEMON_VERSION at spawn time. */ + expectedVersion: string; + /** True when running < expected. Probe failure does NOT set this. */ + updatePending: boolean; +} + +const SOCKET_READY_TIMEOUT_MS = 5_000; +const VERSION_PROBE_TIMEOUT_MS = 1_500; + +/** + * Crash supervision parameters. If the daemon for an organization crashes + * more than CRASH_BUDGET times within CRASH_WINDOW_MS, we stop respawning + * and surface a hard error — repeated crashes are a bug, not transient + * recovery. + */ +const CRASH_BUDGET = 3; +const CRASH_WINDOW_MS = 60_000; +/** How often to poll an adopted daemon's PID for liveness. */ +const ADOPTED_LIVENESS_INTERVAL_MS = 2_000; + +/** + * Per-organization socket path. **Must stay short** — Darwin's `sun_path` + * is 104 bytes, and `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon.sock` blows + * past that in dev (worktree-relative SUPERSET_HOME_DIR + 36-char UUID). + * + * We put the socket in `os.tmpdir()` with a hash of the org id. 
/**
 * Per-organization socket path. **Must stay short** — Darwin's `sun_path`
 * is 104 bytes, and a path under `$SUPERSET_HOME_DIR/host/{orgId}/` blows
 * past that in dev.
 *
 * The socket therefore lives in `os.tmpdir()` and is named after the first
 * 12 hex characters of the SHA-256 of the organization id, so the same id
 * always maps to the same path. Owner-only file mode (0600, set by the
 * daemon's Server.listen) is the auth boundary; the directory permissions
 * don't matter.
 *
 * @param organizationId Organization whose daemon socket is wanted.
 * @returns Absolute socket path of the form
 *   `<tmpdir>/superset-ptyd-<12 hex chars>.sock`.
 */
function ptyDaemonSocketPath(organizationId: string): string {
  const shortId = createHash("sha256")
    .update(organizationId)
    .digest("hex")
    .slice(0, 12);
  return path.join(os.tmpdir(), `superset-ptyd-${shortId}.sock`);
}

/**
 * Structured log helper. Replaces the desktop's `track(...)` calls — the
 * same event names + props are kept so any future telemetry slice can lift
 * them straight back into PostHog.
 *
 * Emits one JSON line on stdout: `component`, then `event`, then `props`.
 * `component` and `event` are reserved: a same-named key in `props` is
 * dropped rather than allowed to overwrite them, because consumers filter
 * log lines on those two fields.
 *
 * @param event Event name, e.g. `pty_daemon_spawn`.
 * @param props Additional JSON-serializable fields for the event.
 */
function logEvent(event: string, props: Record<string, unknown>): void {
  const record: Record<string, unknown> = {
    component: "pty-daemon-supervisor",
    event,
  };
  for (const [key, value] of Object.entries(props)) {
    if (key === "component" || key === "event") continue;
    record[key] = value;
  }
  console.log(JSON.stringify(record));
}
+ */ + private readonly adoptedLivenessTimers = new Map< + string, + ReturnType + >(); + + constructor(opts: DaemonSupervisorOptions) { + this.opts = opts; + } + + /** + * Has the org tripped the crash circuit breaker? Once tripped, ensure() + * fails fast with a clear error until clearCrashCircuit() is called. + */ + isCircuitOpen(organizationId: string): boolean { + return this.circuitOpen.has(organizationId); + } + + /** + * Reset the crash counter and close the circuit. Called from a UI + * "retry" action after surfacing the error to the user. + */ + clearCrashCircuit(organizationId: string): void { + this.circuitOpen.delete(organizationId); + this.crashTimes.delete(organizationId); + } + + /** + * Returns whether the running daemon is older than the bundled binary. + * Null when we have no instance for this org. `running === "unknown"` + * means the version probe failed during adoption — treat as not-pending + * (probe failure ≠ stale). + */ + getUpdateStatus( + organizationId: string, + ): { pending: boolean; running: string; expected: string } | null { + const instance = this.instances.get(organizationId); + if (!instance) return null; + return { + pending: instance.updatePending, + running: instance.runningVersion, + expected: instance.expectedVersion, + }; + } + + /** + * Explicitly restart the daemon for an org — kills sessions, spawns + * fresh. The user has opted in via UI confirmation. Distinct from + * crash-respawn: clears the crash circuit (if open) and emits its own + * event so logs can separate intent from recovery. + * + * Awaits any in-flight spawn before stopping so we never SIGTERM a + * partially-initialized child. 
+ */ + async restart(organizationId: string): Promise<{ success: true }> { + const prev = this.instances.get(organizationId); + const hadCircuitOpen = this.circuitOpen.has(organizationId); + + const pending = this.pendingStarts.get(organizationId); + if (pending) { + try { + await pending; + } catch { + // Failed in-flight spawn — nothing to stop, ensure() will retry. + } + } + + await this.stop(organizationId); + this.clearCrashCircuit(organizationId); + + logEvent("pty_daemon_user_restart", { + organizationId, + hadCircuitOpen, + previousRunningVersion: prev?.runningVersion ?? null, + previousExpectedVersion: prev?.expectedVersion ?? null, + previousUpdatePending: prev?.updatePending ?? null, + }); + + await this.ensure(organizationId); + return { success: true }; + } + + /** + * Spawn the daemon if not already running for this organization, or + * adopt the running one. Returns the instance metadata. + */ + async ensure(organizationId: string): Promise { + if (this.circuitOpen.has(organizationId)) { + throw new Error( + `[pty-daemon:${organizationId}] crash circuit open: ${CRASH_BUDGET} crashes within ${CRASH_WINDOW_MS / 1000}s. Restart the host-service to retry.`, + ); + } + const existing = this.instances.get(organizationId); + if (existing) return existing; + const pending = this.pendingStarts.get(organizationId); + if (pending) return pending; + + const startPromise = this.start(organizationId).finally(() => { + this.pendingStarts.delete(organizationId); + }); + this.pendingStarts.set(organizationId, startPromise); + return startPromise; + } + + getSocketPath(organizationId: string): string | null { + return this.instances.get(organizationId)?.socketPath ?? null; + } + + /** + * Live session list from the running daemon. Null when there is no + * daemon for the org, the socket is unreachable, or the request times + * out — the caller treats null as "unknown" (distinct from `[]` which + * means "daemon up, no sessions"). 
+ */ + async listSessions( + organizationId: string, + timeoutMs = 1500, + ): Promise { + const socketPath = this.getSocketPath(organizationId); + if (!socketPath) return null; + return listDaemonSessions(socketPath, timeoutMs); + } + + async stop(organizationId: string): Promise { + const instance = this.instances.get(organizationId); + this.instances.delete(organizationId); + this.stopAdoptedLivenessCheck(organizationId); + if (!instance) return; + this.stopping.add(organizationId); + try { + process.kill(instance.pid, "SIGTERM"); + } catch { + // Already dead. + } + removePtyDaemonManifest(organizationId); + } + + /** + * Poll an adopted daemon's liveness. Adopted daemons are PIDs we + * inherited via the manifest — we never spawned them as a child, so + * `child.on("exit")` doesn't fire when they die. Without this poller + * the supervisor's `instances` map carries a stale entry forever: + * `getSocketPath` returns a socket nobody's listening on, terminal + * ops fail with "ECONNREFUSED" until something forces a restart. + * + * On detected death: clear the instance + manifest so the next + * `ensure()` call respawns. 
+ */ + private startAdoptedLivenessCheck(organizationId: string, pid: number): void { + this.stopAdoptedLivenessCheck(organizationId); + const timer = setInterval(() => { + if (isProcessAlive(pid)) return; + console.log( + `[pty-daemon:${organizationId}] adopted process ${pid} died — clearing instance for next-ensure respawn`, + ); + this.stopAdoptedLivenessCheck(organizationId); + const current = this.instances.get(organizationId); + if (current?.pid === pid) { + this.instances.delete(organizationId); + removePtyDaemonManifest(organizationId); + } + }, ADOPTED_LIVENESS_INTERVAL_MS); + this.adoptedLivenessTimers.set(organizationId, timer); + } + + private stopAdoptedLivenessCheck(organizationId: string): void { + const timer = this.adoptedLivenessTimers.get(organizationId); + if (timer) { + clearInterval(timer); + this.adoptedLivenessTimers.delete(organizationId); + } + } + + private async start(organizationId: string): Promise { + const adopted = await this.tryAdopt(organizationId); + if (adopted) { + this.instances.set(organizationId, adopted); + console.log( + `[pty-daemon:${organizationId}] adopted existing daemon pid=${adopted.pid} runningVersion=${adopted.runningVersion} updatePending=${adopted.updatePending}`, + ); + logEvent("pty_daemon_adopt", { + organizationId, + pid: adopted.pid, + ageSeconds: Math.round((Date.now() - adopted.startedAt) / 1000), + runningVersion: adopted.runningVersion, + expectedVersion: adopted.expectedVersion, + updatePending: adopted.updatePending, + }); + this.maybeFireUpdatePending(organizationId, adopted); + this.startAdoptedLivenessCheck(organizationId, adopted.pid); + return adopted; + } + + const instance = await this.spawn(organizationId); + logEvent("pty_daemon_spawn", { + organizationId, + pid: instance.pid, + socketPath: instance.socketPath, + daemonVersion: instance.runningVersion, + }); + this.lastUpdatePendingPair.delete(organizationId); + return instance; + } + + /** + * Log `pty_daemon_update_pending` once per 
(running, expected) pair so + * adopting the same stale daemon repeatedly doesn't spam logs. + */ + private maybeFireUpdatePending( + organizationId: string, + instance: DaemonInstance, + ): void { + if (!instance.updatePending) { + this.lastUpdatePendingPair.delete(organizationId); + return; + } + const pair = `${instance.runningVersion}:${instance.expectedVersion}`; + if (this.lastUpdatePendingPair.get(organizationId) === pair) return; + this.lastUpdatePendingPair.set(organizationId, pair); + logEvent("pty_daemon_update_pending", { + organizationId, + runningVersion: instance.runningVersion, + expectedVersion: instance.expectedVersion, + }); + } + + private async tryAdopt( + organizationId: string, + ): Promise { + const manifest = readPtyDaemonManifest(organizationId); + if (!manifest) return null; + if (!isProcessAlive(manifest.pid)) { + removePtyDaemonManifest(organizationId); + return null; + } + const reachable = await isSocketConnectable(manifest.socketPath, 1000); + if (!reachable) { + // PID alive but socket gone — daemon is wedged. Kill and respawn. + try { + process.kill(manifest.pid, "SIGTERM"); + } catch { + // Already dead. + } + removePtyDaemonManifest(organizationId); + return null; + } + + const probed = await probeDaemonVersion( + manifest.socketPath, + VERSION_PROBE_TIMEOUT_MS, + ); + const runningVersion = probed ?? 
"unknown"; + const updatePending = + !!probed && !semver.satisfies(probed, `>=${EXPECTED_DAEMON_VERSION}`); + + return { + pid: manifest.pid, + socketPath: manifest.socketPath, + startedAt: manifest.startedAt, + runningVersion, + expectedVersion: EXPECTED_DAEMON_VERSION, + updatePending, + }; + } + + private async spawn(organizationId: string): Promise { + const dir = ptyDaemonManifestDir(organizationId); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); + } + const socketPath = ptyDaemonSocketPath(organizationId); + const logPath = path.join(dir, "pty-daemon.log"); + + if (!fs.existsSync(this.opts.scriptPath)) { + throw new Error( + `[pty-daemon:${organizationId}] script not found at ${this.opts.scriptPath} — has the daemon binary been bundled?`, + ); + } + + // Dev: pipe daemon stdout/stderr through host-service so log lines + // flow up to the developer's `bun dev` terminal. Production: + // hard-back stdio with the rotating log file so the detached + // daemon survives host-service teardown without losing logs. + const isDev = process.env.NODE_ENV !== "production"; + const logFd = isDev ? -1 : openRotatingLogFd(logPath, MAX_DAEMON_LOG_BYTES); + const stdio: childProcess.StdioOptions = isDev + ? ["ignore", "pipe", "pipe"] + : logFd >= 0 + ? ["ignore", logFd, logFd] + : ["ignore", "ignore", "ignore"]; + + const childEnv = { + ...(process.env as Record), + ORGANIZATION_ID: organizationId, + // Source of truth for daemon version. The daemon's main.ts reads + // this and surfaces it in the hello-ack so adoption probes can + // detect drift against EXPECTED_DAEMON_VERSION. 
+ SUPERSET_PTY_DAEMON_VERSION: EXPECTED_DAEMON_VERSION, + }; + + console.log( + `[pty-daemon:${organizationId}] spawning ${this.opts.scriptPath} → ${socketPath} (log: ${logPath})`, + ); + + let child: ReturnType; + try { + child = childProcess.spawn( + process.execPath, + [this.opts.scriptPath, `--socket=${socketPath}`], + { + detached: true, + stdio, + env: childEnv, + windowsHide: true, + }, + ); + } finally { + if (logFd >= 0) { + try { + fs.closeSync(logFd); + } catch { + // best-effort + } + } + } + + const childPid = child.pid; + if (!childPid) { + throw new Error(`[pty-daemon:${organizationId}] failed to spawn`); + } + + // Dev: fan daemon stdout/stderr up to host-service stdout (which + // itself flows up to `bun dev`). Production stdio is backed by the + // rotating log file already (logFd above), so no fan-out needed. + if (isDev && child.stdout && child.stderr) { + const tag = `[ptyd:${organizationId.slice(0, 8)}]`; + pipeWithPrefix(child.stdout, process.stdout, tag); + pipeWithPrefix(child.stderr, process.stderr, tag); + } + + let earlyExitCode: number | null = null; + let earlyExitSignal: NodeJS.Signals | null = null; + child.once("exit", (code, signal) => { + earlyExitCode = code; + earlyExitSignal = signal; + }); + + const ready = await waitForSocket(socketPath, SOCKET_READY_TIMEOUT_MS); + if (!ready) { + try { + child.kill("SIGTERM"); + } catch { + // best-effort + } + let logTail = ""; + try { + const buf = fs.readFileSync(logPath, "utf-8"); + logTail = buf.slice(-2000); + } catch { + logTail = "(no log file written)"; + } + logEvent("pty_daemon_spawn_failed", { + organizationId, + reason: "socket-not-ready", + timeoutMs: SOCKET_READY_TIMEOUT_MS, + earlyExitCode, + earlyExitSignal, + }); + throw new Error( + `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms (childPid=${childPid}, earlyExit=${earlyExitCode ?? earlyExitSignal ?? "still alive"}). 
Log tail:\n${logTail}`, + ); + } + + child.unref(); + child.on("exit", (code) => { + console.log(`[pty-daemon:${organizationId}] exited with code ${code}`); + const current = this.instances.get(organizationId); + if (current?.pid !== childPid) return; + this.instances.delete(organizationId); + removePtyDaemonManifest(organizationId); + + if (this.stopping.has(organizationId)) { + this.stopping.delete(organizationId); + return; + } + + const now = Date.now(); + const recent = (this.crashTimes.get(organizationId) ?? []).filter( + (t) => now - t < CRASH_WINDOW_MS, + ); + recent.push(now); + this.crashTimes.set(organizationId, recent); + + logEvent("pty_daemon_crash", { + organizationId, + exitCode: code, + crashesInWindow: recent.length, + windowSeconds: CRASH_WINDOW_MS / 1000, + ageSeconds: Math.round((now - current.startedAt) / 1000), + }); + + if (recent.length > CRASH_BUDGET) { + this.circuitOpen.add(organizationId); + console.error( + `[pty-daemon:${organizationId}] crash circuit OPEN — ${recent.length} crashes in ${CRASH_WINDOW_MS / 1000}s; refusing further respawns until clearCrashCircuit() is called`, + ); + logEvent("pty_daemon_circuit_open", { + organizationId, + crashesInWindow: recent.length, + }); + return; + } + + console.warn( + `[pty-daemon:${organizationId}] auto-respawning after unexpected exit (${recent.length}/${CRASH_BUDGET} in window)`, + ); + void this.ensure(organizationId).catch((err) => { + console.error( + `[pty-daemon:${organizationId}] auto-respawn failed:`, + err, + ); + }); + }); + + const startedAt = Date.now(); + const manifest: PtyDaemonManifest = { + pid: childPid, + socketPath, + protocolVersions: [1], + startedAt, + organizationId, + }; + writePtyDaemonManifest(manifest); + + const instance: DaemonInstance = { + pid: childPid, + socketPath, + startedAt, + runningVersion: EXPECTED_DAEMON_VERSION, + expectedVersion: EXPECTED_DAEMON_VERSION, + updatePending: false, + }; + this.instances.set(organizationId, instance); + console.log( + 
`[pty-daemon:${organizationId}] spawned pid=${childPid} socket=${socketPath}`, + ); + return instance; + } +} + +/** + * Forward child stdout/stderr to a parent stream with a per-line prefix. + * Plain `chunk => parent.write(`${tag} ${chunk}`)` only prefixes the first + * line in a chunk; bursts of multi-line output lose the prefix on + * subsequent lines. + */ +function pipeWithPrefix( + source: NodeJS.ReadableStream, + target: NodeJS.WritableStream, + tag: string, +): void { + let pending = ""; + source.on("data", (chunk: Buffer) => { + const text = pending + chunk.toString("utf8"); + const lines = text.split("\n"); + pending = lines.pop() ?? ""; + for (const line of lines) { + target.write(`${tag} ${line}\n`); + } + }); + source.on("end", () => { + if (pending) target.write(`${tag} ${pending}\n`); + pending = ""; + }); +} + +async function waitForSocket( + socketPath: string, + timeoutMs: number, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (fs.existsSync(socketPath)) { + if (await isSocketConnectable(socketPath, 200)) return true; + } + await new Promise((r) => setTimeout(r, 50)); + } + return false; +} + +/** + * One-shot session list: connect, do handshake, send `list`, return the + * sessions array. Returns null on any failure. + * + * Owns its socket lifecycle on every exit path. 
+ */ +export async function listDaemonSessions( + socketPath: string, + timeoutMs: number, +): Promise { + return new Promise((resolve) => { + const sock = net.createConnection({ path: socketPath }); + const decoder = new FrameDecoder(); + let helloAcked = false; + let settled = false; + + const cleanup = (value: SessionInfo[] | null) => { + if (settled) return; + settled = true; + clearTimeout(timer); + sock.removeAllListeners(); + try { + sock.end(); + } catch { + // best-effort + } + try { + sock.destroy(); + } catch { + // best-effort + } + resolve(value); + }; + + const timer = setTimeout(() => cleanup(null), timeoutMs); + + sock.once("error", () => cleanup(null)); + sock.once("close", () => cleanup(null)); + + sock.once("connect", () => { + try { + sock.write( + encodeFrame({ + type: "hello", + protocols: [CURRENT_PROTOCOL_VERSION], + clientVersion: "supervisor-list", + }), + ); + } catch { + cleanup(null); + } + }); + + sock.on("data", (chunk: Buffer) => { + try { + decoder.push(chunk); + for (const raw of decoder.drain()) { + const msg = raw as ServerMessage; + if (!helloAcked) { + if (msg.type !== "hello-ack") { + cleanup(null); + return; + } + helloAcked = true; + sock.write(encodeFrame({ type: "list" })); + continue; + } + if (msg.type === "list-reply") { + cleanup(msg.sessions); + return; + } + if (msg.type === "error") { + cleanup(null); + return; + } + } + } catch { + cleanup(null); + } + }); + }); +} + +/** + * One-shot version probe: connect, send `hello`, read framed `hello-ack`, + * close, return `daemonVersion`. Returns null on any failure. + * + * Owns its socket lifecycle on every exit path. 
+ */ +export async function probeDaemonVersion( + socketPath: string, + timeoutMs: number, +): Promise { + return new Promise((resolve) => { + const sock = net.createConnection({ path: socketPath }); + const decoder = new FrameDecoder(); + let settled = false; + + const cleanup = (value: string | null) => { + if (settled) return; + settled = true; + clearTimeout(timer); + sock.removeAllListeners(); + try { + sock.end(); + } catch { + // best-effort + } + try { + sock.destroy(); + } catch { + // best-effort + } + resolve(value); + }; + + const timer = setTimeout(() => cleanup(null), timeoutMs); + + sock.once("error", () => cleanup(null)); + sock.once("close", () => cleanup(null)); + + sock.once("connect", () => { + try { + sock.write( + encodeFrame({ + type: "hello", + protocols: [CURRENT_PROTOCOL_VERSION], + clientVersion: "supervisor-probe", + }), + ); + } catch { + cleanup(null); + } + }); + + sock.on("data", (chunk: Buffer) => { + try { + decoder.push(chunk); + for (const raw of decoder.drain()) { + const msg = raw as ServerMessage; + if (msg.type === "hello-ack") { + cleanup(msg.daemonVersion ?? null); + return; + } + cleanup(null); + return; + } + } catch { + cleanup(null); + } + }); + }); +} + +function isSocketConnectable( + socketPath: string, + timeoutMs: number, +): Promise { + return new Promise((resolve) => { + const sock = net.createConnection({ path: socketPath }); + const timer = setTimeout(() => { + sock.destroy(); + resolve(false); + }, timeoutMs); + sock.once("connect", () => { + clearTimeout(timer); + sock.end(); + resolve(true); + }); + sock.once("error", () => { + clearTimeout(timer); + resolve(false); + }); + }); +} diff --git a/packages/host-service/src/daemon/expected-version.ts b/packages/host-service/src/daemon/expected-version.ts new file mode 100644 index 00000000000..993c055ce5d --- /dev/null +++ b/packages/host-service/src/daemon/expected-version.ts @@ -0,0 +1,16 @@ +// Bundled daemon version. 
**Hand-edited to match +// `packages/pty-daemon/package.json#version`** — keep them in lockstep. +// +// This drives the "update pending — restart terminals to apply" UX: +// when host-service adopts a daemon whose version (read via hello-ack) +// is older than this constant, the renderer surfaces a flag. +// +// We pass this to spawned daemons via `SUPERSET_PTY_DAEMON_VERSION` and +// probe it back on adoption. We do **not** auto-kill on mismatch (sessions +// live in the daemon); the user explicitly triggers restart. +// +// TODO: replace with a build-step that reads +// `node_modules/@superset/pty-daemon/package.json` and writes a generated +// constant, so the lockstep can't drift silently. For now: hand-edit and +// rely on PR review. +export const EXPECTED_DAEMON_VERSION = "0.1.0"; diff --git a/packages/host-service/src/daemon/index.ts b/packages/host-service/src/daemon/index.ts new file mode 100644 index 00000000000..81b6504ee06 --- /dev/null +++ b/packages/host-service/src/daemon/index.ts @@ -0,0 +1,14 @@ +export { + DaemonSupervisor, + type DaemonSupervisorOptions, + listDaemonSessions, + probeDaemonVersion, +} from "./DaemonSupervisor.ts"; +export { EXPECTED_DAEMON_VERSION } from "./expected-version.ts"; +export { + __resetSupervisorForTesting, + getSupervisor, + resolveSupervisorScriptPath, + startDaemonBootstrap, + waitForDaemonReady, +} from "./singleton.ts"; diff --git a/packages/host-service/src/daemon/log-fd.ts b/packages/host-service/src/daemon/log-fd.ts new file mode 100644 index 00000000000..7fa6e5dbadf --- /dev/null +++ b/packages/host-service/src/daemon/log-fd.ts @@ -0,0 +1,33 @@ +// Append-mode log fd for the daemon's stdio with size-based rotation. +// Mirrors the desktop's host-service log handling — when the bundle moves +// host-service into a headless deploy, daemon logs are still recoverable +// without an external log shipper. 

import * as fs from "node:fs";
import * as path from "node:path";

export const MAX_DAEMON_LOG_BYTES = 5 * 1024 * 1024;

/**
 * Return an append-mode fd for `logPath`, emptying the file beforehand when
 * it has grown past `maxBytes`. The parent directory is created if needed.
 *
 * @returns The open file descriptor, or -1 when the directory or file
 *   cannot be created/opened (callers then fall back to ignoring stdio).
 */
export function openRotatingLogFd(logPath: string, maxBytes: number): number {
  try {
    fs.mkdirSync(path.dirname(logPath), { recursive: true, mode: 0o700 });
    try {
      // statSync throws when the file does not exist yet; that case (and
      // any other failure here) simply skips the truncation.
      const oversized = fs.statSync(logPath).size > maxBytes;
      if (oversized) {
        fs.writeFileSync(logPath, "", { mode: 0o600 });
      }
    } catch {
      // best-effort
    }
    return fs.openSync(logPath, "a", 0o600);
  } catch {
    return -1;
  }
}
diff --git a/packages/host-service/src/daemon/manifest.ts b/packages/host-service/src/daemon/manifest.ts
new file mode 100644
index 00000000000..6f7884f6ea0
--- /dev/null
+++ b/packages/host-service/src/daemon/manifest.ts
@@ -0,0 +1,104 @@
// Manifest for a running pty-daemon instance. Lives under
// $SUPERSET_HOME_DIR/host/{organizationId}/. Different lifecycle from
// host-service's own manifest — the daemon outlives host-service restarts.

import {
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  unlinkSync,
  writeFileSync,
} from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";

/** On-disk description of a running pty-daemon for one organization. */
export interface PtyDaemonManifest {
  pid: number;
  socketPath: string;
  protocolVersions: number[];
  startedAt: number;
  organizationId: string;
}

/** Root state directory: `$SUPERSET_HOME_DIR`, else `~/.superset`. */
function supersetHomeDir(): string {
  return process.env.SUPERSET_HOME_DIR || join(homedir(), ".superset");
}

/** Directory that holds the manifest for `organizationId`. */
export function ptyDaemonManifestDir(organizationId: string): string {
  return join(supersetHomeDir(), "host", organizationId);
}

function ptyDaemonManifestPath(organizationId: string): string {
  return join(ptyDaemonManifestDir(organizationId), "pty-daemon-manifest.json");
}

/**
 * Persist the manifest as JSON (file mode 0600), creating the org directory
 * (mode 0700) when missing.
 */
export function writePtyDaemonManifest(manifest: PtyDaemonManifest): void {
  const dir = ptyDaemonManifestDir(manifest.organizationId);
  if (!existsSync(dir)) {
    mkdirSync(dir, { recursive: true, mode: 0o700 });
  }
  writeFileSync(
    ptyDaemonManifestPath(manifest.organizationId),
    JSON.stringify(manifest),
    { encoding: "utf-8", mode: 0o600 },
  );
}

/**
 * Read and validate the manifest for an organization.
 *
 * @returns The manifest, or `null` when the file is missing, unreadable,
 *   not valid JSON, or fails validation.
 */
export function readPtyDaemonManifest(
  organizationId: string,
): PtyDaemonManifest | null {
  const filePath = ptyDaemonManifestPath(organizationId);
  if (!existsSync(filePath)) return null;

  try {
    const raw = readFileSync(filePath, "utf-8");
    const data = JSON.parse(raw);
    if (
      data === null ||
      typeof data !== "object" ||
      // The pid is later handed to process.kill, where 0 and negative
      // values address process groups rather than one process — only a
      // positive integer is a usable daemon pid.
      !Number.isInteger(data.pid) ||
      data.pid <= 0 ||
      typeof data.socketPath !== "string" ||
      !Array.isArray(data.protocolVersions) ||
      typeof data.startedAt !== "number" ||
      typeof data.organizationId !== "string"
    ) {
      return null;
    }
    return data as PtyDaemonManifest;
  } catch {
    return null;
  }
}

/**
 * Collect every readable manifest under `<home>/host/<org>/`. Directories
 * without a valid manifest are skipped; listing errors yield what was
 * gathered so far.
 */
export function listPtyDaemonManifests(): PtyDaemonManifest[] {
  const hostDir = join(supersetHomeDir(), "host");
  if (!existsSync(hostDir)) return [];
  const manifests: PtyDaemonManifest[] = [];
  try {
    for (const entry of readdirSync(hostDir, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;
      const manifest = readPtyDaemonManifest(entry.name);
      if (manifest) manifests.push(manifest);
    }
  } catch {
    // best-effort
  }
  return manifests;
}

/** Delete the org's manifest file if present; failures are ignored. */
export function removePtyDaemonManifest(organizationId: string): void {
  const filePath = ptyDaemonManifestPath(organizationId);
  try {
    if (existsSync(filePath)) unlinkSync(filePath);
  } catch {
    // best-effort
  }
}

/**
 * Probe whether a process with this pid exists, using signal 0.
 *
 * @returns `true` when the process exists (including when we lack
 *   permission to signal it, EPERM); `false` otherwise, and for any pid
 *   that is not a positive integer.
 */
export function isProcessAlive(pid: number): boolean {
  // pid 0 / negative pids target process groups in process.kill and would
  // report "alive" for something that is not the daemon.
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return (err as NodeJS.ErrnoException).code === "EPERM";
  }
}
diff --git a/packages/host-service/src/daemon/singleton.test.ts b/packages/host-service/src/daemon/singleton.test.ts
new file mode 100644
index 00000000000..33fa33d4344
--- /dev/null
+++ b/packages/host-service/src/daemon/singleton.test.ts
@@ -0,0 +1,114 @@
// Tests for the daemon supervisor singleton + bootstrap helpers.
// We don't spawn a real daemon here — the singleton is just plumbing
// (DI for the supervisor, fire-and-track promise stash). Real-spawn
// coverage lives in DaemonSupervisor.node-test.ts.

import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
import { DaemonSupervisor } from "./DaemonSupervisor.ts";
import {
  __resetSupervisorForTesting,
  getSupervisor,
  startDaemonBootstrap,
  waitForDaemonReady,
} from "./singleton.ts";

beforeEach(() => {
  __resetSupervisorForTesting();
});

afterEach(() => {
  __resetSupervisorForTesting();
});

describe("getSupervisor", () => {
  test("returns the same instance across calls", () => {
    const a = getSupervisor("/nonexistent");
    const b = getSupervisor("/different");
    // Singleton — second arg is ignored after first construction.
+ expect(b).toBe(a); + }); + + test("constructs with the provided scriptPath on first call", () => { + const sup = getSupervisor("/some/path/pty-daemon.js"); + // We can't read scriptPath via public API, but we can confirm the + // supervisor was constructed (not null) and uses the path when it + // tries to spawn — `existsSync` check throws "script not found". + expect(sup).toBeInstanceOf(DaemonSupervisor); + }); +}); + +describe("fire-and-track bootstrap", () => { + test("startDaemonBootstrap kicks off ensure without awaiting", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + // Long-running ensure that we control via a manual settle. + await new Promise((r) => setTimeout(r, 50)); + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + const t0 = Date.now(); + startDaemonBootstrap("org-fnt"); + const elapsed = Date.now() - t0; + // Should return immediately, not after the ensure delay. + expect(elapsed).toBeLessThan(20); + expect(ensureMock).toHaveBeenCalledTimes(1); + expect(ensureMock).toHaveBeenCalledWith("org-fnt"); + + // Now await readiness — should complete after ensure resolves. + await waitForDaemonReady("org-fnt"); + // Sanity: ensure was invoked exactly once across both calls. 
+ expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("startDaemonBootstrap is idempotent", () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + await new Promise((r) => setTimeout(r, 100)); + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + startDaemonBootstrap("org-idempotent"); + startDaemonBootstrap("org-idempotent"); + startDaemonBootstrap("org-idempotent"); + expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("waitForDaemonReady kicks off bootstrap if none in flight", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + await waitForDaemonReady("org-lazy"); + expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("a failed bootstrap is retryable", async () => { + const sup = getSupervisor("/nonexistent"); + let failNext = true; + const ensureMock = mock(async () => { + if (failNext) { + failNext = false; + throw new Error("simulated spawn failure"); + } + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + // First wait surfaces the failure. + await expect(waitForDaemonReady("org-retry")).rejects.toThrow( + "simulated spawn failure", + ); + // Second wait kicks off a new bootstrap (the failed promise was + // cleared) and succeeds. + await waitForDaemonReady("org-retry"); + expect(ensureMock).toHaveBeenCalledTimes(2); + }); +}); diff --git a/packages/host-service/src/daemon/singleton.ts b/packages/host-service/src/daemon/singleton.ts new file mode 100644 index 00000000000..08e35949ee6 --- /dev/null +++ b/packages/host-service/src/daemon/singleton.ts @@ -0,0 +1,104 @@ +// Singleton DaemonSupervisor for the host-service process. 
// One supervisor
// per host-service instance; it manages exactly one daemon (per the org
// host-service was started with). Lazy bootstrap so tests can construct
// host-service without spawning a real daemon — the bootstrap is kicked
// off explicitly from `serve.ts`.

import { existsSync } from "node:fs";
import * as path from "node:path";
import { fileURLToPath } from "node:url";
import { DaemonSupervisor } from "./DaemonSupervisor.ts";

let supervisor: DaemonSupervisor | null = null;
// In-flight (or completed) bootstrap; reset to null when it fails so a
// later call can retry.
let bootstrapPromise: Promise<unknown> | null = null;

/**
 * Resolve the daemon entry script path. In production, host-service.js and
 * pty-daemon.js are bundled side-by-side in the same dist directory. In
 * dev (running from source under bun), we fall back to the workspace
 * package's `dist/pty-daemon.js`. Either is fine — both are real Node
 * scripts.
 *
 * `SUPERSET_PTY_DAEMON_SCRIPT_PATH`, when set, overrides both. The fallback
 * path is returned without checking that it exists.
 */
export function resolveSupervisorScriptPath(): string {
  const override = process.env.SUPERSET_PTY_DAEMON_SCRIPT_PATH;
  if (override) return override;

  const here = path.dirname(fileURLToPath(import.meta.url));
  // Production / dev (electron-vite bundle): host-service.js and
  // pty-daemon.js are emitted side-by-side in the same dist directory,
  // so `here` and the daemon entry share a parent.
  const sideBySide = path.resolve(here, "pty-daemon.js");
  if (existsSync(sideBySide)) return sideBySide;

  // Source-running fallback (`bun run` from packages/host-service):
  // `here` is `packages/host-service/src/daemon/`; the daemon's bundled
  // entry sits at `packages/pty-daemon/dist/pty-daemon.js` after
  // `bun run build:daemon` in that package.
  const workspaceDist = path.resolve(
    here,
    "..",
    "..",
    "..",
    "pty-daemon",
    "dist",
    "pty-daemon.js",
  );
  return workspaceDist;
}

/**
 * Return the process-wide supervisor, constructing it on first use.
 *
 * @param scriptPath Daemon entry script; only honoured by the call that
 *   constructs the supervisor, ignored afterwards.
 */
export function getSupervisor(scriptPath?: string): DaemonSupervisor {
  if (!supervisor) {
    supervisor = new DaemonSupervisor({
      scriptPath: scriptPath ?? resolveSupervisorScriptPath(),
    });
  }
  return supervisor;
}

/**
 * Kick off `ensure(orgId)` without awaiting (per the host-service
 * migration plan, decision D3 — fire-and-track). Stash the promise so
 * callers that need the daemon up can await it via `waitForDaemonReady`.
 *
 * Idempotent while a bootstrap is tracked. A failed bootstrap is logged
 * and cleared so the next call retries.
 */
export function startDaemonBootstrap(organizationId: string): void {
  if (bootstrapPromise) return;
  const sup = getSupervisor();
  console.log(`[supervisor] kicking off bootstrap for org=${organizationId}`);
  const tracked: Promise<unknown> = sup
    .ensure(organizationId)
    .then((inst) => {
      console.log(
        `[supervisor] bootstrap OK for org=${organizationId} pid=${inst.pid} version=${inst.runningVersion}${inst.updatePending ? " (update pending)" : ""}`,
      );
      return inst;
    })
    .catch((err) => {
      console.error(
        `[supervisor] bootstrap failed for org=${organizationId}:`,
        err,
      );
      // Reset so a future request can retry — but only if the slot still
      // holds this attempt, so a newer bootstrap is never clobbered.
      if (bootstrapPromise === tracked) bootstrapPromise = null;
      throw err;
    });
  // Fire-and-track: nobody may be awaiting this promise when it rejects.
  // Mark the rejection as handled (it is already logged above) so it does
  // not surface as an unhandled rejection; callers awaiting `tracked`
  // through waitForDaemonReady still observe the failure.
  tracked.catch(() => {});
  bootstrapPromise = tracked;
}

/**
 * Awaits the in-flight bootstrap. If bootstrap hasn't started, kicks one
 * off first. Terminal request handlers call this before using the
 * supervisor's socket path.
 *
 * Rejects with the bootstrap's error when it fails.
 */
export async function waitForDaemonReady(
  organizationId: string,
): Promise<void> {
  if (!bootstrapPromise) startDaemonBootstrap(organizationId);
  if (bootstrapPromise) {
    await bootstrapPromise;
  }
}

/** Test-only — reset the singleton between tests.
*/ +export function __resetSupervisorForTesting(): void { + supervisor = null; + bootstrapPromise = null; +} diff --git a/packages/host-service/src/db/db.ts b/packages/host-service/src/db/db.ts index ab74757267e..f5d07674b4d 100644 --- a/packages/host-service/src/db/db.ts +++ b/packages/host-service/src/db/db.ts @@ -3,7 +3,7 @@ import { dirname } from "node:path"; import Database from "better-sqlite3"; import { drizzle } from "drizzle-orm/better-sqlite3"; import { migrate } from "drizzle-orm/better-sqlite3/migrator"; -import * as schema from "./schema"; +import * as schema from "./schema.ts"; export type HostDb = ReturnType; diff --git a/packages/host-service/src/db/index.ts b/packages/host-service/src/db/index.ts index e6cb0767895..9a32c356c6c 100644 --- a/packages/host-service/src/db/index.ts +++ b/packages/host-service/src/db/index.ts @@ -1,2 +1,2 @@ -export { createDb, type HostDb } from "./db"; -export * from "./schema"; +export { createDb, type HostDb } from "./db.ts"; +export * from "./schema.ts"; diff --git a/packages/host-service/src/events/event-bus.ts b/packages/host-service/src/events/event-bus.ts index b550f568e13..5d96c3cdd4f 100644 --- a/packages/host-service/src/events/event-bus.ts +++ b/packages/host-service/src/events/event-bus.ts @@ -2,12 +2,12 @@ import type { NodeWebSocket } from "@hono/node-ws"; import type { DetectedPort } from "@superset/port-scanner"; import type { FsWatchEvent } from "@superset/workspace-fs/host"; import type { Hono } from "hono"; -import type { HostDb } from "../db"; -import { portManager } from "../ports/port-manager"; -import { getLabelsForWorkspace } from "../ports/static-ports"; -import type { WorkspaceFilesystemManager } from "../runtime/filesystem"; -import { GitWatcher } from "./git-watcher"; -import type { ClientMessage, ServerMessage } from "./types"; +import type { HostDb } from "../db/index.ts"; +import { portManager } from "../ports/port-manager.ts"; +import { getLabelsForWorkspace } from 
"../ports/static-ports.ts"; +import type { WorkspaceFilesystemManager } from "../runtime/filesystem/index.ts"; +import { GitWatcher } from "./git-watcher.ts"; +import type { ClientMessage, ServerMessage } from "./types.ts"; type WsSocket = { send: (data: string) => void; diff --git a/packages/host-service/src/events/git-watcher.ts b/packages/host-service/src/events/git-watcher.ts index 802863ae640..a3e36f05284 100644 --- a/packages/host-service/src/events/git-watcher.ts +++ b/packages/host-service/src/events/git-watcher.ts @@ -2,9 +2,9 @@ import { execFile } from "node:child_process"; import { type FSWatcher, watch } from "node:fs"; import { promisify } from "node:util"; import type { FsWatchEvent } from "@superset/workspace-fs/host"; -import type { HostDb } from "../db"; -import { workspaces } from "../db/schema"; -import type { WorkspaceFilesystemManager } from "../runtime/filesystem"; +import type { HostDb } from "../db/index.ts"; +import { workspaces } from "../db/schema.ts"; +import type { WorkspaceFilesystemManager } from "../runtime/filesystem/index.ts"; const execFileAsync = promisify(execFile); diff --git a/packages/host-service/src/events/index.ts b/packages/host-service/src/events/index.ts index a7e05c678a1..fbe6b206c97 100644 --- a/packages/host-service/src/events/index.ts +++ b/packages/host-service/src/events/index.ts @@ -1,8 +1,8 @@ -export { EventBus, registerEventBusRoute } from "./event-bus"; +export { EventBus, registerEventBusRoute } from "./event-bus.ts"; export { type AgentLifecycleEventType, mapEventType, -} from "./map-event-type"; +} from "./map-event-type.ts"; export type { AgentLifecycleMessage, ClientMessage, @@ -14,4 +14,4 @@ export type { PortChangedMessage, ServerMessage, TerminalLifecycleMessage, -} from "./types"; +} from "./types.ts"; diff --git a/packages/host-service/src/events/types.ts b/packages/host-service/src/events/types.ts index 1ac18bf82d4..38ea6b11595 100644 --- a/packages/host-service/src/events/types.ts +++ 
b/packages/host-service/src/events/types.ts @@ -1,6 +1,6 @@ import type { DetectedPort } from "@superset/port-scanner"; import type { FsWatchEvent } from "@superset/workspace-fs/host"; -import type { AgentLifecycleEventType } from "./map-event-type"; +import type { AgentLifecycleEventType } from "./map-event-type.ts"; // ── Server → Client ──────────────────────────────────────────────── diff --git a/packages/host-service/src/no-electron-coupling.test.ts b/packages/host-service/src/no-electron-coupling.test.ts new file mode 100644 index 00000000000..ad71437bb79 --- /dev/null +++ b/packages/host-service/src/no-electron-coupling.test.ts @@ -0,0 +1,80 @@ +// Asserts host-service source has no Electron coupling. The migration's +// thesis is that host-service is independently deployable; this test +// keeps that promise honest by failing if someone accidentally imports +// electron, uses an Electron global, or shells out to an Electron API. +// +// Why a grep test rather than a real `node dist/host-service.js` smoke +// test: native addons (better-sqlite3, node-pty, @parcel/watcher) are +// marked external in the bundle and currently expect Electron's +// resolution path. Solving the native-addon distribution for headless +// deploy is its own slice. In the meantime this test catches the +// regression class the smoke test was designed to catch: "did we +// re-couple to Electron at the source level?" 
/**
 * Recursively yields the path of every `.ts` / `.tsx` file under `dir`,
 * depth-first, in the order `fs.readdirSync` returns entries.
 *
 * Skipped along the way:
 *   - directories named `node_modules` or `dist` (never descended into);
 *   - entries that are neither a directory nor a regular file;
 *   - files without a `.ts` / `.tsx` extension;
 *   - this module itself, so the pattern literals it declares are not
 *     reported as offenders.
 *
 * @param dir Directory to scan.
 * @returns A generator of file paths, each built as `path.join(dir, …)`.
 */
function* walk(dir: string): Generator<string> {
	// `import.meta.filename` is the decoded, platform-native path of this
	// module (the sibling of the `import.meta.dirname` used for SRC_DIR).
	// Stripping "file://" from `import.meta.url` is not equivalent: the URL
	// keeps percent-encoding (a space becomes "%20") and is not a native
	// path on Windows, so the self-skip silently never matched there.
	const selfPath = import.meta.filename;
	for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
		const full = path.join(dir, entry.name);
		if (entry.isDirectory()) {
			if (entry.name === "node_modules" || entry.name === "dist") continue;
			yield* walk(full);
			continue;
		}
		if (!entry.isFile()) continue;
		if (!entry.name.endsWith(".ts") && !entry.name.endsWith(".tsx")) continue;
		// Skip self. Resolve first so a relative `dir` still matches.
		if (path.resolve(full) === selfPath) continue;
		yield full;
	}
}
+ if (line.includes("ELECTRON_PATTERNS")) return; + for (const pat of ELECTRON_PATTERNS) { + if (pat.test(line)) { + offenders.push({ + file: path.relative(SRC_DIR, file), + line: idx + 1, + match: line.trim(), + }); + } + } + }); + } + if (offenders.length > 0) { + throw new Error( + `Found Electron coupling in host-service source:\n${offenders + .map((o) => ` ${o.file}:${o.line} ${o.match}`) + .join("\n")}`, + ); + } + expect(offenders.length).toBe(0); + }); +}); diff --git a/packages/host-service/src/ports/port-manager.ts b/packages/host-service/src/ports/port-manager.ts index b4faa3da452..912ba007d5f 100644 --- a/packages/host-service/src/ports/port-manager.ts +++ b/packages/host-service/src/ports/port-manager.ts @@ -1,5 +1,5 @@ import { PortManager } from "@superset/port-scanner"; -import { treeKillWithEscalation } from "./tree-kill"; +import { treeKillWithEscalation } from "./tree-kill.ts"; export const portManager = new PortManager({ killFn: treeKillWithEscalation, diff --git a/packages/host-service/src/runtime/filesystem/filesystem.ts b/packages/host-service/src/runtime/filesystem/filesystem.ts index 81ff604aa47..10ef8ec6078 100644 --- a/packages/host-service/src/runtime/filesystem/filesystem.ts +++ b/packages/host-service/src/runtime/filesystem/filesystem.ts @@ -5,8 +5,8 @@ import { getSearchIndex, } from "@superset/workspace-fs/host"; import { eq } from "drizzle-orm"; -import type { HostDb } from "../../db"; -import { projects, workspaces } from "../../db/schema"; +import type { HostDb } from "../../db/index.ts"; +import { projects, workspaces } from "../../db/schema.ts"; export interface WorkspaceFilesystemManagerOptions { db: HostDb; diff --git a/packages/host-service/src/runtime/filesystem/index.ts b/packages/host-service/src/runtime/filesystem/index.ts index 4af25866e5e..d074530d95a 100644 --- a/packages/host-service/src/runtime/filesystem/index.ts +++ b/packages/host-service/src/runtime/filesystem/index.ts @@ -1,2 +1,2 @@ -export type { 
WorkspaceFilesystemManagerOptions } from "./filesystem"; -export { WorkspaceFilesystemManager } from "./filesystem"; +export type { WorkspaceFilesystemManagerOptions } from "./filesystem.ts"; +export { WorkspaceFilesystemManager } from "./filesystem.ts"; diff --git a/packages/host-service/src/runtime/teardown/teardown.ts b/packages/host-service/src/runtime/teardown/teardown.ts index 2c428706f89..44c483aa94d 100644 --- a/packages/host-service/src/runtime/teardown/teardown.ts +++ b/packages/host-service/src/runtime/teardown/teardown.ts @@ -56,7 +56,7 @@ export async function runTeardown({ // Single-quoted so no shell interpolation is possible on the path. const initialCommand = `bash ${singleQuote(scriptPath)} ; exit $?`; - const session = createTerminalSessionInternal({ + const session = await createTerminalSessionInternal({ terminalId, workspaceId, db, diff --git a/packages/host-service/src/serve.ts b/packages/host-service/src/serve.ts index 1bcf1c761ba..eebd2cadcad 100644 --- a/packages/host-service/src/serve.ts +++ b/packages/host-service/src/serve.ts @@ -1,5 +1,6 @@ import { serve } from "@hono/node-server"; import { createApp } from "./app"; +import { getSupervisor, startDaemonBootstrap } from "./daemon"; import { env } from "./env"; import { JwtApiAuthProvider } from "./providers/auth"; import { LocalGitCredentialProvider } from "./providers/git"; @@ -10,9 +11,21 @@ import { initTerminalBaseEnv, resolveTerminalBaseEnv } from "./terminal/env"; import { connectRelay } from "./tunnel"; async function main(): Promise { + console.log( + `[host-service] starting (org=${env.ORGANIZATION_ID}, port=${env.PORT}, NODE_ENV=${process.env.NODE_ENV ?? "unset"})`, + ); + const terminalBaseEnv = await resolveTerminalBaseEnv(); initTerminalBaseEnv(terminalBaseEnv); + // Fire-and-track: kick off pty-daemon spawn-or-adopt without blocking + // host-service startup. 
Terminal request handlers `await + // waitForDaemonReady(orgId)` before using the supervisor's socket path, + // so an in-flight bootstrap doesn't race with the first terminal launch. + // Non-terminal requests (workspaces, git, chat) are unaffected if the + // daemon takes time to come up or fails entirely. + startDaemonBootstrap(env.ORGANIZATION_ID); + const authProvider = new JwtApiAuthProvider( env.AUTH_TOKEN, env.SUPERSET_API_URL, @@ -34,6 +47,34 @@ async function main(): Promise { }, }); + // Dev-mode shutdown: kill the daemon on host-service exit so dev + // iteration on daemon code resets cleanly. Production keeps the + // daemon detached so PTYs survive host-service restarts. + // Per the migration plan's D5 decision. + const isDev = process.env.NODE_ENV !== "production"; + if (isDev) { + let shuttingDown = false; + const devShutdown = async (signal: NodeJS.Signals) => { + if (shuttingDown) return; + shuttingDown = true; + console.log( + `[host-service] dev-mode ${signal} — stopping pty-daemon for clean iteration`, + ); + try { + await getSupervisor().stop(env.ORGANIZATION_ID); + } catch (err) { + console.error( + "[host-service] dev shutdown: supervisor.stop failed:", + err, + ); + } finally { + process.exit(0); + } + }; + process.on("SIGINT", () => void devShutdown("SIGINT")); + process.on("SIGTERM", () => void devShutdown("SIGTERM")); + } + const server = serve({ fetch: app.fetch, port: env.PORT }, (info) => { // Install only after the server is listening so startup throws still // reach `main().catch(...)` and exit with a non-zero code. diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts new file mode 100644 index 00000000000..7fe6d8767be --- /dev/null +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts @@ -0,0 +1,274 @@ +// End-to-end test for DaemonClient against a real pty-daemon Server. 
+// Runs under Node (`node --experimental-strip-types --test`) because the +// daemon spawns real PTYs via node-pty. + +import { strict as assert } from "node:assert"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, test } from "node:test"; +import { Server } from "@superset/pty-daemon"; +import { DaemonClient } from "./DaemonClient.ts"; + +const sockPath = path.join( + os.tmpdir(), + `host-daemon-client-${process.pid}.sock`, +); +let server: Server; + +before(async () => { + server = new Server({ + socketPath: sockPath, + daemonVersion: "0.0.0-host-test", + }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +test("connect + handshake exposes daemon version", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + assert.equal(c.version, "0.0.0-host-test"); + assert.equal(c.protocol, 1); + assert.ok(c.isConnected); + await c.dispose(); +}); + +test("open + subscribe + receive output + close", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + + const id = "host-test-0"; + const result = await c.open(id, { + shell: "/bin/sh", + argv: ["-c", "echo from-daemon-client; sleep 0.2"], + cols: 80, + rows: 24, + }); + assert.ok(result.pid > 0); + + const chunks: Buffer[] = []; + const exitInfo: { code: number | null; signal: number | null }[] = []; + const unsubscribe = c.subscribe( + id, + { replay: true }, + { + onOutput: (b) => chunks.push(b), + onExit: (info) => exitInfo.push(info), + }, + ); + + await new Promise((r) => setTimeout(r, 600)); + const combined = Buffer.concat(chunks).toString("utf8"); + assert.ok( + combined.includes("from-daemon-client"), + `output missing marker: ${combined}`, + ); + assert.equal(exitInfo.length, 1); + assert.equal(exitInfo[0]?.code, 0); + + unsubscribe(); + await c.dispose(); +}); + +test("input is forwarded; resize updates dims", async () => { + const c = new DaemonClient({ 
socketPath: sockPath }); + await c.connect(); + + const id = "host-test-1"; + await c.open(id, { + shell: "/bin/sh", + argv: ["-i"], + cols: 80, + rows: 24, + }); + + const chunks: Buffer[] = []; + const unsubscribe = c.subscribe( + id, + { replay: false }, + { + onOutput: (b) => chunks.push(b), + onExit: () => {}, + }, + ); + + c.input(id, Buffer.from("echo input-marker\n")); + + await waitFor( + () => Buffer.concat(chunks).toString().includes("input-marker"), + 3000, + ); + + c.resize(id, 100, 30); + const list = await c.list(); + const me = list.find((s) => s.id === id); + assert.equal(me?.cols, 100); + assert.equal(me?.rows, 30); + + unsubscribe(); + await c.close(id, "SIGTERM"); + await c.dispose(); +}); + +test("multiple local subscribers get fanned out from one wire subscription", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + + const id = "host-test-fanout"; + await c.open(id, { + shell: "/bin/sh", + argv: ["-c", "echo fanout; sleep 0.3"], + cols: 80, + rows: 24, + }); + + const a: Buffer[] = []; + const b: Buffer[] = []; + const unsubA = c.subscribe( + id, + { replay: true }, + { + onOutput: (buf) => a.push(buf), + onExit: () => {}, + }, + ); + // Second subscriber must use replay:false — the daemon's buffer was + // already delivered to the first subscribe; requesting replay again + // is now an explicit error (see DaemonClient.subscribe). The + // fan-out applies to live output only. + const unsubB = c.subscribe( + id, + { replay: false }, + { + onOutput: (buf) => b.push(buf), + onExit: () => {}, + }, + ); + + await new Promise((r) => setTimeout(r, 500)); + assert.ok(Buffer.concat(a).toString().includes("fanout")); + assert.ok(Buffer.concat(b).toString().includes("fanout")); + + unsubA(); + unsubB(); + await c.dispose(); +}); + +test("disconnect callback fires when daemon goes away", async () => { + // Spin up a throw-away server we can shut down independently. 
+ const localPath = path.join( + os.tmpdir(), + `host-daemon-client-disc-${process.pid}.sock`, + ); + const local = new Server({ + socketPath: localPath, + daemonVersion: "0.0.0-disc", + }); + await local.listen(); + + const c = new DaemonClient({ socketPath: localPath }); + await c.connect(); + + const disc = new Promise((resolve) => { + c.onDisconnect(() => resolve()); + }); + + await local.close(); + await disc; + assert.equal(c.isConnected, false); + await c.dispose(); +}); + +test("adoption flow: client A opens, drops, client B finds + subscribes-with-replay", async () => { + // This is the exact host-service-restart sequence we hit in production: + // host-service v1 opens a daemon session, then dies. host-service v2 + // starts fresh, calls daemon.open() blindly → "session already exists" + // → must fall back to list() + subscribe(replay:true). Regression test + // for the "session already exists" tight loop. + const a = new DaemonClient({ socketPath: sockPath }); + await a.connect(); + const id = "host-restart-adopt"; + const openA = await a.open(id, { + shell: "/bin/sh", + argv: ["-i"], + cols: 80, + rows: 24, + }); + const aChunks: Buffer[] = []; + const unsubA = a.subscribe( + id, + { replay: false }, + { onOutput: (c) => aChunks.push(c), onExit: () => {} }, + ); + a.input(id, Buffer.from("echo before-host-restart\n")); + await waitFor( + () => Buffer.concat(aChunks).toString().includes("before-host-restart"), + 3000, + ); + unsubA(); + await a.dispose(); + + // Brief settle so the daemon registers A's disconnect. + await new Promise((r) => setTimeout(r, 100)); + + // "host-service v2" connects fresh. + const b = new DaemonClient({ socketPath: sockPath }); + await b.connect(); + + // Naive open should error with "session already exists" — that's the + // signal host-service uses to switch to adoption mode. 
/**
 * Polls `predicate` until it returns true.
 *
 * The predicate is evaluated immediately; while it is false the helper
 * sleeps 25ms between checks. If more than `ms` milliseconds have elapsed
 * since the call started when a check fails, it throws.
 *
 * @param predicate Condition to wait for.
 * @param ms Maximum time to keep polling, in milliseconds.
 * @throws Error("waitFor timed out") when the deadline passes first.
 */
async function waitFor(predicate: () => boolean, ms: number): Promise<void> {
	const startedAt = Date.now();
	for (;;) {
		if (predicate()) return;
		const elapsed = Date.now() - startedAt;
		if (elapsed > ms) {
			throw new Error("waitFor timed out");
		}
		await new Promise<void>((wake) => {
			setTimeout(wake, 25);
		});
	}
}
PTYs are owned by the +// daemon; this client is purely a thin transport over the socket: send typed +// requests, receive typed events, route output/exit to per-session callbacks. +// +// Lifecycle: +// - connect() opens the socket and completes the handshake. +// - subscribe(sessionId) registers callbacks; you'll receive every output +// and exit frame the daemon emits for that session id. +// - dispose() closes the socket; the daemon keeps owning sessions. +// +// Failure model: connection-level errors (daemon crash, socket close) are +// surfaced via onDisconnect. The desktop coordinator is responsible for +// respawning the daemon and host-service can reconnect by constructing a new +// DaemonClient. There is no in-band reconnect logic here — keep it dumb. + +import * as net from "node:net"; +import { + CURRENT_PROTOCOL_VERSION, + encodeFrame, + FrameDecoder, + type ServerMessage, + type SessionInfo, + type SessionMeta, +} from "@superset/pty-daemon/protocol"; + +export interface OpenResult { + id: string; + pid: number; +} + +export interface ExitInfo { + code: number | null; + signal: number | null; +} + +export type Signal = "SIGINT" | "SIGTERM" | "SIGKILL" | "SIGHUP"; + +export interface SubscribeCallbacks { + onOutput: (chunk: Buffer) => void; + onExit: (info: ExitInfo) => void; +} + +interface SessionCallbacks { + output: Set<(chunk: Buffer) => void>; + exit: Set<(info: ExitInfo) => void>; +} + +export interface DaemonClientOptions { + socketPath: string; + connectTimeoutMs?: number; +} + +/** + * Per-request timeouts. The daemon should respond within milliseconds for + * close/list, and within a few seconds for open (PTY spawn includes shell + * startup). Without these, a live-but-stuck daemon can hang callers + * indefinitely — a real risk if `node-pty.spawn` ever blocks. 
+ */ +const OPEN_TIMEOUT_MS = 15_000; +const CLOSE_TIMEOUT_MS = 5_000; +const LIST_TIMEOUT_MS = 5_000; + +export class DaemonClient { + private readonly opts: DaemonClientOptions; + private socket: net.Socket | null = null; + private decoder = new FrameDecoder(); + private readonly callbacks = new Map(); + private readonly disconnectCbs = new Set<(err?: Error) => void>(); + private daemonVersion = ""; + private negotiated: number | null = null; + private connected = false; + + constructor(opts: DaemonClientOptions) { + this.opts = opts; + } + + async connect(): Promise { + const socket = await openSocket(this.opts); + this.socket = socket; + socket.on("data", (chunk) => this.onData(chunk)); + socket.on("close", () => this.onClose()); + socket.on("error", (err) => this.onClose(err)); + try { + await this.handshake(); + } catch (err) { + // Handshake rejected — destroy the socket and clear state so the + // caller's retry sees a clean slate. Without this, the socket and + // its listeners leak across failed connect attempts. + this.socket = null; + socket.removeAllListeners(); + socket.destroy(); + throw err; + } + this.connected = true; + } + + get isConnected(): boolean { + return this.connected && this.socket !== null && !this.socket.destroyed; + } + + get version(): string { + return this.daemonVersion; + } + + get protocol(): number { + return this.negotiated ?? 
CURRENT_PROTOCOL_VERSION; + } + + onDisconnect(cb: (err?: Error) => void): () => void { + this.disconnectCbs.add(cb); + return () => { + this.disconnectCbs.delete(cb); + }; + } + + async open(id: string, meta: SessionMeta): Promise { + const reply = await this.requestSession( + id, + { type: "open", id, meta }, + OPEN_TIMEOUT_MS, + ); + if (reply.type === "open-ok") return { id, pid: reply.pid }; + if (reply.type === "error") throw new Error(`open ${id}: ${reply.message}`); + throw new Error(`open ${id}: unexpected reply ${reply.type}`); + } + + async close(id: string, signal: Signal = "SIGHUP"): Promise { + const reply = await this.requestSession( + id, + { type: "close", id, signal }, + CLOSE_TIMEOUT_MS, + ); + if (reply.type === "closed") return; + if (reply.type === "error") + throw new Error(`close ${id}: ${reply.message}`); + throw new Error(`close ${id}: unexpected reply ${reply.type}`); + } + + async list(): Promise { + const reply = await this.requestNonSession( + { type: "list" }, + "list-reply", + LIST_TIMEOUT_MS, + ); + if (reply.type === "list-reply") return reply.sessions; + throw new Error(`list: unexpected reply ${reply.type}`); + } + + /** Fire-and-forget; bytes go straight to the PTY. */ + input(id: string, data: Buffer): void { + this.send({ + type: "input", + id, + data: data.toString("base64"), + }); + } + + /** Fire-and-forget; daemon validates dims. */ + resize(id: string, cols: number, rows: number): void { + this.send({ type: "resize", id, cols, rows }); + } + + /** + * Subscribe to a session's output + exit stream. Returns an unsubscribe + * function. With `replay: true` the daemon sends its current ring buffer + * before live streaming begins. Multiple subscribers per session are + * supported — the daemon fans output out to all of them. 
/**
 * Subscribe to a session's output + exit stream.
 *
 * Only the first local subscriber for a session id sends a wire
 * `subscribe` frame; later subscribers are added to the same local entry
 * and receive whatever frames are routed to it from then on. Because the
 * wire subscription is made once, `replay: true` is only honoured on that
 * first call — asking for replay on a later call throws.
 *
 * Nothing is left registered when this method throws: the replay check
 * runs before any callback is stored, and a failed wire send rolls back
 * the entry it just created so the next subscribe starts from scratch.
 *
 * @param id Session id to subscribe to.
 * @param opts `replay` is forwarded on the wire `subscribe` frame.
 * @param cb Callbacks invoked for output chunks and for the exit event.
 * @returns An unsubscribe function. When the last local subscriber for
 *   `id` is removed, a wire `unsubscribe` frame is sent.
 * @throws Error if `replay` is requested while a subscription for `id`
 *   already exists, or if the socket is not connected.
 */
subscribe(
	id: string,
	opts: { replay: boolean },
	cb: SubscribeCallbacks,
): () => void {
	const existing = this.callbacks.get(id);
	// Validate before touching the registry. Registering first and then
	// throwing would leave the rejected caller's callbacks attached with no
	// unsubscribe handle to remove them. The caller needs to replay from a
	// server-side cache instead (see terminal.ts replayBuffer).
	if (existing && opts.replay) {
		throw new Error(
			`subscribe(${id}): replay is not available on a second subscribe; the daemon's buffer was already consumed.`,
		);
	}
	if (existing) {
		existing.output.add(cb.onOutput);
		existing.exit.add(cb.onExit);
	} else {
		const entry: SessionCallbacks = {
			output: new Set([cb.onOutput]),
			exit: new Set([cb.onExit]),
		};
		// Register before sending so no frame can arrive unrouted.
		this.callbacks.set(id, entry);
		try {
			this.send({ type: "subscribe", id, replay: opts.replay });
		} catch (err) {
			// The wire subscribe never went out. Drop the entry, otherwise
			// every later subscribe would see it, assume the wire
			// subscription exists, and never send one.
			if (this.callbacks.get(id) === entry) this.callbacks.delete(id);
			throw err;
		}
	}
	return () => {
		const e = this.callbacks.get(id);
		if (!e) return;
		e.output.delete(cb.onOutput);
		e.exit.delete(cb.onExit);
		if (e.output.size === 0 && e.exit.size === 0) {
			this.callbacks.delete(id);
			this.send({ type: "unsubscribe", id });
		}
	};
}
"hello-ack") { + throw new Error(`daemon handshake unexpected reply: ${ack.type}`); + } + this.daemonVersion = ack.daemonVersion; + this.negotiated = ack.protocol; + } + + private requestSession( + id: string, + req: + | { type: "open"; id: string; meta: SessionMeta } + | { type: "close"; id: string; signal: Signal }, + timeoutMs: number, + ): Promise { + return new Promise((resolve, reject) => { + let resolved = false; + const settle = (m: ServerMessage) => { + if (resolved) return; + resolved = true; + cleanup(); + resolve(m); + }; + const fail = (err: Error) => { + if (resolved) return; + resolved = true; + cleanup(); + reject(err); + }; + const off = this.on((m) => { + if (m.type === "error" && m.id === id) settle(m); + else if (req.type === "open" && m.type === "open-ok" && m.id === id) + settle(m); + else if (req.type === "close" && m.type === "closed" && m.id === id) + settle(m); + }); + const offDisc = this.onDisconnect((err) => + fail(err ?? new Error("daemon disconnected")), + ); + const timer = setTimeout( + () => + fail( + new Error( + `daemon ${req.type} ${id}: timed out after ${timeoutMs}ms`, + ), + ), + timeoutMs, + ); + const cleanup = () => { + off(); + offDisc(); + clearTimeout(timer); + }; + this.send(req); + }); + } + + private requestNonSession( + req: { type: "list" }, + expectType: "list-reply", + timeoutMs: number, + ): Promise { + return new Promise((resolve, reject) => { + let resolved = false; + const settle = (m: ServerMessage) => { + if (resolved) return; + resolved = true; + cleanup(); + resolve(m); + }; + const fail = (err: Error) => { + if (resolved) return; + resolved = true; + cleanup(); + reject(err); + }; + const off = this.on((m) => { + if (m.type === expectType) { + settle(m); + return; + } + // Non-session error frames (no `id`) belong to the + // most-recent non-session request — settle on those. Errors + // keyed to a session id come from concurrent ops on that + // session; ignore them here. 
/**
 * Resolves with the first frame delivered to the ad-hoc listeners for
 * which `predicate` returns true.
 *
 * Rejects when either of these happens first:
 *   - the connection drops (same handling as `requestSession` and
 *     `requestNonSession`), so a daemon that goes away mid-handshake fails
 *     the caller immediately instead of after the full timeout;
 *   - `timeoutMs` elapses without a matching frame.
 *
 * The frame listener, the disconnect listener and the timer are all
 * released as soon as the promise settles.
 *
 * @param predicate Selects the frame to wait for.
 * @param timeoutMs Maximum wait in milliseconds.
 * @returns The matching frame.
 */
private waitForFrame(
	predicate: (m: ServerMessage) => boolean,
	timeoutMs: number,
): Promise<ServerMessage> {
	return new Promise<ServerMessage>((resolve, reject) => {
		// Only ever invoked from the callbacks below, which run after all
		// three handles have been initialised.
		const cleanup = (): void => {
			off();
			offDisc();
			clearTimeout(timer);
		};
		const off = this.on((m) => {
			if (!predicate(m)) return;
			cleanup();
			resolve(m);
		});
		const offDisc = this.onDisconnect((err) => {
			cleanup();
			reject(err ?? new Error("daemon disconnected"));
		});
		const timer = setTimeout(() => {
			cleanup();
			reject(new Error(`daemon: timed out after ${timeoutMs}ms`));
		}, timeoutMs);
	});
}
/**
 * Opens a connection to the Unix socket at `opts.socketPath`.
 *
 * Resolves with the socket once it emits `connect`. Rejects with the
 * socket's error if it emits `error` first, or with a timeout error (after
 * destroying the socket) if neither happens within
 * `opts.connectTimeoutMs` milliseconds (5000 when not provided).
 */
function openSocket(opts: DaemonClientOptions): Promise<net.Socket> {
	const limitMs = opts.connectTimeoutMs ?? 5000;
	return new Promise<net.Socket>((resolve, reject) => {
		const conn = net.createConnection({ path: opts.socketPath });

		const onDeadline = (): void => {
			conn.destroy();
			reject(new Error(`DaemonClient: connect timed out after ${limitMs}ms`));
		};
		const deadline = setTimeout(onDeadline, limitMs);

		const onConnected = (): void => {
			clearTimeout(deadline);
			resolve(conn);
		};
		const onFailed = (cause: Error): void => {
			clearTimeout(deadline);
			reject(cause);
		};

		conn.once("connect", onConnected);
		conn.once("error", onFailed);
	});
}
00000000000..d169d993619 --- /dev/null +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -0,0 +1,102 @@ +// Lazy singleton DaemonClient for host-service. The DaemonSupervisor +// (host-service-internal) owns the daemon's process lifecycle; this +// singleton just connects to the supervisor's socket path on first use +// and reuses the connection for all sessions. +// +// On disconnect we surface via console.error, notify subscribers (terminal.ts +// uses this to close WS sockets so the renderer reconnects against the +// respawned daemon), and let the next caller's getDaemonClient() rebuild +// the client. There's no in-band reconnect here — see DaemonClient's "dumb" +// failure model. + +import { getSupervisor, waitForDaemonReady } from "../daemon/index.ts"; +import { env } from "../env.ts"; +import { DaemonClient } from "./DaemonClient/index.ts"; + +let cached: DaemonClient | null = null; +let connecting: Promise | null = null; + +/** + * Subscribers notified whenever the active DaemonClient disconnects. + * terminal.ts hooks this to close WS sockets and clear in-memory session + * state — without it, sockets stay open and input/resize silently fails. + */ +const disconnectListeners = new Set<(err?: Error) => void>(); + +export function onDaemonDisconnect(cb: (err?: Error) => void): () => void { + disconnectListeners.add(cb); + return () => { + disconnectListeners.delete(cb); + }; +} + +async function ptyDaemonSocketPath(): Promise { + // Test escape hatch: when SUPERSET_PTY_DAEMON_SOCKET is set explicitly + // (e.g. by the adoption integration test), skip the supervisor and + // connect directly. Production paths leave this env var unset; the + // supervisor's own spawn does not set it. 
/**
 * Returns the shared DaemonClient, connecting on first use.
 *
 * - If a cached client is still connected, it is returned as-is.
 * - If a connection attempt is already in flight, that same attempt is
 *   returned, so concurrent callers share one client.
 * - Otherwise a new attempt starts: resolve the socket path, build a
 *   client, wire its disconnect handler, connect, and cache it.
 *
 * The in-flight marker is set synchronously, before the first `await`.
 * Setting it only after the socket path had been awaited would let two
 * concurrent callers both pass the check and each connect its own client,
 * orphaning one of them with a live socket.
 *
 * On disconnect the cached client is dropped (if it is still the cached
 * one) and every `onDaemonDisconnect` listener is notified; a throwing
 * listener is logged and does not stop the others.
 *
 * @returns The connected client.
 * @throws Whatever socket-path resolution or `connect()` throws. A client
 *   that failed to connect is disposed before the error propagates.
 */
export async function getDaemonClient(): Promise<DaemonClient> {
	if (cached?.isConnected) return cached;
	if (connecting) return connecting;

	const attempt: Promise<DaemonClient> = (async () => {
		const sockPath = await ptyDaemonSocketPath();
		const client = new DaemonClient({ socketPath: sockPath });
		client.onDisconnect((err) => {
			console.error(
				"[host-service] pty-daemon disconnected:",
				err?.message ?? "",
			);
			if (cached === client) cached = null;
			for (const listener of disconnectListeners) {
				try {
					listener(err);
				} catch (cbErr) {
					console.error(
						"[host-service] daemon-disconnect listener threw:",
						cbErr,
					);
				}
			}
		});
		try {
			await client.connect();
		} catch (error) {
			// Failed connect — clean up the partially initialized client.
			await client.dispose().catch(() => {});
			throw error;
		}
		cached = client;
		return client;
	})().finally(() => {
		// Clear only our own marker: a dispose followed by a fresh attempt
		// may have replaced it while this one was still in flight.
		if (connecting === attempt) connecting = null;
	});

	connecting = attempt;
	return attempt;
}
*/ +export async function disposeDaemonClient(): Promise { + const c = cached; + const inFlight = connecting; + cached = null; + connecting = null; + if (c) await c.dispose(); + if (inFlight) { + const client = await inFlight.catch(() => null); + if (client) await client.dispose(); + } +} diff --git a/packages/host-service/src/terminal/env.ts b/packages/host-service/src/terminal/env.ts index 818615c0a35..6b58a336b1a 100644 --- a/packages/host-service/src/terminal/env.ts +++ b/packages/host-service/src/terminal/env.ts @@ -5,23 +5,26 @@ * at startup — never from desktop main or the live host-service process.env. */ -export { stripTerminalRuntimeEnv } from "./env-strip"; -export type { ShellBootstrapParams, ShellLaunchParams } from "./shell-launch"; +export { stripTerminalRuntimeEnv } from "./env-strip.ts"; +export type { + ShellBootstrapParams, + ShellLaunchParams, +} from "./shell-launch.ts"; export { getShellBootstrapEnv, getShellLaunchArgs, getSupersetShellPaths, resolveLaunchShell, -} from "./shell-launch"; +} from "./shell-launch.ts"; import fs from "node:fs"; import os from "node:os"; import { clearStrictShellEnvCache, getStrictShellEnvironment, -} from "./clean-shell-env"; -import { stripTerminalRuntimeEnv } from "./env-strip"; -import { getShellBootstrapEnv } from "./shell-launch"; +} from "./clean-shell-env.ts"; +import { stripTerminalRuntimeEnv } from "./env-strip.ts"; +import { getShellBootstrapEnv } from "./shell-launch.ts"; const MACOS_SYSTEM_CERT_FILE = "/etc/ssl/cert.pem"; let cachedMacosSystemCertAvailable: boolean | null = null; diff --git a/packages/host-service/src/terminal/terminal.adoption.node-test.ts b/packages/host-service/src/terminal/terminal.adoption.node-test.ts new file mode 100644 index 00000000000..b0fd3b9bc02 --- /dev/null +++ b/packages/host-service/src/terminal/terminal.adoption.node-test.ts @@ -0,0 +1,384 @@ +// End-to-end adoption test. 
Drives host-service's createTerminalSessionInternal +// against a real pty-daemon Server (in-process), real SQLite host DB, +// and real shells. Simulates a host-service process restart by clearing the +// in-memory sessions Map (via the test-only escape hatch) and disposing the +// DaemonClient singleton, then re-invokes createTerminalSessionInternal with +// the same terminalId and asserts the adoption path: +// - Same shell pid as the original session. +// - Subsequent input reaches the still-living shell. +// +// This is exactly what the daemon's process isolation enables: the daemon +// owns the PTY runtime; the host can test its integration end-to-end without +// any subprocess gymnastics. +// +// Runs under Node (`node --experimental-strip-types --test`). + +import { strict as assert } from "node:assert"; +import { randomUUID } from "node:crypto"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, describe, test } from "node:test"; +import { fileURLToPath } from "node:url"; +import { Server } from "@superset/pty-daemon"; +import { eq } from "drizzle-orm"; +import { createDb, type HostDb } from "../db/index.ts"; +import { projects, workspaces } from "../db/schema.ts"; +import { disposeDaemonClient } from "./daemon-client-singleton.ts"; +import { initTerminalBaseEnv } from "./env.ts"; +import { + __resetSessionsForTesting, + createTerminalSessionInternal, + disposeSession, + listTerminalSessions, +} from "./terminal.ts"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const TEST_HOME = path.join(os.tmpdir(), `host-svc-adopt-${process.pid}`); +const SOCK = path.join(os.tmpdir(), `host-svc-adopt-${process.pid}.sock`); +const MIGRATIONS = path.resolve(__dirname, "../../drizzle"); + +let server: Server; +let db: HostDb; +let projectId: string; +let workspaceId: string; +let worktreePath: string; + +before(async () => { + fs.mkdirSync(TEST_HOME, { recursive: true }); + 
worktreePath = path.join(TEST_HOME, "worktree"); + fs.mkdirSync(worktreePath, { recursive: true }); + + server = new Server({ + socketPath: SOCK, + daemonVersion: "0.0.0-adoption-e2e", + }); + await server.listen(); + + process.env.SUPERSET_PTY_DAEMON_SOCKET = SOCK; + process.env.SUPERSET_HOME_DIR = TEST_HOME; + process.env.HOST_SERVICE_VERSION = "0.0.0-adoption-e2e"; + process.env.NODE_ENV = "development"; + + initTerminalBaseEnv({ + PATH: process.env.PATH ?? "/usr/bin:/bin", + HOME: process.env.HOME ?? TEST_HOME, + SHELL: "/bin/sh", + }); + + db = createDb(path.join(TEST_HOME, "host.db"), MIGRATIONS); + + projectId = randomUUID(); + workspaceId = randomUUID(); + db.insert(projects).values({ id: projectId, repoPath: worktreePath }).run(); + db.insert(workspaces) + .values({ + id: workspaceId, + projectId, + worktreePath, + branch: "main", + }) + .run(); +}); + +after(async () => { + __resetSessionsForTesting(); + await disposeDaemonClient(); + await server.close(); + try { + fs.rmSync(TEST_HOME, { recursive: true, force: true }); + } catch { + // best-effort + } +}); + +describe("createTerminalSessionInternal — host-service restart adoption", () => { + test("fresh open spawns a shell via the daemon", async () => { + const terminalId = `e2e-fresh-${randomUUID().slice(0, 8)}`; + const result = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok( + !("error" in result), + `expected session, got error: ${JSON.stringify(result)}`, + ); + if ("error" in result) return; + + assert.equal(result.terminalId, terminalId); + assert.ok(result.pty.pid > 0, "pty pid should be populated"); + + const list = listTerminalSessions({ workspaceId }); + assert.ok( + list.find((s) => s.terminalId === terminalId), + "new session should be in listTerminalSessions", + ); + + disposeSession(terminalId, db); + }); + + test("adopts existing daemon session after host-service restart simulation", async () => { + const terminalId = 
`e2e-adopt-${randomUUID().slice(0, 8)}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + if ("error" in first) return; + const originalPid = first.pty.pid; + + first.pty.write("echo before-host-restart\n"); + await waitForOutput(first.pty, "before-host-restart", 3000); + + // Simulate host-service crash + restart. + __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in second)); + if ("error" in second) return; + + assert.equal( + second.pty.pid, + originalPid, + "adopted session should have same shell pid", + ); + assert.equal(second.terminalId, terminalId); + + let buf = ""; + const disposer = second.pty.onData((d) => { + buf += d; + }); + second.pty.write("echo after-host-restart\n"); + await waitFor(() => buf.includes("after-host-restart"), 3000); + disposer.dispose(); + + disposeSession(terminalId, db); + }); + + test("adopted session keeps listed/exited bookkeeping", async () => { + const terminalId = `e2e-bookkeeping-${randomUUID().slice(0, 8)}`; + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + + __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in second)); + if ("error" in second) return; + + assert.equal(second.exited, false); + assert.equal(second.listed, true); + assert.ok( + listTerminalSessions({ workspaceId }).find( + (s) => s.terminalId === terminalId, + ), + ); + + disposeSession(terminalId, db); + }); + + test("adopted session does NOT re-fire initialCommand", async () => { + // Regression guard: setup.sh terminals pass an initialCommand. 
After + // host-service restart, adopting the same terminalId must NOT run + // the command a second time — that would re-execute setup.sh + // every host-service restart, which would be catastrophic. + const terminalId = `e2e-initcmd-${randomUUID().slice(0, 8)}`; + const sentinelFile = path.join(TEST_HOME, `initcmd-${terminalId}.sentinel`); + // Run on first lifetime: write a file. We then assert it isn't + // rewritten (would have a new mtime) on the second lifetime. + const initialCommand = `echo $$ > ${sentinelFile}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: false, + initialCommand, + }); + assert.ok(!("error" in first)); + + // Wait for sentinel file (proves initialCommand ran). + await waitFor(() => fs.existsSync(sentinelFile), 5000); + const firstMtime = fs.statSync(sentinelFile).mtimeMs; + + // Simulate host-service restart and adopt, passing the SAME + // initialCommand (host-service has no way to know it already ran). + __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: false, + initialCommand, + }); + assert.ok(!("error" in second)); + + // Wait long enough for the command to have run if it were going to. + await new Promise((r) => setTimeout(r, 800)); + + // Sentinel mtime unchanged → initialCommand was suppressed on adopt. + const secondMtime = fs.statSync(sentinelFile).mtimeMs; + assert.equal( + secondMtime, + firstMtime, + "initialCommand re-fired on adopted session — would re-run setup.sh on every host-service restart", + ); + + disposeSession(terminalId, db); + }); + + test("adoption when the original workspace row is gone returns a clear error", async () => { + // Race: host-service is down, user deletes the workspace cloud-side, + // the workspace row is removed from the host DB. Daemon still has + // the live session. 
host-service comes back, renderer reconnects + // with the same terminalId. createTerminalSessionInternal must + // surface a clean error (not crash, not loop). + const ghostWorkspaceId = randomUUID(); + const ghostWorktree = path.join(TEST_HOME, "ghost-worktree"); + fs.mkdirSync(ghostWorktree, { recursive: true }); + db.insert(projects) + .values({ id: randomUUID(), repoPath: ghostWorktree }) + .run(); + const ghostProject = randomUUID(); + db.insert(projects) + .values({ id: ghostProject, repoPath: ghostWorktree }) + .run(); + db.insert(workspaces) + .values({ + id: ghostWorkspaceId, + projectId: ghostProject, + worktreePath: ghostWorktree, + branch: "main", + }) + .run(); + + const terminalId = `e2e-ghost-${randomUUID().slice(0, 8)}`; + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId: ghostWorkspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + + // User deletes workspace mid-restart: row gone, worktree dir removed. + __resetSessionsForTesting(); + await disposeDaemonClient(); + db.delete(workspaces).where(eq(workspaces.id, ghostWorkspaceId)).run(); + fs.rmSync(ghostWorktree, { recursive: true, force: true }); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId: ghostWorkspaceId, + db, + listed: true, + }); + assert.ok( + "error" in second, + "adoption with missing workspace must return error, not throw or loop", + ); + if ("error" in second) { + assert.match(second.error, /Workspace worktree not found/); + } + + // Daemon still has the orphan session — clean it up directly so the + // test suite leaves nothing behind. Production needs a periodic + // "orphan session sweep" but that's a separate cleanup concern. + disposeSession(terminalId, db); + }); + + test("dispose then re-create with the same id works (no zombie state)", async () => { + // Rapid lifecycle: user creates terminal, kills it, creates again + // with the same id. 
Daemon-side cleanup must be done by the time + // the second create runs, otherwise we'd hit "session already + // exists" without an alive shell to adopt. + const terminalId = `e2e-recycle-${randomUUID().slice(0, 8)}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + const firstPid = "error" in first ? -1 : first.pty.pid; + + disposeSession(terminalId, db); + + // Wait for the daemon's onExit handler to mark the session exited + // (SIGTERM → shell exits → wireSession.onExit fires → session.exited + // flips to true → handleOpen can then recycle the id). + await new Promise((r) => setTimeout(r, 800)); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok( + !("error" in second), + `re-create after dispose failed: ${JSON.stringify(second)}`, + ); + if ("error" in second) return; + + // Different shell pid (real fresh spawn) — not adoption. 
+ assert.notEqual( + second.pty.pid, + firstPid, + "re-create after dispose should be a fresh spawn, not adoption of the dead session", + ); + + disposeSession(terminalId, db); + }); +}); + +// ---------------- helpers ---------------- + +async function waitFor(predicate: () => boolean, ms: number): Promise { + const start = Date.now(); + while (!predicate()) { + if (Date.now() - start > ms) throw new Error("waitFor timed out"); + await new Promise((r) => setTimeout(r, 25)); + } +} + +async function waitForOutput( + pty: { onData: (cb: (d: string) => void) => { dispose(): void } }, + marker: string, + ms: number, +): Promise { + let buf = ""; + const disposer = pty.onData((d) => { + buf += d; + }); + try { + await waitFor(() => buf.includes(marker), ms); + } finally { + disposer.dispose(); + } +} diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index cac03519bcc..44a638af6c8 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -13,17 +13,99 @@ import { } from "@superset/shared/terminal-title-scanner"; import { and, eq, ne } from "drizzle-orm"; import type { Hono } from "hono"; -import { type IPty, spawn } from "node-pty"; -import type { HostDb } from "../db"; -import { projects, terminalSessions, workspaces } from "../db/schema"; -import type { EventBus } from "../events"; -import { portManager } from "../ports/port-manager"; +import type { HostDb } from "../db/index.ts"; +import { projects, terminalSessions, workspaces } from "../db/schema.ts"; +import type { EventBus } from "../events/index.ts"; +import { portManager } from "../ports/port-manager.ts"; +import type { DaemonClient } from "./DaemonClient/index.ts"; +import { + getDaemonClient, + onDaemonDisconnect, +} from "./daemon-client-singleton.ts"; import { buildV2TerminalEnv, getShellLaunchArgs, getTerminalBaseEnv, resolveLaunchShell, -} from "./env"; +} from "./env.ts"; + +/** + * Thin 
adapter exposing approximately the IPty surface that the rest of + * this file (and teardown.ts) was built against, so most of the call + * sites stay unchanged after the daemon extraction. The PTY itself lives + * in pty-daemon; this is a remote control. + * + * onData / onExit register additional subscribers on top of whatever the + * session's primary subscription is doing — daemon supports multi- + * subscriber fan-out per session, so layered observers work fine. + */ +interface PtyDataDisposer { + dispose(): void; +} + +interface DaemonPty { + pid: number; + write(data: string): void; + resize(cols: number, rows: number): void; + kill(signal?: NodeJS.Signals): void; + onData(cb: (data: string) => void): PtyDataDisposer; + onExit( + cb: (info: { exitCode: number; signal: number }) => void, + ): PtyDataDisposer; +} + +function makeDaemonPty( + daemon: DaemonClient, + sessionId: string, + pid: number, +): DaemonPty { + return { + pid, + write(data) { + daemon.input(sessionId, Buffer.from(data, "utf8")); + }, + resize(cols, rows) { + try { + daemon.resize(sessionId, cols, rows); + } catch { + // Daemon may have disconnected; surface via the next op. + } + }, + kill(signal) { + daemon + .close( + sessionId, + (signal as "SIGTERM" | "SIGKILL" | "SIGINT" | "SIGHUP") ?? "SIGHUP", + ) + .catch(() => { + // Already gone or daemon disconnected — no-op. + }); + }, + onData(cb) { + const unsub = daemon.subscribe( + sessionId, + { replay: false }, + { + onOutput: (chunk) => cb(chunk.toString("utf8")), + onExit: () => {}, + }, + ); + return { dispose: unsub }; + }, + onExit(cb) { + const unsub = daemon.subscribe( + sessionId, + { replay: false }, + { + onOutput: () => {}, + onExit: ({ code, signal }) => + cb({ exitCode: code ?? 0, signal: signal ?? 
0 }), + }, + ); + return { dispose: unsub }; + }, + }; +} interface RegisterWorkspaceTerminalRouteOptions { app: Hono; @@ -97,7 +179,9 @@ type ShellReadyState = "pending" | "ready" | "timed_out" | "unsupported"; interface TerminalSession { terminalId: string; workspaceId: string; - pty: IPty; + pty: DaemonPty; + /** Unsubscribe from the daemon's output/exit stream when disposed. */ + unsubscribeDaemon: (() => void) | null; sockets: Set; buffer: string[]; bufferBytes: number; @@ -121,6 +205,63 @@ interface TerminalSession { /** PTY lifetime is independent of socket lifetime — sockets detach/reattach freely. */ const sessions = new Map(); +// When the daemon disconnects, close every WS socket so the renderer's +// existing exponential-backoff reconnect kicks in. On reconnect, host-service +// rebuilds the DaemonClient (next getDaemonClient() call), and the adoption- +// via-list path re-attaches to live sessions on the respawned daemon. Without +// this, sockets stay open and input/resize silently fail because the daemon +// reference is dead. +// +// We also clear the in-memory sessions map so a stale subscription closure +// doesn't keep firing for sessions that no longer match daemon state. +onDaemonDisconnect((err) => { + const sessionCount = sessions.size; + if (sessionCount === 0) return; + console.warn( + `[terminal] pty-daemon disconnected (${err?.message ?? 
"no message"}); closing ${sessionCount} terminal WS socket(s) to trigger renderer reconnect`, + ); + for (const session of sessions.values()) { + for (const socket of session.sockets) { + try { + socket.close(1011, "pty-daemon disconnected"); + } catch { + // best-effort + } + } + session.sockets.clear(); + if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + session.unsubscribeDaemon = null; + } + } + sessions.clear(); +}); + +/** + * Test-only escape hatch: simulates a host-service process restart by clearing + * the in-memory session map without touching the daemon. After calling this, + * createTerminalSessionInternal() is forced down the adoption-on-EEXIST path + * for any session id the daemon already owns. + * + * NEVER call this from production code paths. + */ +export function __resetSessionsForTesting(): void { + for (const session of sessions.values()) { + if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + } + } + sessions.clear(); +} + function pruneAndCountOpenSockets(session: TerminalSession): number { let openSockets = 0; for (const socket of session.sockets) { @@ -293,6 +434,15 @@ export function disposeSession(terminalId: string, db: HostDb) { // PTY may already be dead } } + // Stop receiving daemon callbacks for this session. 
+ if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + session.unsubscribeDaemon = null; + } sessions.delete(terminalId); } @@ -348,7 +498,7 @@ interface CreateTerminalSessionOptions { listed?: boolean; } -export function createTerminalSessionInternal({ +export async function createTerminalSessionInternal({ terminalId, workspaceId, themeType, @@ -356,7 +506,7 @@ export function createTerminalSessionInternal({ eventBus, initialCommand, listed = true, -}: CreateTerminalSessionOptions): TerminalSession | { error: string } { +}: CreateTerminalSessionOptions): Promise { const existing = sessions.get(terminalId); if (existing) { if (listed) existing.listed = true; @@ -405,21 +555,47 @@ export function createTerminalSessionInternal({ hostAgentHookUrl: getHostAgentHookUrl(), }); - let pty: IPty; + let daemon: DaemonClient; + let openResult: { pid: number }; + let isAdopted = false; try { - pty = spawn(shell, shellArgs, { - name: "xterm-256color", - cwd, - cols: 120, - rows: 32, - env: ptyEnv, - }); + daemon = await getDaemonClient(); + try { + openResult = await daemon.open(terminalId, { + shell, + argv: shellArgs, + cwd, + cols: 120, + rows: 32, + env: ptyEnv, + }); + } catch (err) { + // After host-service restart the daemon may already own this + // session. Adopt it instead of looping forever on "session already + // exists". The daemon kept the buffer + the live shell; we just + // need to stitch up a TerminalSession record on this side and + // subscribe-with-replay below. + const msg = err instanceof Error ? 
err.message : String(err); + if (msg.includes("session already exists")) { + const list = await daemon.list(); + const found = list.find((s) => s.id === terminalId && s.alive); + if (!found) throw err; + openResult = { pid: found.pid }; + isAdopted = true; + console.log( + `[terminal] adopted existing daemon session ${terminalId} pid=${found.pid}`, + ); + } else { + throw err; + } + } } catch (error) { return { error: error instanceof Error ? error.message : "Failed to start terminal", }; } + const pty: DaemonPty = makeDaemonPty(daemon, terminalId, openResult.pid); const createdAt = Date.now(); @@ -436,9 +612,12 @@ export function createTerminalSessionInternal({ }) .run(); - // Determine shell readiness support + // Determine shell readiness support. Adopted sessions are already past + // shell startup, so treat them as immediately ready — the OSC 133;A + // marker has already flown by and we don't want to gate writes on it. const shellName = shell.split("/").pop() || shell; - const shellSupportsReady = SHELLS_WITH_READY_MARKER.has(shellName); + const shellSupportsReady = + !isAdopted && SHELLS_WITH_READY_MARKER.has(shellName); let shellReadyResolve: (() => void) | null = null; const shellReadyPromise = shellSupportsReady @@ -451,6 +630,7 @@ export function createTerminalSessionInternal({ terminalId, workspaceId, pty, + unsubscribeDaemon: null, sockets: new Set(), buffer: [], bufferBytes: 0, @@ -461,12 +641,18 @@ export function createTerminalSessionInternal({ listed, title: null, titleScanState: createTerminalTitleScanState(), - shellReadyState: shellSupportsReady ? "pending" : "unsupported", + shellReadyState: shellSupportsReady + ? "pending" + : isAdopted + ? "ready" + : "unsupported", shellReadyResolve, shellReadyPromise, shellReadyTimeoutId: null, scanState: createScanState(), - initialCommandQueued: false, + // Adopted sessions have already run their initialCommand in the prior + // host-service lifetime — flag it as queued so we don't double-fire it. 
+ initialCommandQueued: isAdopted, }; sessions.set(terminalId, session); portManager.upsertSession(terminalId, workspaceId, pty.pid); @@ -479,57 +665,69 @@ export function createTerminalSessionInternal({ }, SHELL_READY_TIMEOUT_MS); } - pty.onData((rawData) => { - const titleUpdates = scanForTerminalTitle(session.titleScanState, rawData); - for (const title of titleUpdates.updates) { - setSessionTitle(session, title); - } - - // Scan for OSC 133;A and strip it from output - let data = rawData; - if (session.shellReadyState === "pending") { - const result = scanForShellReady(session.scanState, rawData); - data = result.output; - if (result.matched) { - resolveShellReady(session, "ready"); - } - } - if (data.length === 0) return; - - portManager.checkOutputForHint(data); - - if (broadcastMessage(session, { type: "data", data }) === 0) { - bufferOutput(session, data); - } - }); - - pty.onExit(({ exitCode, signal }) => { - session.exited = true; - session.exitCode = exitCode ?? 0; - session.exitSignal = signal ?? 0; - - portManager.unregisterSession(terminalId); - - db.update(terminalSessions) - .set({ status: "exited", endedAt: Date.now() }) - .where(eq(terminalSessions.id, terminalId)) - .run(); - - broadcastMessage(session, { - type: "exit", - exitCode: session.exitCode, - signal: session.exitSignal, - }); - - eventBus?.broadcastTerminalLifecycle({ - workspaceId, - terminalId, - eventType: "exit", - exitCode: session.exitCode, - signal: session.exitSignal, - occurredAt: Date.now(), - }); - }); + // Subscribe to the daemon's output + exit stream for this session. We + // pass replay:true so a fresh host-service after a restart picks up + // whatever the daemon already had buffered for the session. 
+ session.unsubscribeDaemon = daemon.subscribe( + terminalId, + { replay: true }, + { + onOutput(chunk) { + const rawData = chunk.toString("utf8"); + const titleUpdates = scanForTerminalTitle( + session.titleScanState, + rawData, + ); + for (const title of titleUpdates.updates) { + setSessionTitle(session, title); + } + + // Scan for OSC 133;A and strip it from output. + let data = rawData; + if (session.shellReadyState === "pending") { + const result = scanForShellReady(session.scanState, rawData); + data = result.output; + if (result.matched) { + resolveShellReady(session, "ready"); + } + } + if (data.length === 0) return; + + portManager.checkOutputForHint(data); + + if (broadcastMessage(session, { type: "data", data }) === 0) { + bufferOutput(session, data); + } + }, + onExit({ code, signal }) { + session.exited = true; + session.exitCode = code ?? 0; + session.exitSignal = signal ?? 0; + + portManager.unregisterSession(terminalId); + + db.update(terminalSessions) + .set({ status: "exited", endedAt: Date.now() }) + .where(eq(terminalSessions.id, terminalId)) + .run(); + + broadcastMessage(session, { + type: "exit", + exitCode: session.exitCode, + signal: session.exitSignal, + }); + + eventBus?.broadcastTerminalLifecycle({ + workspaceId, + terminalId, + eventType: "exit", + exitCode: session.exitCode, + signal: session.exitSignal, + occurredAt: Date.now(), + }); + }, + }, + ); if (initialCommand) { queueInitialCommand(session, initialCommand); @@ -555,7 +753,7 @@ export function registerWorkspaceTerminalRoute({ return c.json({ error: "Missing terminalId or workspaceId" }, 400); } - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId: body.terminalId, workspaceId: body.workspaceId, themeType: parseThemeType(body.themeType), @@ -621,27 +819,35 @@ export function registerWorkspaceTerminalRoute({ } const themeType = parseThemeType(c.req.query("themeType")); - const result = createTerminalSessionInternal({ 
- terminalId, - workspaceId, - themeType, - db, - eventBus, - }); + // Daemon open is async; fire-and-forget while keeping the WS alive. + // On success: register the socket; on failure: surface and close. + void (async () => { + const result = await createTerminalSessionInternal({ + terminalId, + workspaceId, + themeType, + db, + eventBus, + }); - if ("error" in result) { - sendMessage(ws, { type: "error", message: result.error }); - ws.close(1011, result.error); - return; - } + if ("error" in result) { + sendMessage(ws, { type: "error", message: result.error }); + ws.close(1011, result.error); + return; + } + + // WS may have closed during the daemon-open await; don't + // register a dead socket into the session's broadcast set. + if (ws.readyState !== SOCKET_OPEN) return; - result.sockets.add(ws); - sendMessage(ws, { type: "title", title: result.title }); + result.sockets.add(ws); + sendMessage(ws, { type: "title", title: result.title }); - db.update(terminalSessions) - .set({ lastAttachedAt: Date.now() }) - .where(eq(terminalSessions.id, terminalId)) - .run(); + db.update(terminalSessions) + .set({ lastAttachedAt: Date.now() }) + .where(eq(terminalSessions.id, terminalId)) + .run(); + })(); return; } diff --git a/packages/host-service/src/trpc/router/host/host.ts b/packages/host-service/src/trpc/router/host/host.ts index d587b0e2501..313a3cf76f1 100644 --- a/packages/host-service/src/trpc/router/host/host.ts +++ b/packages/host-service/src/trpc/router/host/host.ts @@ -12,7 +12,13 @@ import { protectedProcedure, router } from "../../index"; // not uuid. Older host-service binaries call the now-removed `device.*` // procedures and fail at registration. // 0.2.0: `workspaceCreation.adopt` accepts optional `worktreePath`. -const HOST_SERVICE_VERSION = "0.4.0"; +// 0.5.0: pty-daemon supervision moved into host-service. 
New +// `terminal.daemon` tRPC namespace; existing 0.4.x host-services +// don't expose it, so the desktop coordinator must refuse to adopt +// them on upgrade and respawn with the new bundle. Adopting in +// place would leave the new desktop talking to old code with no +// `terminal.daemon.*` routes, breaking Settings → Manage daemon. +const HOST_SERVICE_VERSION = "0.5.0"; const ORGANIZATION_CACHE_TTL_MS = 60 * 60 * 1000; let cachedOrganization: { diff --git a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts new file mode 100644 index 00000000000..cd7ed80a95c --- /dev/null +++ b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts @@ -0,0 +1,118 @@ +// Tests for the `terminal.daemon` tRPC procedures. +// +// We exercise the wiring (procedure → supervisor delegation, env.ORGANIZATION_ID +// resolution) against a stubbed singleton supervisor, not a real spawn. +// Real spawn coverage is in src/daemon/DaemonSupervisor.node-test.ts. + +import { beforeEach, describe, expect, mock, test } from "bun:test"; +// We need to control what `getSupervisor()` returns AND what +// `waitForDaemonReady` does. The cleanest way is to install a stub +// supervisor into the singleton via `getSupervisor("...")` (which +// constructs lazily on first call) then monkey-patch the methods we +// care about. +import { __resetSupervisorForTesting, getSupervisor } from "../../../daemon"; + +// Make env.ORGANIZATION_ID resolvable. The env module reads from +// process.env at module load via @t3-oss/env-core, so we must set +// the var BEFORE importing. 
+process.env.ORGANIZATION_ID = "00000000-0000-4000-8000-000000000000"; +process.env.HOST_SERVICE_SECRET = "test-secret"; +process.env.HOST_DB_PATH = "/tmp/test-host.db"; +process.env.HOST_MIGRATIONS_FOLDER = "/tmp/test-migrations"; +process.env.AUTH_TOKEN = "test-auth-token"; +process.env.SUPERSET_API_URL = "https://cloud.example.com"; + +const { appRouter } = await import("../router.ts"); + +function makeCaller(authenticated = true) { + // Cast to whatever; we only invoke procedures that don't touch db/git/etc. + return appRouter.createCaller({ + isAuthenticated: authenticated, + } as unknown as Parameters[0]); +} + +beforeEach(() => { + __resetSupervisorForTesting(); +}); + +describe("terminal.daemon tRPC procedures", () => { + test("rejects with UNAUTHORIZED when ctx is unauthenticated", async () => { + const caller = makeCaller(false); + await expect(caller.terminal.daemon.getUpdateStatus()).rejects.toThrow( + /Invalid or missing/, + ); + }); + + test("getUpdateStatus delegates to supervisor", async () => { + const sup = getSupervisor("/nonexistent"); + const getUpdateStatusMock = mock(() => ({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + })); + ( + sup as unknown as { getUpdateStatus: typeof sup.getUpdateStatus } + ).getUpdateStatus = getUpdateStatusMock as typeof sup.getUpdateStatus; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.getUpdateStatus(); + + expect(getUpdateStatusMock).toHaveBeenCalledTimes(1); + expect(getUpdateStatusMock).toHaveBeenCalledWith( + "00000000-0000-4000-8000-000000000000", + ); + expect(result).toEqual({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + }); + }); + + test("listSessions awaits bootstrap before delegating", async () => { + const sup = getSupervisor("/nonexistent"); + const order: string[] = []; + + const ensureMock = mock(async () => { + order.push("ensure"); + await new Promise((r) => setTimeout(r, 30)); + return {} as Awaited>; + }); + (sup as unknown as { 
ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + const listMock = mock(async () => { + order.push("list"); + return []; + }); + (sup as unknown as { listSessions: typeof sup.listSessions }).listSessions = + listMock as typeof sup.listSessions; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.listSessions(); + + expect(result).toEqual([]); + // Bootstrap must have started before list resolved. + expect(order[0]).toBe("ensure"); + expect(order).toContain("list"); + }); + + test("restart awaits bootstrap then delegates to supervisor.restart", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock( + async () => ({}) as Awaited>, + ); + const restartMock = mock(async () => ({ success: true as const })); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + (sup as unknown as { restart: typeof sup.restart }).restart = + restartMock as typeof sup.restart; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.restart(); + + expect(result).toEqual({ success: true }); + expect(restartMock).toHaveBeenCalledWith( + "00000000-0000-4000-8000-000000000000", + ); + }); +}); diff --git a/packages/host-service/src/trpc/router/terminal/terminal.ts b/packages/host-service/src/trpc/router/terminal/terminal.ts index b8e36c2a862..411263775e0 100644 --- a/packages/host-service/src/trpc/router/terminal/terminal.ts +++ b/packages/host-service/src/trpc/router/terminal/terminal.ts @@ -1,7 +1,9 @@ import { TRPCError } from "@trpc/server"; import { eq } from "drizzle-orm"; import { z } from "zod"; +import { getSupervisor, waitForDaemonReady } from "../../../daemon"; import { terminalSessions, workspaces } from "../../../db/schema"; +import { env } from "../../../env"; import { createTerminalSessionInternal, disposeSession, @@ -10,6 +12,27 @@ import { } from "../../../terminal/terminal"; import { protectedProcedure, router } from 
"../../index"; +// Daemon control surface — sibling to the per-workspace terminal ops below. +// Org-scoped (one daemon per host-service); reads org id from env. +// Supervisor lives in this same process so calls go through the in-process +// singleton, not over the wire. +const daemonRouter = router({ + // Answered directly by the supervisor; unlike the two procedures below, + // this one does not wait for daemon bootstrap first. + getUpdateStatus: protectedProcedure.query(() => + getSupervisor().getUpdateStatus(env.ORGANIZATION_ID), + ), + + listSessions: protectedProcedure.query(async () => { + // Wait for the bootstrap so the supervisor has a socket path. + await waitForDaemonReady(env.ORGANIZATION_ID); + return getSupervisor().listSessions(env.ORGANIZATION_ID); + }), + + restart: protectedProcedure.mutation(async () => { + // Same bootstrap gate as listSessions before delegating the restart. + await waitForDaemonReady(env.ORGANIZATION_ID); + return getSupervisor().restart(env.ORGANIZATION_ID); + }), +}); + export const terminalRouter = router({ launchSession: protectedProcedure .input( @@ -20,9 +43,9 @@ export const terminalRouter = router({ themeType: z.string().optional(), }), ) - .mutation(({ ctx, input }) => { + .mutation(async ({ ctx, input }) => { const terminalId = input.terminalId ??
crypto.randomUUID(); - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId, workspaceId: input.workspaceId, themeType: parseThemeType(input.themeType), @@ -94,4 +117,6 @@ export const terminalRouter = router({ disposeSession(input.terminalId, ctx.db); return { terminalId: input.terminalId, status: "disposed" as const }; }), + + daemon: daemonRouter, }); diff --git a/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts b/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts index 965b26dc057..d9853b69d76 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts @@ -336,7 +336,7 @@ export const create = protectedProcedure const warnings: string[] = []; if (input.composer.runSetupScript) { - const { terminal, warning } = startSetupTerminalIfPresent({ + const { terminal, warning } = await startSetupTerminalIfPresent({ ctx, workspaceId: cloudRow.id, worktreePath, diff --git a/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts b/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts index 7be2be8f561..e982e2e479f 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts @@ -143,7 +143,7 @@ export async function finishCheckout( const warnings: string[] = [...args.extraWarnings]; if (args.runSetupScript) { - const { terminal, warning } = startSetupTerminalIfPresent({ + const { terminal, warning } = await startSetupTerminalIfPresent({ ctx, workspaceId: cloudRow.id, worktreePath: args.worktreePath, diff --git a/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts 
b/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts index 277061ccabe..9edd312b0d9 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts @@ -4,18 +4,21 @@ import { createTerminalSessionInternal } from "../../../../terminal/terminal"; import type { HostServiceContext } from "../../../../types"; import type { TerminalDescriptor } from "./types"; -export function startSetupTerminalIfPresent(args: { +export async function startSetupTerminalIfPresent(args: { ctx: HostServiceContext; workspaceId: string; worktreePath: string; -}): { terminal: TerminalDescriptor | null; warning: string | null } { +}): Promise<{ + terminal: TerminalDescriptor | null; + warning: string | null; +}> { const setupScriptPath = join(args.worktreePath, ".superset", "setup.sh"); if (!existsSync(setupScriptPath)) { return { terminal: null, warning: null }; } const terminalId = crypto.randomUUID(); - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId, workspaceId: args.workspaceId, db: args.ctx.db, diff --git a/packages/host-service/test/setup-env.ts b/packages/host-service/test/setup-env.ts new file mode 100644 index 00000000000..2f0a33dea56 --- /dev/null +++ b/packages/host-service/test/setup-env.ts @@ -0,0 +1,11 @@ +// Populate the env vars `src/env.ts` validates at module load so test runtimes +// that boot host-service via `createApp` (instead of `serve.ts`) can import +// modules that transitively load the validated env. Real values come from +// each test's `createTestHost` config; these defaults exist purely to satisfy +// schema validation at import time. 
+ +process.env.ORGANIZATION_ID ??= "00000000-0000-4000-8000-000000000000"; +process.env.HOST_DB_PATH ??= "/tmp/host-service-test.db"; +process.env.HOST_MIGRATIONS_FOLDER ??= "/tmp/host-service-test-migrations"; +process.env.AUTH_TOKEN ??= "test-auth-token"; +process.env.SUPERSET_API_URL ??= "http://localhost:0"; diff --git a/packages/port-scanner/src/index.ts b/packages/port-scanner/src/index.ts index dc5bd82a156..6a7e6efe696 100644 --- a/packages/port-scanner/src/index.ts +++ b/packages/port-scanner/src/index.ts @@ -2,15 +2,15 @@ export { type KillFn, PortManager, type PortManagerOptions, -} from "./port-manager"; +} from "./port-manager.ts"; export { getListeningPortsForPids, getProcessTree, type PortInfo, -} from "./scanner"; +} from "./scanner.ts"; export { parseStaticPortsConfig, type StaticPortLabel, type StaticPortsParseResult, -} from "./static-ports"; -export type { DetectedPort } from "./types"; +} from "./static-ports.ts"; +export type { DetectedPort } from "./types.ts"; diff --git a/packages/port-scanner/src/port-manager.ts b/packages/port-scanner/src/port-manager.ts index 7ccc634cd1c..bbd5a3af2fd 100644 --- a/packages/port-scanner/src/port-manager.ts +++ b/packages/port-scanner/src/port-manager.ts @@ -3,8 +3,8 @@ import { getListeningPortsForPids, getProcessTree, type PortInfo, -} from "./scanner"; -import type { DetectedPort } from "./types"; +} from "./scanner.ts"; +import type { DetectedPort } from "./types.ts"; /** How often to poll for port changes (in ms) */ const SCAN_INTERVAL_MS = 2500; diff --git a/packages/port-scanner/src/procfs.ts b/packages/port-scanner/src/procfs.ts index 954c7303aa5..650d85aa552 100644 --- a/packages/port-scanner/src/procfs.ts +++ b/packages/port-scanner/src/procfs.ts @@ -1,5 +1,5 @@ import { promises as fs } from "node:fs"; -import type { PortInfo } from "./scanner"; +import type { PortInfo } from "./scanner.ts"; /** * Linux-only: resolve listening TCP ports for a set of PIDs by reading diff --git 
a/packages/port-scanner/src/scanner.ts b/packages/port-scanner/src/scanner.ts index abe0b603f50..75ab20681e5 100644 --- a/packages/port-scanner/src/scanner.ts +++ b/packages/port-scanner/src/scanner.ts @@ -2,7 +2,7 @@ import { execFile } from "node:child_process"; import os from "node:os"; import { promisify } from "node:util"; import pidtree from "pidtree"; -import { getListeningPortsLinuxProcfs } from "./procfs"; +import { getListeningPortsLinuxProcfs } from "./procfs.ts"; const execFileAsync = promisify(execFile); diff --git a/packages/pty-daemon/README.md b/packages/pty-daemon/README.md new file mode 100644 index 00000000000..b2f169cdd85 --- /dev/null +++ b/packages/pty-daemon/README.md @@ -0,0 +1,119 @@ +# @superset/pty-daemon + +Long-lived PTY-owning process for the v2 desktop terminal. host-service is a +client over a Unix socket; routine host-service upgrades don't touch shells. + +Implements [Phase 1 of the daemon plan](../../apps/desktop/plans/20260429-pty-daemon-implementation.md). +This package is **standalone**: it does not import from `@superset/host-service` +or any other workspace package. Host-service consumes only the protocol types +via `@superset/pty-daemon/protocol`. + +## Runtime + +**Production: Node ≥ 20** (Electron's bundled Node), via +`process.execPath` — exactly the same pattern as `host-service` already +uses today (`packages/host-service/build.ts` → `dist/host-service.js`, +spawned by `apps/desktop/src/main/lib/host-service-coordinator.ts`). +Bun is the build tool, not a runtime. **No new runtime in the desktop +app bundle.** + +**Why not Bun at runtime:** verified during development that node-pty +1.1's master fd handling is incompatible with Bun 1.3 (`tty.ReadStream` +closes immediately, alternate `fs.createReadStream(null, { fd })` +returns EAGAIN with no recovery). The daemon needs a runtime where +node-pty actually works. 
+ +**Dev:** unit tests run under Bun (`bun test`) for speed; integration +tests run under Node (`bun run test:integration`) since they touch real +PTYs. The daemon binary itself runs under Node in both dev and prod. + +## Layout + +``` +src/ +├── main.ts # Node entrypoint: argv → Server.listen() +├── index.ts # Public exports for host-service consumers +├── protocol/ # Wire schemas + length-prefixed framing +│ ├── version.ts # CURRENT_PROTOCOL_VERSION + supported list +│ ├── messages.ts # ClientMessage / ServerMessage unions +│ ├── framing.ts # encodeFrame / FrameDecoder (4-byte BE prefix) +│ └── index.ts +├── Pty/ # node-pty thin wrapper with dim validation +│ ├── Pty.ts +│ └── index.ts +├── SessionStore/ # in-memory map + 64KB ring buffer per session +│ ├── SessionStore.ts +│ └── index.ts +├── handlers/ # pure functions: open/input/resize/close/list/subscribe +│ ├── handlers.ts +│ └── index.ts +└── Server/ # AF_UNIX SOCK_STREAM accept loop, handshake, dispatch + ├── Server.ts + └── index.ts + +test/ +├── helpers/ +│ └── client.ts # reusable DaemonClient: connect, send, waitFor, collect +├── integration.test.ts # smoke / happy-path (3 tests) +└── control-plane.test.ts # exhaustive control-plane coverage (25 tests) + +build.ts # Bun bundler → dist/pty-daemon.js (target: node) +``` + +## Design notes + +- **Stateless from the client's perspective.** Every protocol call carries + full context. No client tracking, no session tombstones, no business + rules. Single design principle from + [the implementation plan](../../apps/desktop/plans/20260429-pty-daemon-implementation.md#the-single-design-principle). +- **Auth boundary = Unix socket file mode 0600.** No in-band tokens. The + daemon trusts whoever can open the socket. +- **Buffer is in-memory only.** Survives host-service restarts (because the + daemon does), but never persisted to disk. No SQLite, no scrollback files. + v1's `HistoryManager` is explicitly out of scope. 
+- **Protocol versioned from day one.** Handshake (`hello` / `hello-ack`) + picks the highest mutually supported version. + +## Testing + +```sh +bun test # 24 unit tests (protocol framing, handlers, SessionStore, Pty validation) +bun run test:integration # integration tests under node --test (28 in the first two files): + # - test/integration.test.ts (smoke / happy-path, 3 tests) + # - test/control-plane.test.ts (every usage pattern, 25 tests) + # - test/signal-recovery.test.ts (also run by the script) +bun run typecheck # tsc --noEmit +bun run build:daemon # bundle src/main.ts → dist/pty-daemon.js (target: node) +``` + +**Control-plane coverage** (`test/control-plane.test.ts`): + +- Handshake: rejects non-hello first, picks highest mutual protocol, rejects unsupported, rejects duplicate hello. +- Session lifecycle: invalid dims, duplicate ids, ENOENT on missing, instant-exit shells, SIGKILL on hung shells. +- I/O patterns: resize during running shell, burst output (200 lines), multi-byte UTF-8 (🚀). +- Multi-client fan-out: two subscribers see same output, unsubscribe stops further delivery, dropped subscriber doesn't crash daemon. +- Detach + reattach (the headline feature): late subscriber gets replay, full reattach cycle continues live after disconnect. +- list reflects active sessions with cols/rows/alive. +- Hostile input: malformed frames disconnect cleanly, oversized frames are rejected, input on exited session returns EEXITED. +- Concurrency: 20 sessions in parallel from one connection, 10 connections opening sessions in parallel. +- Server shutdown: in-flight clients disconnect cleanly, owned PTYs are killed. +- Framing: tolerates frames split across multiple socket reads. + +Why two runners? `bun test` is fast for pure-JS work. node-pty doesn't work +under Bun, so anything that spawns a real PTY runs under Node. +The Node-run scripts (`start`, `test:integration`) pass `--experimental-strip-types`, +which requires Node ≥ 22.6. + +## Running locally + +```sh +bun run start --socket=/tmp/pty-daemon.sock +``` + +Logs go to stderr; stdout stays empty (so the daemon can later be supervised +by host-service with stdout reserved for protocol or kept dark).
+ +## Out of scope (Phase 1) + +- Host-service integration (DaemonClient, terminal.ts refactor, manifest + adoption) — separate PR. +- Daemon-upgrade handoff via `child_process.spawn` `stdio` fd inheritance + — separate PR (Phase 2 of the plan). +- Windows ConPTY — not in v1 protocol; defer until Windows users justify it. diff --git a/packages/pty-daemon/build.ts b/packages/pty-daemon/build.ts new file mode 100644 index 00000000000..ff7613a8b13 --- /dev/null +++ b/packages/pty-daemon/build.ts @@ -0,0 +1,32 @@ +/** + * Bundles the pty-daemon entry point into a single JS file executable by a + * standalone Node.js runtime (matches packages/host-service/build.ts). Native + * addons (node-pty) are marked external and resolved from the desktop app's + * lib/native/ at runtime. + * + * Production: Electron spawns the daemon via process.execPath (its bundled + * Node), exactly like host-service. No Bun in the production bundle. + */ +import { existsSync, mkdirSync } from "node:fs"; + +const outdir = "dist"; +if (!existsSync(outdir)) { + mkdirSync(outdir, { recursive: true }); +} + +const result = await Bun.build({ + entrypoints: ["src/main.ts"], + target: "node", + outdir, + naming: "pty-daemon.js", + format: "esm", + external: ["node-pty"], +}); + +if (!result.success) { + console.error("[pty-daemon] build failed:"); + for (const log of result.logs) { + console.error(log); + } + process.exit(1); +} diff --git a/packages/pty-daemon/package.json b/packages/pty-daemon/package.json new file mode 100644 index 00000000000..1a116d060f5 --- /dev/null +++ b/packages/pty-daemon/package.json @@ -0,0 +1,39 @@ +{ + "name": "@superset/pty-daemon", + "version": "0.1.0", + "private": true, + "type": "module", + "exports": { + ".": { + "types": "./src/index.ts", + "default": "./src/index.ts" + }, + "./protocol": { + "types": "./src/protocol/index.ts", + "default": "./src/protocol/index.ts" + } + }, + "bin": { + "pty-daemon": "./src/main.ts" + }, + "engines": { + "node": ">=20" + }, + 
"scripts": { + "clean": "git clean -xdf .cache .turbo dist node_modules", + "start": "node --experimental-strip-types src/main.ts", + "build:daemon": "bun run build.ts", + "typecheck": "tsc --noEmit --emitDeclarationOnly false", + "test": "bun test src/protocol src/SessionStore src/handlers src/Pty/Pty.test.ts", + "test:integration": "node --experimental-strip-types --test test/integration.test.ts test/control-plane.test.ts test/signal-recovery.test.ts" + }, + "dependencies": { + "node-pty": "1.1.0" + }, + "devDependencies": { + "@superset/typescript": "workspace:*", + "@types/node": "^24.9.1", + "bun-types": "^1.3.1", + "typescript": "^5.9.3" + } +} diff --git a/packages/pty-daemon/src/Pty/Pty.test.ts b/packages/pty-daemon/src/Pty/Pty.test.ts new file mode 100644 index 00000000000..0cbad51032b --- /dev/null +++ b/packages/pty-daemon/src/Pty/Pty.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, test } from "bun:test"; +import { spawn } from "./Pty.ts"; + +// node-pty's runtime requires Node (Bun's tty.ReadStream handling is +// incompatible with the master fd setup). The daemon ships running under +// node; integration spawn tests live in test/integration.test.ts and run via +// `bun run test:integration`. Here we only cover the synchronous validation +// logic that doesn't require spawning a real PTY.
import * as nodePty from "node-pty";
import type { SessionMeta } from "../protocol/index.ts";

/** Callback receiving each raw output chunk produced by the PTY. */
export type PtyOnData = (data: Buffer) => void;

/** Callback receiving the exit status once the PTY's process ends. */
export type PtyOnExit = (info: {
	code: number | null;
	signal: number | null;
}) => void;

/** Minimal PTY surface the rest of the daemon depends on. */
export interface Pty {
	readonly pid: number;
	readonly meta: SessionMeta;
	write(data: Buffer): void;
	resize(cols: number, rows: number): void;
	kill(signal?: NodeJS.Signals): void;
	onData(cb: PtyOnData): void;
	onExit(cb: PtyOnExit): void;
}

export interface SpawnOptions {
	meta: SessionMeta;
}

/**
 * Largest accepted column/row count. Terminal window sizes are carried in
 * 16-bit fields (`struct winsize`), so anything larger would wrap silently
 * instead of failing.
 */
const MAX_PTY_DIM = 0xffff;

/** Adapts a node-pty terminal to the daemon's `Pty` interface. */
class NodePtyAdapter implements Pty {
	readonly pid: number;
	meta: SessionMeta;
	private readonly term: nodePty.IPty;

	constructor(term: nodePty.IPty, meta: SessionMeta) {
		this.term = term;
		this.pid = term.pid;
		this.meta = meta;
	}

	write(data: Buffer): void {
		// node-pty's write accepts strings or buffers; pass buffer to keep bytes intact.
		this.term.write(data as unknown as string);
	}

	/** Validates the new size, applies it, and records it in `meta`. */
	resize(cols: number, rows: number): void {
		validateDims(cols, rows);
		this.term.resize(cols, rows);
		this.meta = { ...this.meta, cols, rows };
	}

	kill(signal?: NodeJS.Signals): void {
		this.term.kill(signal);
	}

	onData(cb: PtyOnData): void {
		this.term.onData((d) => {
			// Normalise to Buffer whichever form node-pty hands us.
			cb(typeof d === "string" ? Buffer.from(d, "utf8") : d);
		});
	}

	onExit(cb: PtyOnExit): void {
		this.term.onExit(({ exitCode, signal }) => {
			cb({ code: exitCode ?? null, signal: signal ?? null });
		});
	}
}

/**
 * Throws unless both dimensions are integers in `1..MAX_PTY_DIM`.
 * `cols` is checked before `rows`, so a doubly-invalid size reports cols.
 *
 * @throws Error with message `invalid cols: <n>` or `invalid rows: <n>`.
 */
function validateDims(cols: number, rows: number): void {
	const dims = [
		["cols", cols],
		["rows", rows],
	] as const;
	for (const [label, value] of dims) {
		if (!Number.isInteger(value) || value <= 0 || value > MAX_PTY_DIM) {
			throw new Error(`invalid ${label}: ${value}`);
		}
	}
}

/**
 * Spawns `meta.shell` with `meta.argv` in a new PTY sized `meta.cols` x
 * `meta.rows`, using `meta.cwd` and `meta.env`.
 *
 * @throws Error when the requested dimensions are invalid (checked before
 *   any process is spawned).
 */
export function spawn({ meta }: SpawnOptions): Pty {
	validateDims(meta.cols, meta.rows);
	const term = nodePty.spawn(meta.shell, meta.argv, {
		name: "xterm-256color",
		cols: meta.cols,
		rows: meta.rows,
		cwd: meta.cwd,
		env: meta.env,
		// node-pty's encoding defaults to utf8; we want raw bytes for fidelity.
		encoding: null,
	});
	return new NodePtyAdapter(term, meta);
}
import * as fs from "node:fs";
import * as net from "node:net";
import * as path from "node:path";
import type { Conn, HandlerCtx } from "../handlers/index.ts";
import {
	handleClose,
	handleInput,
	handleList,
	handleOpen,
	handleResize,
	handleSubscribe,
	handleUnsubscribe,
} from "../handlers/index.ts";
import {
	type ClientMessage,
	encodeFrame,
	FrameDecoder,
	type HelloMessage,
	type ServerMessage,
	SUPPORTED_PROTOCOL_VERSIONS,
} from "../protocol/index.ts";
import type { Session } from "../SessionStore/index.ts";
import { SessionStore } from "../SessionStore/index.ts";

export interface ServerOptions {
	socketPath: string;
	daemonVersion: string;
	bufferCap?: number;
}

/** Per-connection state: the socket, its frame decoder and handshake result. */
interface ConnState extends Conn {
	socket: net.Socket;
	decoder: FrameDecoder;
	/** Negotiated protocol version; `null` until a valid `hello` arrives. */
	negotiated: number | null;
}

/**
 * Unix-socket server that owns the session store, performs the
 * `hello` / `hello-ack` handshake per connection, and dispatches framed
 * client messages to the handlers.
 */
export class Server {
	private readonly server: net.Server;
	private readonly store: SessionStore;
	private readonly conns = new Set<ConnState>();
	private readonly opts: ServerOptions;

	constructor(opts: ServerOptions) {
		this.opts = opts;
		this.store = new SessionStore({ bufferCap: opts.bufferCap });
		this.server = net.createServer((socket) => this.onConnection(socket));
	}

	/**
	 * Creates the socket's parent directory, removes any file already at
	 * `socketPath`, starts listening, then restricts the socket to mode 0600.
	 * Rejects if the unlink fails for a reason other than ENOENT or if the
	 * server emits `error` before it is listening.
	 */
	async listen(): Promise<void> {
		const dir = path.dirname(this.opts.socketPath);
		fs.mkdirSync(dir, { recursive: true });
		// Stale-socket cleanup: remove any prior socket file at this path.
		try {
			fs.unlinkSync(this.opts.socketPath);
		} catch (err) {
			if ((err as NodeJS.ErrnoException).code !== "ENOENT") throw err;
		}
		await new Promise<void>((resolve, reject) => {
			this.server.once("error", reject);
			this.server.listen(this.opts.socketPath, () => {
				this.server.off("error", reject);
				resolve();
			});
		});
		// Owner-only access. The socket file IS the auth boundary.
		fs.chmodSync(this.opts.socketPath, 0o600);
	}

	/**
	 * Destroys every client socket, SIGKILLs every owned PTY, stops the
	 * listener and removes the socket file (best effort).
	 */
	async close(): Promise<void> {
		for (const c of this.conns) c.socket.destroy();
		this.conns.clear();
		// Kill all owned PTYs so the daemon process can actually exit (open
		// master fds keep the event loop alive). This is what the v1 lessons
		// call "synchronous teardown only" — no setTimeout, no graceful drain.
		for (const session of this.store.all()) {
			try {
				session.pty.kill("SIGKILL");
			} catch {
				// already dead, ignore
			}
		}
		await new Promise<void>((resolve) => this.server.close(() => resolve()));
		try {
			fs.unlinkSync(this.opts.socketPath);
		} catch {
			// ignore
		}
	}

	/** Registers a new client socket and wires its data/close/error events. */
	private onConnection(socket: net.Socket): void {
		const conn: ConnState = {
			socket,
			decoder: new FrameDecoder(),
			negotiated: null,
			subscriptions: new Set(),
			send: (msg) => writeMessage(socket, msg),
		};
		this.conns.add(conn);

		socket.on("data", (chunk) => {
			try {
				conn.decoder.push(chunk);
				for (const raw of conn.decoder.drain()) {
					// A previous frame in this chunk may have been fatal. Stop here
					// so a rejected client cannot go on to handshake and open PTYs
					// with frames that were already buffered behind the bad one.
					if (socket.destroyed) break;
					this.dispatch(conn, raw as ClientMessage);
				}
			} catch (err) {
				conn.send({
					type: "error",
					message: err instanceof Error ? err.message : String(err),
					code: "EPROTO",
				});
				socket.destroy();
			}
		});
		socket.on("close", () => {
			this.conns.delete(conn);
		});
		socket.on("error", () => {
			this.conns.delete(conn);
		});
	}

	/**
	 * Routes one decoded client message. Before the handshake only `hello`
	 * is accepted (anything else, or no mutually supported protocol, is
	 * answered with an error and the socket is destroyed). Afterwards each
	 * message type goes to its handler and any reply is sent back.
	 */
	private dispatch(conn: ConnState, msg: ClientMessage): void {
		// Handshake must come first.
		if (conn.negotiated === null) {
			if (msg.type !== "hello") {
				conn.send({ type: "error", message: "expected hello", code: "EPROTO" });
				conn.socket.destroy();
				return;
			}
			const negotiated = pickProtocol(msg);
			if (negotiated === null) {
				conn.send({
					type: "error",
					message: `no compatible protocol; daemon supports ${SUPPORTED_PROTOCOL_VERSIONS.join(",")}`,
					code: "EVERSION",
				});
				conn.socket.destroy();
				return;
			}
			conn.negotiated = negotiated;
			conn.send({
				type: "hello-ack",
				protocol: negotiated,
				daemonVersion: this.opts.daemonVersion,
			});
			return;
		}

		const ctx = this.handlerCtx();
		switch (msg.type) {
			case "hello": {
				// A second hello is reported but the connection stays open.
				conn.send({
					type: "error",
					message: "duplicate hello",
					code: "EPROTO",
				});
				return;
			}
			case "open": {
				conn.send(handleOpen(ctx, msg));
				return;
			}
			case "input": {
				const reply = handleInput(ctx, msg);
				if (reply) conn.send(reply);
				return;
			}
			case "resize": {
				const reply = handleResize(ctx, msg);
				if (reply) conn.send(reply);
				return;
			}
			case "close": {
				conn.send(handleClose(ctx, msg));
				return;
			}
			case "list": {
				conn.send(handleList(ctx));
				return;
			}
			case "subscribe": {
				handleSubscribe(ctx, conn, msg);
				return;
			}
			case "unsubscribe": {
				handleUnsubscribe(conn, msg);
				return;
			}
			default: {
				const t = (msg as { type: string }).type;
				conn.send({
					type: "error",
					message: `unknown op: ${t}`,
					code: "EPROTO",
				});
				return;
			}
		}
	}

	/** Builds the context object handed to the handlers. */
	private handlerCtx(): HandlerCtx {
		return {
			store: this.store,
			wireSession: (session) => this.wireSession(session),
		};
	}

	/**
	 * Pipe the session's PTY events into the broadcast set: any connection
	 * subscribed to this session id receives the output / exit frames.
	 */
	private wireSession(session: Session): void {
		session.pty.onData((chunk) => {
			this.store.appendOutput(session, chunk);
			const out: ServerMessage = {
				type: "output",
				id: session.id,
				data: chunk.toString("base64"),
			};
			for (const c of this.conns) {
				if (c.subscriptions.has(session.id)) c.send(out);
			}
		});
		session.pty.onExit((info) => {
			session.exited = true;
			session.exitCode = info.code;
			session.exitSignal = info.signal;
			const ev: ServerMessage = {
				type: "exit",
				id: session.id,
				code: info.code,
				signal: info.signal,
			};
			for (const c of this.conns) {
				if (c.subscriptions.has(session.id)) {
					c.send(ev);
					c.subscriptions.delete(session.id);
				}
			}
			// Delete the session immediately. Without this, every closed
			// terminal pane left a row in the store forever — list-reply
			// inflated, memory grew unbounded.
			//
			// Tradeoff: a late subscriber that connects after this point
			// (e.g. host-service restarting *during* the shell exit window)
			// gets ENOENT instead of the buffered output + exit event. The
			// renderer's xterm.js already has whatever was rendered before
			// disconnect — it just loses the "Process exited with code N"
			// footer for that narrow window.
			this.store.delete(session.id);
		});
	}
}

/**
 * Returns the highest protocol version offered in `hello.protocols` that
 * the daemon also supports, or `null` when there is no overlap.
 */
function pickProtocol(hello: HelloMessage): number | null {
	const supported = new Set<number>(SUPPORTED_PROTOCOL_VERSIONS);
	let best: number | null = null;
	for (const v of hello.protocols) {
		if (supported.has(v) && (best === null || v > best)) best = v;
	}
	return best;
}

/** Frames and writes `msg`; a no-op once the socket has been destroyed. */
function writeMessage(socket: net.Socket, msg: ServerMessage): void {
	if (socket.destroyed) return;
	socket.write(encodeFrame(msg));
}
+ }); + + test("list reflects sessions", () => { + const store = new SessionStore(); + store.add("a", fakePty({ cols: 80, rows: 24 })); + store.add("b", fakePty({ cols: 100, rows: 30 })); + const list = store.list(); + expect(list).toHaveLength(2); + expect(list.map((s) => s.id).sort()).toEqual(["a", "b"]); + }); + + test("appendOutput accumulates within cap", () => { + const store = new SessionStore({ bufferCap: 100 }); + const session = store.add("s0", fakePty({ cols: 80, rows: 24 })); + store.appendOutput(session, Buffer.from("hello")); + store.appendOutput(session, Buffer.from(" world")); + expect(store.snapshotBuffer(session).toString()).toBe("hello world"); + expect(session.bufferBytes).toBe(11); + }); + + test("appendOutput evicts oldest chunks when exceeding cap", () => { + const store = new SessionStore({ bufferCap: 10 }); + const session = store.add("s0", fakePty({ cols: 80, rows: 24 })); + store.appendOutput(session, Buffer.from("AAAA")); // 4 + store.appendOutput(session, Buffer.from("BBBB")); // 8 + store.appendOutput(session, Buffer.from("CCCCCC")); // would be 14 → evict AAAA + const snap = store.snapshotBuffer(session).toString(); + expect(snap).toBe("BBBBCCCCCC"); + expect(session.bufferBytes).toBe(10); + }); + + test("appendOutput keeps buffer within cap across many writes", () => { + const store = new SessionStore({ bufferCap: 32 }); + const session = store.add("s0", fakePty({ cols: 80, rows: 24 })); + for (let i = 0; i < 100; i++) { + store.appendOutput( + session, + Buffer.from(`chunk${i.toString().padStart(2, "0")}-`), + ); + } + expect(session.bufferBytes).toBeLessThanOrEqual(32); + // Final chunk must always be present + expect(store.snapshotBuffer(session).toString()).toContain("chunk99-"); + }); +}); diff --git a/packages/pty-daemon/src/SessionStore/SessionStore.ts b/packages/pty-daemon/src/SessionStore/SessionStore.ts new file mode 100644 index 00000000000..40f69d96979 --- /dev/null +++ 
/** Default replay-buffer cap per session, in bytes (64 KiB). */
const DEFAULT_BUFFER_BYTES = 64 * 1024;

/** Daemon-local record for one PTY session. */
export interface Session {
	id: string;
	pty: Pty;
	/** ring buffer for replay-on-attach; in-memory only, never persisted. */
	buffer: Buffer[];
	/** Sum of `byteLength` over every chunk currently in `buffer`. */
	bufferBytes: number;
	/** Upper bound for `bufferBytes`, copied from the store when the session is added. */
	bufferCap: number;
	exited: boolean;
	exitCode: number | null;
	exitSignal: number | null;
}

export interface SessionStoreOptions {
	/** Per-session replay-buffer cap in bytes; defaults to DEFAULT_BUFFER_BYTES. */
	bufferCap?: number;
}

/**
 * In-memory map of active sessions. Daemon-local state; nothing is persisted.
 *
 * Replay buffer is a FIFO of byte chunks per session, capped by total byte
 * size. When new output exceeds the cap, oldest chunks are dropped (head).
 * The default cap is small (64 KB) — enough to redraw a typical shell screen
 * on attach. Larger scrollback is the renderer's xterm.js responsibility.
 */
export class SessionStore {
	private readonly sessions = new Map<string, Session>();
	private readonly bufferCap: number;

	constructor(opts: SessionStoreOptions = {}) {
		this.bufferCap = opts.bufferCap ?? DEFAULT_BUFFER_BYTES;
	}

	/**
	 * Register a new session under `id` with an empty replay buffer.
	 *
	 * @throws Error if a session with the same id is already stored.
	 */
	add(id: string, pty: Pty): Session {
		if (this.sessions.has(id)) {
			throw new Error(`session already exists: ${id}`);
		}
		const session: Session = {
			id,
			pty,
			buffer: [],
			bufferBytes: 0,
			bufferCap: this.bufferCap,
			exited: false,
			exitCode: null,
			exitSignal: null,
		};
		this.sessions.set(id, session);
		return session;
	}

	/** Look up a session by id; `undefined` when it is not stored. */
	get(id: string): Session | undefined {
		return this.sessions.get(id);
	}

	/** Remove a session entry; returns whether an entry was removed. */
	delete(id: string): boolean {
		return this.sessions.delete(id);
	}

	/** Wire-format summary of every stored session (exited ones report `alive: false`). */
	list(): SessionInfo[] {
		const out: SessionInfo[] = [];
		for (const s of this.sessions.values()) {
			out.push({
				id: s.id,
				pid: s.pty.pid,
				cols: s.pty.meta.cols,
				rows: s.pty.meta.rows,
				alive: !s.exited,
			});
		}
		return out;
	}

	/** Iterate over every stored session record. */
	all(): IterableIterator<Session> {
		return this.sessions.values();
	}

	/** Number of stored sessions. */
	size(): number {
		return this.sessions.size;
	}

	/**
	 * Append output to a session's replay buffer; evict oldest chunks past the cap.
	 *
	 * A single chunk at or above the cap replaces the whole buffer with its
	 * newest `bufferCap` bytes, so the most recent output is never discarded
	 * wholesale. Zero-length chunks are ignored.
	 */
	appendOutput(session: Session, chunk: Buffer): void {
		// Empty chunks add no bytes, so the eviction loop would never remove them.
		if (chunk.byteLength === 0) return;

		const cap = session.bufferCap;
		if (chunk.byteLength >= cap) {
			// Clear in place so anyone holding a reference to the array stays in sync.
			session.buffer.length = 0;
			session.bufferBytes = 0;
			if (cap > 0) {
				// Copy the tail so the oversized source chunk is not kept alive.
				const tail = Buffer.from(chunk.subarray(chunk.byteLength - cap));
				session.buffer.push(tail);
				session.bufferBytes = tail.byteLength;
			}
			return;
		}

		session.buffer.push(chunk);
		session.bufferBytes += chunk.byteLength;
		while (session.bufferBytes > cap && session.buffer.length > 0) {
			const head = session.buffer.shift();
			if (head) session.bufferBytes -= head.byteLength;
		}
	}

	/** Snapshot the buffered bytes for replay; doesn't clear the buffer. */
	snapshotBuffer(session: Session): Buffer {
		return Buffer.concat(session.buffer);
	}
}
const store = new SessionStore(); + return { + store, + spawnedStates: states, + wired, + wireSession: (s) => { + wired.push(s.pty); + }, + spawnPty: (opts) => { + const state: FakePtyState = { + pid: nextPid++, + cols: opts.meta.cols, + rows: opts.meta.rows, + written: [], + killed: false, + }; + states.push(state); + return makeFakePty(state, opts.meta); + }, + }; +} + +beforeEach(() => { + nextPid = 1000; + states = []; + wired = []; +}); + +describe("handlers", () => { + test("open: spawns a session and replies open-ok", () => { + const ctx = makeCtx(); + const reply = handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + expect(reply.type).toBe("open-ok"); + if (reply.type === "open-ok") expect(reply.pid).toBe(1000); + expect(ctx.store.size()).toBe(1); + expect(ctx.wired).toHaveLength(1); + }); + + test("open: rejects duplicate ids", () => { + const ctx = makeCtx(); + const meta = { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }; + handleOpen(ctx, { type: "open", id: "s0", meta }); + const reply = handleOpen(ctx, { type: "open", id: "s0", meta }); + expect(reply.type).toBe("error"); + }); + + test("input writes bytes to the pty", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const result = handleInput(ctx, { + type: "input", + id: "s0", + data: Buffer.from("hello").toString("base64"), + }); + expect(result).toBeUndefined(); + expect(states[0]?.written.map((b) => b.toString())).toEqual(["hello"]); + }); + + test("input on missing session returns error", () => { + const ctx = makeCtx(); + const result = handleInput(ctx, { + type: "input", + id: "missing", + data: "", + }); + expect(result?.type).toBe("error"); + }); + + test("resize updates dims", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + 
expect( + handleResize(ctx, { type: "resize", id: "s0", cols: 100, rows: 30 }), + ).toBeUndefined(); + expect(states[0]?.cols).toBe(100); + expect(states[0]?.rows).toBe(30); + }); + + test("close kills the pty and replies closed", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const reply = handleClose(ctx, { type: "close", id: "s0" }); + expect(reply.type).toBe("closed"); + expect(states[0]?.killed).toBe(true); + }); + + test("list returns all sessions", () => { + const ctx = makeCtx(); + const meta = { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }; + handleOpen(ctx, { type: "open", id: "a", meta }); + handleOpen(ctx, { type: "open", id: "b", meta }); + const reply = handleList(ctx); + expect(reply.sessions).toHaveLength(2); + }); + + test("subscribe with replay sends buffered output", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const session = ctx.store.get("s0"); + if (!session) throw new Error("no session"); + ctx.store.appendOutput(session, Buffer.from("prior bytes")); + + const conn = makeConn(); + handleSubscribe(ctx, conn, { type: "subscribe", id: "s0", replay: true }); + expect(conn.subscriptions.has("s0")).toBe(true); + expect(conn.sent).toHaveLength(1); + const m = conn.sent[0]; + expect(m?.type).toBe("output"); + if (m?.type === "output") { + expect(Buffer.from(m.data, "base64").toString()).toBe("prior bytes"); + } + }); + + test("subscribe without replay does not send buffered output", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const session = ctx.store.get("s0"); + if (!session) throw new Error("no session"); + ctx.store.appendOutput(session, Buffer.from("prior bytes")); + + const conn = makeConn(); + handleSubscribe(ctx, conn, { type: 
/**
 * Per-connection state owned by the Server. Handlers receive a Conn ref to
 * read/write subscription membership and to send messages.
 */
export interface Conn {
	/** Ids of the sessions this connection is subscribed to. */
	subscriptions: Set<string>;
	send(message: ServerMessage): void;
}

/**
 * Wire a freshly-created session's PTY events into the broadcast pipeline.
 * Called once at session-open time. The Server owns the broadcast set.
 */
export type SessionWirer = (session: Session) => void;

export interface HandlerCtx {
	store: SessionStore;
	wireSession: SessionWirer;
	/** Pluggable spawn for testability; defaults to real node-pty in production. */
	spawnPty?: (opts: SpawnOptions) => Pty;
}

/**
 * Spawn a PTY for `msg.id`, register it in the store and wire its events.
 *
 * @returns `open-ok` with the PTY pid, or an `error` reply with code
 *   `EEXIST` (a live session already uses the id) or `ESPAWN` (spawning or
 *   registering the session threw).
 */
export function handleOpen(ctx: HandlerCtx, msg: OpenMessage): ServerMessage {
	const existing = ctx.store.get(msg.id);
	if (existing) {
		// If the existing entry is for an already-exited shell, treat the open
		// as recycling the id: drop the dead entry and let the spawn proceed.
		// Live shells still reject with EEXIST so host-service drives the
		// adoption-via-list path.
		if (existing.exited) {
			ctx.store.delete(msg.id);
		} else {
			return errorFor(msg.id, `session already exists: ${msg.id}`, "EEXIST");
		}
	}
	let session: Session;
	const spawnFn = ctx.spawnPty ?? defaultSpawn;
	try {
		const pty = spawnFn({ meta: msg.meta });
		session = ctx.store.add(msg.id, pty);
	} catch (err) {
		return errorFor(msg.id, messageOf(err), "ESPAWN");
	}
	ctx.wireSession(session);
	const reply: OpenOkMessage = {
		type: "open-ok",
		id: msg.id,
		pid: session.pty.pid,
	};
	return reply;
}

/**
 * Decode the base64 payload and write it to the session's PTY.
 *
 * @returns `undefined` on success, otherwise an `error` reply with code
 *   `ENOENT` (unknown id), `EEXITED` (session already exited) or `EWRITE`
 *   (the write threw).
 */
export function handleInput(
	ctx: HandlerCtx,
	msg: InputMessage,
): ServerMessage | undefined {
	const session = ctx.store.get(msg.id);
	if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT");
	if (session.exited)
		return errorFor(msg.id, `session exited: ${msg.id}`, "EEXITED");
	try {
		session.pty.write(Buffer.from(msg.data, "base64"));
	} catch (err) {
		return errorFor(msg.id, messageOf(err), "EWRITE");
	}
	return undefined;
}

/**
 * Resize the session's PTY to `msg.cols` x `msg.rows`.
 *
 * @returns `undefined` on success, otherwise an `error` reply with code
 *   `ENOENT` (unknown id) or `ERESIZE` (the resize threw).
 */
export function handleResize(
	ctx: HandlerCtx,
	msg: ResizeMessage,
): ServerMessage | undefined {
	const session = ctx.store.get(msg.id);
	if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT");
	try {
		session.pty.resize(msg.cols, msg.rows);
	} catch (err) {
		return errorFor(msg.id, messageOf(err), "ERESIZE");
	}
	return undefined;
}

/**
 * Signal the session's PTY to terminate. This handler does not remove the
 * session from the store.
 *
 * @returns `closed` once the signal was sent, otherwise an `error` reply
 *   with code `ENOENT` (unknown id) or `EKILL` (the kill threw).
 */
export function handleClose(ctx: HandlerCtx, msg: CloseMessage): ServerMessage {
	const session = ctx.store.get(msg.id);
	if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT");
	try {
		// SIGHUP is the right signal for "your terminal is going away" —
		// what the kernel sends when a TTY actually closes. Interactive
		// shells (especially `zsh -l`) trap SIGTERM and stay alive, so
		// using SIGTERM as the default leaks PTY processes on every
		// pane close. Callers can still pass an explicit signal.
		session.pty.kill(msg.signal ?? "SIGHUP");
	} catch (err) {
		return errorFor(msg.id, messageOf(err), "EKILL");
	}
	return { type: "closed", id: msg.id };
}

/** Reply with the store's current session list. */
export function handleList(ctx: HandlerCtx): ListReplyMessage {
	return { type: "list-reply", sessions: ctx.store.list() };
}

/**
 * Subscribe the connection to a session. If `replay` is true, immediately
 * send an `output` frame containing the buffered bytes before live streaming
 * begins. Live streaming is the Server's job once `subscriptions` includes
 * this session id.
 *
 * An unknown session id is reported through `conn.send` as an `ENOENT`
 * error and the connection is left unsubscribed.
 */
export function handleSubscribe(
	ctx: HandlerCtx,
	conn: Conn,
	msg: SubscribeMessage,
): void {
	const session = ctx.store.get(msg.id);
	if (!session) {
		conn.send(errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT"));
		return;
	}
	conn.subscriptions.add(msg.id);
	if (msg.replay) {
		const snap = ctx.store.snapshotBuffer(session);
		// Nothing buffered yet: skip the frame rather than send empty output.
		if (snap.byteLength > 0) {
			const out: OutputMessage = {
				type: "output",
				id: msg.id,
				data: snap.toString("base64"),
			};
			conn.send(out);
		}
	}
}

/** Drop the session id from the connection's subscription set. */
export function handleUnsubscribe(conn: Conn, msg: UnsubscribeMessage): void {
	conn.subscriptions.delete(msg.id);
}

/** Build an `error` reply. */
function errorFor(
	id: string | undefined,
	message: string,
	code?: string,
): ServerMessage {
	return { type: "error", id, message, code };
}

/**
 * Extract a human-readable message from a caught value. Anything can be
 * thrown, so non-Error values are stringified instead of read through an
 * unchecked cast, which would put `undefined` into the reply's `message`.
 */
function messageOf(err: unknown): string {
	return err instanceof Error ? err.message : String(err);
}
/** Parsed command-line options for the daemon entrypoint. */
interface CliArgs {
	socket: string;
	bufferBytes?: number;
}

/**
 * Parse `--socket=PATH` and the optional `--buffer-bytes=N` from the CLI
 * arguments. Arguments with any other prefix are ignored; when an option is
 * repeated the last occurrence wins.
 *
 * @param argv Arguments after the script path (e.g. `process.argv.slice(2)`).
 * @returns The parsed options; `bufferBytes` is present only when supplied.
 * @throws Error if `--socket` is missing/empty, or if `--buffer-bytes` is not
 *   a positive decimal integer.
 */
function parseArgs(argv: string[]): CliArgs {
	const SOCKET_FLAG = "--socket=";
	const BUFFER_FLAG = "--buffer-bytes=";

	let socket: string | undefined;
	let bufferBytes: number | undefined;

	for (const arg of argv) {
		if (arg.startsWith(SOCKET_FLAG)) {
			socket = arg.slice(SOCKET_FLAG.length);
		} else if (arg.startsWith(BUFFER_FLAG)) {
			const raw = arg.slice(BUFFER_FLAG.length);
			// Require plain decimal digits: parseInt would silently truncate
			// inputs such as "1e5" (-> 1), "12abc" (-> 12) or "1.5" (-> 1).
			const parsed = /^\d+$/.test(raw) ? Number(raw) : Number.NaN;
			if (!Number.isSafeInteger(parsed) || parsed <= 0) {
				throw new Error(
					`--buffer-bytes must be a positive integer, got: ${raw}`,
				);
			}
			bufferBytes = parsed;
		}
	}

	if (!socket) {
		throw new Error("--socket=PATH is required");
	}
	// Leave the key out entirely when unset so the result shape matches
	// what callers got before (no explicit `bufferBytes: undefined`).
	return bufferBytes === undefined ? { socket } : { socket, bufferBytes };
}
/**
 * Read the `version` field from the `package.json` one directory above this
 * module's own directory.
 *
 * @returns The version string, or "0.0.0" when the field is absent or when
 *   locating, reading or parsing the file fails for any reason.
 */
function readPackageVersion(): string {
	try {
		const moduleDir = path.dirname(fileURLToPath(import.meta.url));
		const manifestPath = path.resolve(moduleDir, "..", "package.json");
		const manifestText = fs.readFileSync(manifestPath, "utf8");
		const manifest = JSON.parse(manifestText) as { version?: string };
		return manifest.version ?? "0.0.0";
	} catch {
		// Best effort: any failure degrades to the placeholder version.
		return "0.0.0";
	}
}
err}\n`); + process.exit(1); +}); diff --git a/packages/pty-daemon/src/protocol/framing.test.ts b/packages/pty-daemon/src/protocol/framing.test.ts new file mode 100644 index 00000000000..0d3c947c766 --- /dev/null +++ b/packages/pty-daemon/src/protocol/framing.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, test } from "bun:test"; +import { decodeFrame, encodeFrame, FrameDecoder } from "./framing.ts"; + +describe("framing", () => { + test("round-trips a simple object", () => { + const msg = { type: "hello", protocols: [1] }; + const frame = encodeFrame(msg); + expect(decodeFrame(frame)).toEqual(msg); + }); + + test("round-trips through FrameDecoder", () => { + const a = { type: "open", id: "s0" }; + const b = { type: "input", id: "s0", data: "aGk=" }; + const dec = new FrameDecoder(); + dec.push(Buffer.concat([encodeFrame(a), encodeFrame(b)])); + expect(dec.drain()).toEqual([a, b]); + }); + + test("FrameDecoder buffers across chunks", () => { + const msg = { type: "open", id: "s0" }; + const full = encodeFrame(msg); + const dec = new FrameDecoder(); + dec.push(full.subarray(0, 2)); + expect(dec.drain()).toEqual([]); + dec.push(full.subarray(2, 6)); + expect(dec.drain()).toEqual([]); + dec.push(full.subarray(6)); + expect(dec.drain()).toEqual([msg]); + }); + + test("FrameDecoder handles partial frame after a complete one", () => { + const a = { type: "open", id: "s0" }; + const b = { type: "open", id: "s1" }; + const buf = Buffer.concat([encodeFrame(a), encodeFrame(b)]); + const dec = new FrameDecoder(); + dec.push(buf.subarray(0, encodeFrame(a).length + 3)); + expect(dec.drain()).toEqual([a]); + dec.push(buf.subarray(encodeFrame(a).length + 3)); + expect(dec.drain()).toEqual([b]); + }); + + test("rejects oversized frames", () => { + const bigHeader = Buffer.alloc(4); + bigHeader.writeUInt32BE(20 * 1024 * 1024, 0); // 20 MB + const dec = new FrameDecoder(); + dec.push(bigHeader); + expect(() => dec.drain()).toThrow(/frame too large/); + }); +}); diff --git 
// Length-prefixed binary frames over a SOCK_STREAM socket.
//
// Wire: [u32 BE length][JSON UTF-8 payload of that length]

const HEADER_BYTES = 4;
const MAX_FRAME_BYTES = 8 * 1024 * 1024; // 8 MB hard cap; abort the connection above this.

/**
 * Serialize `message` as JSON and prepend its byte length as a big-endian
 * u32 header.
 */
export function encodeFrame(message: unknown): Buffer {
	const body = Buffer.from(JSON.stringify(message), "utf8");
	const frame = Buffer.alloc(HEADER_BYTES + body.byteLength);
	frame.writeUInt32BE(body.byteLength, 0);
	body.copy(frame, HEADER_BYTES);
	return frame;
}

/**
 * Streaming decoder. Feed bytes via `push`; collect completed frames via
 * `drain`. Throws on oversized frames so a malformed peer can't exhaust
 * memory.
 */
export class FrameDecoder {
	private buf: Buffer = Buffer.alloc(0);

	/** Append received bytes to the pending, not-yet-decoded data. */
	push(chunk: Buffer): void {
		if (this.buf.length === 0) {
			this.buf = chunk;
			return;
		}
		this.buf = Buffer.concat([this.buf, chunk]);
	}

	/**
	 * Decode and return every complete frame currently buffered, leaving any
	 * trailing partial frame in place for a later call.
	 *
	 * @throws Error when a header announces more than MAX_FRAME_BYTES, or
	 *   when a payload is not valid JSON.
	 */
	drain(): unknown[] {
		const frames: unknown[] = [];
		for (;;) {
			if (this.buf.length < HEADER_BYTES) return frames;
			const bodyLen = this.buf.readUInt32BE(0);
			if (bodyLen > MAX_FRAME_BYTES) {
				throw new Error(`frame too large: ${bodyLen} bytes`);
			}
			const frameEnd = HEADER_BYTES + bodyLen;
			if (this.buf.length < frameEnd) return frames;
			// Parse before consuming so a bad payload leaves the buffer untouched.
			frames.push(JSON.parse(this.buf.toString("utf8", HEADER_BYTES, frameEnd)));
			this.buf = this.buf.subarray(frameEnd);
		}
	}
}

/**
 * One-shot decode of a buffer that contains exactly one complete frame.
 * Used by tests; production reads use FrameDecoder.
 */
export function decodeFrame(buf: Buffer): unknown {
	if (buf.length < HEADER_BYTES) throw new Error("short frame");
	const declared = buf.readUInt32BE(0);
	const actual = buf.length - HEADER_BYTES;
	if (actual !== declared) {
		throw new Error(`frame length mismatch: header=${declared} buf=${actual}`);
	}
	return JSON.parse(buf.toString("utf8", HEADER_BYTES));
}
+// See ../README.md and ../../../../apps/desktop/plans/20260429-pty-daemon-implementation.md + +export interface SessionMeta { + shell: string; + argv: string[]; + cwd?: string; + env?: Record; + cols: number; + rows: number; +} + +export interface SessionInfo { + id: string; + pid: number; + cols: number; + rows: number; + alive: boolean; +} + +// ---------- Handshake ---------- + +export interface HelloMessage { + type: "hello"; + protocols: number[]; + clientVersion?: string; +} + +export interface HelloAckMessage { + type: "hello-ack"; + protocol: number; + daemonVersion: string; +} + +// ---------- Client -> Daemon ---------- + +export interface OpenMessage { + type: "open"; + id: string; + meta: SessionMeta; +} + +export interface InputMessage { + type: "input"; + id: string; + /** base64-encoded bytes */ + data: string; +} + +export interface ResizeMessage { + type: "resize"; + id: string; + cols: number; + rows: number; +} + +export interface CloseMessage { + type: "close"; + id: string; + signal?: "SIGINT" | "SIGTERM" | "SIGKILL" | "SIGHUP"; +} + +export interface ListMessage { + type: "list"; +} + +export interface SubscribeMessage { + type: "subscribe"; + id: string; + /** if true, replay buffered output before live streaming */ + replay: boolean; +} + +export interface UnsubscribeMessage { + type: "unsubscribe"; + id: string; +} + +// ---------- Daemon -> Client ---------- + +export interface OpenOkMessage { + type: "open-ok"; + id: string; + pid: number; +} + +export interface OutputMessage { + type: "output"; + id: string; + /** base64-encoded bytes */ + data: string; +} + +export interface ExitMessage { + type: "exit"; + id: string; + code: number | null; + signal: number | null; +} + +export interface ClosedMessage { + type: "closed"; + id: string; +} + +export interface ListReplyMessage { + type: "list-reply"; + sessions: SessionInfo[]; +} + +export interface ErrorMessage { + type: "error"; + id?: string; + message: string; + code?: string; +} + 
+// ---------- Unions ---------- + +export type ClientMessage = + | HelloMessage + | OpenMessage + | InputMessage + | ResizeMessage + | CloseMessage + | ListMessage + | SubscribeMessage + | UnsubscribeMessage; + +export type ServerMessage = + | HelloAckMessage + | OpenOkMessage + | OutputMessage + | ExitMessage + | ClosedMessage + | ListReplyMessage + | ErrorMessage; diff --git a/packages/pty-daemon/src/protocol/version.ts b/packages/pty-daemon/src/protocol/version.ts new file mode 100644 index 00000000000..350a807f332 --- /dev/null +++ b/packages/pty-daemon/src/protocol/version.ts @@ -0,0 +1,4 @@ +// Protocol versioning. Increment on breaking changes; add to SUPPORTED list +// while we still need to interop with the previous major during rollouts. +export const CURRENT_PROTOCOL_VERSION = 1 as const; +export const SUPPORTED_PROTOCOL_VERSIONS: readonly number[] = [1]; diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts new file mode 100644 index 00000000000..3a990a69c43 --- /dev/null +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -0,0 +1,964 @@ +// Comprehensive control-plane test for pty-daemon. Each test exercises a +// real daemon over a real Unix socket and walks through one usage pattern +// end-to-end. Together these cover every usage shape host-service can throw +// at the daemon: handshake variants, session lifecycle, I/O patterns, +// multi-client subscribe/replay/unsubscribe, detach+reattach, malformed +// input, late subscribers, concurrent N sessions, shutdown. +// +// Runs under Node (`node --experimental-strip-types --test`). 
+ +import { strict as assert } from "node:assert"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, describe, test } from "node:test"; +import { encodeFrame } from "../src/protocol/index.ts"; +import { Server } from "../src/Server/index.ts"; +import { connect, connectAndHello } from "./helpers/client.ts"; + +const sockPath = path.join( + os.tmpdir(), + `pty-daemon-control-${process.pid}.sock`, +); +let server: Server; + +before(async () => { + server = new Server({ + socketPath: sockPath, + daemonVersion: "0.0.0-control", + bufferCap: 8 * 1024, + }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +const SH = "/bin/sh"; +const baseMeta = { + shell: SH, + argv: ["-c", "echo ready; sleep 5"] as string[], + cols: 80, + rows: 24, +}; + +function uniqueId(prefix: string): string { + return `${prefix}-${Math.random().toString(36).slice(2, 8)}`; +} + +// ---------------- Handshake ---------------- + +describe("handshake", () => { + test("rejects non-hello first message", async () => { + const c = await connect(sockPath); + c.send({ type: "list" }); + const err = await c.waitFor((m) => m.type === "error", 1000); + assert.equal(err.type, "error"); + await c.close(); + }); + + test("rejects unsupported protocol versions", async () => { + const c = await connect(sockPath); + c.send({ type: "hello", protocols: [99, 100] }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "EVERSION"); + await c.close(); + }); + + test("picks highest mutual when multiple offered", async () => { + const c = await connect(sockPath); + c.send({ type: "hello", protocols: [1, 99] }); + const ack = await c.waitFor((m) => m.type === "hello-ack"); + if (ack.type === "hello-ack") assert.equal(ack.protocol, 1); + await c.close(); + }); + + test("rejects duplicate hello", async () => { + const c = await connectAndHello(sockPath); + c.send({ type: "hello", 
protocols: [1] }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") { + assert.match(err.message, /duplicate hello/); + } + await c.close(); + }); +}); + +// ---------------- Session lifecycle ---------------- + +describe("session lifecycle", () => { + test("rejects open with bad cols/rows", async () => { + const c = await connectAndHello(sockPath); + c.send({ + type: "open", + id: uniqueId("badspawn"), + meta: { ...baseMeta, cols: 0 }, + }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "ESPAWN"); + await c.close(); + }); + + test("rejects duplicate session id", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("dup"); + c.send({ type: "open", id, meta: baseMeta }); + await c.waitFor((m) => m.type === "open-ok"); + c.send({ type: "open", id, meta: baseMeta }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "EEXIST"); + c.send({ type: "close", id }); + await c.close(); + }); + + test("input/resize/close on missing session return ENOENT", async () => { + const c = await connectAndHello(sockPath); + const missing = "missing-no-such"; + + c.send({ type: "input", id: missing, data: "" }); + const e1 = await c.waitFor((m) => m.type === "error", 1000); + if (e1.type === "error") assert.equal(e1.code, "ENOENT"); + + c.send({ type: "resize", id: missing, cols: 80, rows: 24 }); + const e2 = await c.waitFor((m) => m.type === "error" && m !== e1, 1000); + if (e2.type === "error") assert.equal(e2.code, "ENOENT"); + + c.send({ type: "close", id: missing }); + const e3 = await c.waitFor( + (m) => m.type === "error" && m !== e1 && m !== e2, + 1000, + ); + if (e3.type === "error") assert.equal(e3.code, "ENOENT"); + await c.close(); + }); + + test("instant-exit shell still produces an exit message", async () => { + const c = await connectAndHello(sockPath); + const id = 
uniqueId("instant"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "true"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + const exit = await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + if (exit.type === "exit") assert.equal(exit.code, 0); + await c.close(); + }); + + test("close with SIGKILL terminates a hung shell", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("hung"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 60"] }, + }); + const ok = await c.waitFor((m) => m.type === "open-ok" && m.id === id); + if (ok.type !== "open-ok") throw new Error("no open-ok"); + + c.send({ type: "subscribe", id, replay: false }); + c.send({ type: "close", id, signal: "SIGKILL" }); + await c.waitFor((m) => m.type === "closed" && m.id === id); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); + + test("default close (SIGHUP) terminates an interactive login shell", async () => { + // Regression test for a real-world bug: SIGTERM is the wrong default + // for "user closed the terminal pane" because interactive shells + // (especially `zsh -l`) trap SIGTERM and stay alive. The kernel + // sends SIGHUP when a TTY closes, and shells DO honor it. Without + // this, every closed v2 terminal pane leaked a zsh process. + const c = await connectAndHello(sockPath); + const id = uniqueId("interactive"); + // `-i` forces interactive mode even though stdin is a PTY pipe; + // matches the real terminal-launch shape closely enough for this + // regression to fire if someone reverts to SIGTERM. + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + // Default close — no explicit signal. Server defaults to SIGHUP. 
+ c.send({ type: "close", id }); + await c.waitFor((m) => m.type === "closed" && m.id === id); + // Critical: the shell must actually exit. If SIGTERM defaults + // returned (the bug), this waitFor would timeout. + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); +}); + +// ---------------- I/O patterns ---------------- + +describe("I/O patterns", () => { + test("resize during a running shell does not break stream", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("resize"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + c.send({ type: "resize", id, cols: 120, rows: 40 }); + c.send({ + type: "input", + id, + data: Buffer.from("echo post-resize-marker\n").toString("base64"), + }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("post-resize-marker"), + 3000, + ); + + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }); + + test("burst output (high-rate stdout) is delivered and ring-capped", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("burst"); + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: [ + "-c", + "for i in $(seq 1 200); do echo BURST:$i; done; sleep 0.5", + ], + }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + // Wait until we see the last marker, confirming live delivery. 
+ await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("BURST:200"), + 5000, + ); + await c.waitFor((m) => m.type === "exit" && m.id === id, 5000); + await c.close(); + }); + + test("multi-byte UTF-8 output round-trips", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("utf8"); + // 🚀 = 0xF0 0x9F 0x9A 0x80 + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "printf 'rocket: \\xf0\\x9f\\x9a\\x80\\n'; sleep 0.1"], + }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("🚀"), + 3000, + ); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); +}); + +// ---------------- Multi-client subscribe / fan-out ---------------- + +describe("multi-client fan-out", () => { + test("two subscribers both receive the same output", async () => { + const a = await connectAndHello(sockPath); + const b = await connectAndHello(sockPath); + const id = uniqueId("fanout"); + + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo fanout-marker; sleep 0.5"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + b.send({ type: "subscribe", id, replay: false }); + + await Promise.all([ + a.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("fanout-marker"), + 3000, + ), + b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("fanout-marker"), + 3000, + ), + ]); + + await Promise.all([a.close(), b.close()]); + }); + + test("unsubscribe stops further output to that connection", async () => { + const a = await connectAndHello(sockPath); + const b 
= await connectAndHello(sockPath); + const id = uniqueId("unsub"); + + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + b.send({ type: "subscribe", id, replay: false }); + + // First marker — both should see it. + a.send({ + type: "input", + id, + data: Buffer.from("echo first-marker\n").toString("base64"), + }); + await Promise.all([ + a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("first-marker"), + 3000, + ), + b.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("first-marker"), + 3000, + ), + ]); + + // b unsubscribes; a is still subscribed. + b.send({ type: "unsubscribe", id }); + // Small settle so the unsubscribe lands before the next emit. + await new Promise((r) => setTimeout(r, 100)); + + const bAfterUnsub = b.collect( + (m) => m.type === "output" && m.id === id, + 500, + ); + + a.send({ + type: "input", + id, + data: Buffer.from("echo second-marker\n").toString("base64"), + }); + await a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("second-marker"), + 3000, + ); + + const bMessages = await bAfterUnsub; + const sawSecondOnB = bMessages.some( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("second-marker"), + ); + assert.equal(sawSecondOnB, false); + + a.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([a.close(), b.close()]); + }); + + test("subscriber connection drop doesn't crash daemon; other clients keep streaming", async () => { + const owner = await connectAndHello(sockPath); + const dropper = await connectAndHello(sockPath); + const observer = await connectAndHello(sockPath); + const id = uniqueId("dropcrash"); + + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await 
owner.waitFor((m) => m.type === "open-ok" && m.id === id); + dropper.send({ type: "subscribe", id, replay: false }); + observer.send({ type: "subscribe", id, replay: false }); + + // Force-close the dropper without unsubscribing. + dropper.socket.destroy(); + + owner.send({ + type: "input", + id, + data: Buffer.from("echo survives-drop\n").toString("base64"), + }); + await observer.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("survives-drop"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), observer.close()]); + }); +}); + +// ---------------- Detach + reattach (the headline feature) ---------------- + +describe("detach + reattach", () => { + test("late subscriber gets prior output via replay", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("late"); + + owner.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "echo early-marker; sleep 1"], + }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + // Wait for output to be buffered without any subscriber. 
+ await new Promise((r) => setTimeout(r, 200)); + + const late = await connectAndHello(sockPath); + late.send({ type: "subscribe", id, replay: true }); + await late.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("early-marker"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), late.close()]); + }); + + test("reattach cycle: subscribe → disconnect → new conn subscribes-with-replay → continues live", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("reattach"); + + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + const first = await connectAndHello(sockPath); + first.send({ type: "subscribe", id, replay: false }); + + // Generate some output via input. + owner.send({ + type: "input", + id, + data: Buffer.from("echo before-reattach\n").toString("base64"), + }); + await first.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("before-reattach"), + 3000, + ); + + // Disconnect the first client. PTY keeps running. + await first.close(); + + // New client connects, asks for replay, and sends another input. + const second = await connectAndHello(sockPath); + second.send({ type: "subscribe", id, replay: true }); + // Replay should arrive immediately containing the prior output. 
+ await second.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("before-reattach"), + 2000, + ); + + owner.send({ + type: "input", + id, + data: Buffer.from("echo after-reattach\n").toString("base64"), + }); + await second.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("after-reattach"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), second.close()]); + }); +}); + +// ---------------- list ---------------- + +describe("list", () => { + test("reflects active sessions", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("listed"); + c.send({ type: "open", id, meta: baseMeta }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + + c.send({ type: "list" }); + const reply = await c.waitFor((m) => m.type === "list-reply"); + assert.equal(reply.type, "list-reply"); + if (reply.type === "list-reply") { + const found = reply.sessions.find((s) => s.id === id); + assert.ok(found, "session should appear in list"); + assert.equal(found?.cols, 80); + assert.equal(found?.rows, 24); + assert.equal(found?.alive, true); + } + + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }); +}); + +// ---------------- Cross-client continuity (host-service restart story) ---------------- + +describe("cross-client continuity (host-service restart simulation)", () => { + // This is the headline path the daemon exists for. Client A (host-service v1) + // opens a session, then disconnects (host-service crashed). Client B + // (host-service v2) connects fresh, discovers the session via list, and + // must NOT try to re-open it — it should subscribe-with-replay and + // continue. Regression test for the "session already exists" tight loop + // observed in production after the first integration land. 
+ + test("client B finds session A's id via list after A disconnects", async () => { + const a = await connectAndHello(sockPath); + const id = uniqueId("restart"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo from-A; sleep 5"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + // Force-close A's connection without unsubscribing — this simulates a + // host-service crash. The session must keep running on the daemon. + a.socket.destroy(); + + // Brief settle so the daemon notices the close. + await new Promise((r) => setTimeout(r, 100)); + + const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply"); + assert.equal(reply.type, "list-reply"); + if (reply.type === "list-reply") { + const found = reply.sessions.find((s) => s.id === id); + assert.ok(found, `session ${id} should still be in list after A's drop`); + assert.equal(found?.alive, true); + } + + b.send({ type: "close", id, signal: "SIGTERM" }); + await b.close(); + }); + + test("re-opening an existing session id returns EEXIST (the trigger for adoption)", async () => { + // Regression: host-service was caught in a tight loop because it + // blindly called `open` after restart and got "session already exists". + // We rely on this exact error code/message to drive the adoption path + // in host-service's createTerminalSessionInternal. 
+ const a = await connectAndHello(sockPath); + const id = uniqueId("eexist"); + a.send({ type: "open", id, meta: baseMeta }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + const b = await connectAndHello(sockPath); + b.send({ type: "open", id, meta: baseMeta }); + const err = await b.waitFor((m) => m.type === "error" && m.id === id, 2000); + assert.equal(err.type, "error"); + if (err.type === "error") { + assert.equal(err.code, "EEXIST"); + assert.match(err.message, /session already exists/); + } + + a.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([a.close(), b.close()]); + }); + + test("client B subscribes-with-replay to A's session and gets buffered output + live stream", async () => { + // The actual adoption flow: A opens, A produces output, A drops, B + // subscribes with replay. B must see the prior output AND any new + // output produced after B's subscribe. This is what host-service + // does after restart to give the renderer a continuous experience. + const a = await connectAndHello(sockPath); + const id = uniqueId("adopt"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + a.send({ + type: "input", + id, + data: Buffer.from("echo before-restart\n").toString("base64"), + }); + await a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("before-restart"), + 3000, + ); + + // A drops without cleanup — host-service "crashed." + a.socket.destroy(); + await new Promise((r) => setTimeout(r, 100)); + + // B picks up the session. First confirms via list, then subscribes + // with replay to get the buffered "before-restart" output. 
+ const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const list = await b.waitFor((m) => m.type === "list-reply"); + assert.ok( + list.type === "list-reply" && + list.sessions.some((s) => s.id === id && s.alive), + ); + + b.send({ type: "subscribe", id, replay: true }); + await b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("before-restart"), + 3000, + ); + + // New input from B reaches the (still-living) shell. + b.send({ + type: "input", + id, + data: Buffer.from("echo after-restart\n").toString("base64"), + }); + await b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("after-restart"), + 3000, + ); + + b.send({ type: "close", id, signal: "SIGTERM" }); + await b.close(); + }); + + test("exited sessions are deleted immediately (no accumulation)", async () => { + // Sessions are removed from the store the moment their PTY exits. + // Late subscribers (e.g. host-service restarting in the exit gap) + // get ENOENT — the renderer falls back to a generic "session + // unavailable" footer. Tradeoff: niche UX regression in the + // restart-during-exit window vs. unbounded session accumulation + // (every closed terminal pane otherwise left a row forever). + const a = await connectAndHello(sockPath); + const id = uniqueId("postexit"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo final-words; exit 7"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + a.send({ type: "subscribe", id, replay: true }); + await a.waitFor((m) => m.type === "exit" && m.id === id, 3000); + a.socket.destroy(); + // Give the on-exit handler a beat to run its store.delete. + await new Promise((r) => setTimeout(r, 100)); + + // New connection: late subscribe gets nothing useful for a + // vanished id. We assert that the session is gone from list and + // that an op on the id returns ENOENT. 
+ const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply", 1000); + if (reply.type === "list-reply") { + const found = reply.sessions.find((s) => s.id === id); + assert.equal(found, undefined, "exited session should not be in list"); + } + b.send({ type: "close", id }); + const err = await b.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "ENOENT"); + await b.close(); + }); + + test("daemon `list` returns sessions whose only client just dropped", async () => { + // Defensive: the daemon must NOT garbage-collect a session just + // because its last client disconnected. host-service relies on the + // session staying alive across the disconnect. + const a = await connectAndHello(sockPath); + const id = uniqueId("orphan"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 30"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.socket.destroy(); + await new Promise((r) => setTimeout(r, 200)); + + const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply"); + if (reply.type === "list-reply") { + const me = reply.sessions.find((s) => s.id === id); + assert.ok(me, "session must persist past last-client disconnect"); + assert.equal(me?.alive, true); + } + b.send({ type: "close", id, signal: "SIGKILL" }); + await b.close(); + }); +}); + +// ---------------- Malformed / abusive input ---------------- + +describe("hostile input", () => { + test("non-JSON in a frame disconnects the client; daemon survives", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("survive"); + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + // Hostile client sends a length-prefixed buffer of garbage that isn't JSON. 
+ const bad = await connect(sockPath); + const garbage = Buffer.from("\x00\x00\x00\x05NOT{}"); + bad.sendRaw(garbage); + // Server should disconnect this conn cleanly. + await new Promise((res) => bad.onClose(res)); + + // Owner is still functional. + owner.send({ type: "subscribe", id, replay: false }); + owner.send({ + type: "input", + id, + data: Buffer.from("echo still-alive\n").toString("base64"), + }); + await owner.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("still-alive"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await owner.close(); + }); + + test("oversized frame header (> 8 MB cap) disconnects; daemon survives", async () => { + const bad = await connect(sockPath); + const hugeHeader = Buffer.alloc(4); + hugeHeader.writeUInt32BE(20 * 1024 * 1024, 0); + bad.sendRaw(hugeHeader); + await new Promise((res) => bad.onClose(res)); + + // Daemon is still accepting connections. + const c = await connectAndHello(sockPath); + c.send({ type: "list" }); + await c.waitFor((m) => m.type === "list-reply", 1000); + await c.close(); + }); + + test("input on a session that just exited returns ENOENT", async () => { + // Exit deletes the session row, so post-exit input lands on + // "unknown session" — same code path as input on a never-existed + // id. EEXITED is no longer returned because there's no exited + // session to be "exited"; it's just gone. 
+ const c = await connectAndHello(sockPath); + const id = uniqueId("dead"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "true"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await new Promise((r) => setTimeout(r, 50)); + + c.send({ + type: "input", + id, + data: Buffer.from("ignored").toString("base64"), + }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "ENOENT"); + await c.close(); + }); +}); + +// ---------------- Concurrency stress ---------------- + +describe("concurrency", () => { + test("20 sessions opened and streaming concurrently", async () => { + const c = await connectAndHello(sockPath); + const N = 20; + const ids = Array.from({ length: N }, (_, i) => uniqueId(`conc-${i}`)); + + // Open all sessions. Use a workload that runs long enough to outlast + // the open+subscribe round-trip on a busy machine — the spawns happen + // in parallel, but `subscribe replay:false` would race exits otherwise. + for (const id of ids) { + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "echo TICK:start; sleep 0.5; echo TICK:end"], + }, + }); + } + + // Wait for all open-oks. + const openIds = new Set(); + while (openIds.size < N) { + const m = await c.waitFor( + (m) => m.type === "open-ok" && !openIds.has(m.id), + 10_000, + ); + if (m.type === "open-ok") openIds.add(m.id); + } + assert.equal(openIds.size, N); + + // Subscribe with replay so even sessions whose first output landed before + // our subscribe arrives are still surfaced. + for (const id of ids) c.send({ type: "subscribe", id, replay: true }); + + // Wait for the start marker from each session. 
+ const seen = new Set(); + while (seen.size < N) { + const m = await c.waitFor( + (m) => + m.type === "output" && + !seen.has(m.id) && + ids.includes(m.id) && + Buffer.from(m.data, "base64").toString().includes("TICK:start"), + 10_000, + ); + if (m.type === "output") seen.add(m.id); + } + assert.equal(seen.size, N); + + // Wait for all to exit. + const exited = new Set(); + while (exited.size < N) { + const m = await c.waitFor( + (m) => m.type === "exit" && !exited.has(m.id) && ids.includes(m.id), + 10_000, + ); + if (m.type === "exit") exited.add(m.id); + } + + await c.close(); + }); + + test("multiple connections opening sessions in parallel", async () => { + const N = 10; + const conns = await Promise.all( + Array.from({ length: N }, () => connectAndHello(sockPath)), + ); + + await Promise.all( + conns.map(async (c, i) => { + const id = uniqueId(`parallel-${i}`); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", `echo CONN:${i}; sleep 0.2`] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id, 5000); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes(`CONN:${i}`), + 5000, + ); + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }), + ); + }); +}); + +// ---------------- Server shutdown ---------------- + +describe("server shutdown", () => { + test("disconnects active clients cleanly via close()", async () => { + // Use a *separate* short-lived server so we don't tear down the suite's main one. 
+ const localPath = path.join( + os.tmpdir(), + `pty-daemon-shutdown-${process.pid}-${Date.now()}.sock`, + ); + const local = new Server({ + socketPath: localPath, + daemonVersion: "0.0.0-local", + }); + await local.listen(); + + const c = await connectAndHello(localPath); + const id = uniqueId("shutdown"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 60"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + + const closeWaiter = new Promise((res) => c.onClose(res)); + await local.close(); + // Server.close() destroys all connections. + await closeWaiter; + assert.equal(c.closed(), true); + }); +}); + +// ---------------- Frame-level encoding sanity ---------------- + +describe("framing on the wire", () => { + test("server tolerates split frames across multiple TCP chunks", async () => { + const c = await connect(sockPath); + const hello = encodeFrame({ type: "hello", protocols: [1] }); + // Send the hello in 3-byte chunks to force the decoder to buffer. + for (let i = 0; i < hello.length; i += 3) { + c.sendRaw(hello.subarray(i, Math.min(i + 3, hello.length))); + await new Promise((r) => setTimeout(r, 1)); + } + await c.waitFor((m) => m.type === "hello-ack", 1000); + await c.close(); + }); +}); diff --git a/packages/pty-daemon/test/helpers/client.ts b/packages/pty-daemon/test/helpers/client.ts new file mode 100644 index 00000000000..3a1b548bebe --- /dev/null +++ b/packages/pty-daemon/test/helpers/client.ts @@ -0,0 +1,151 @@ +// Reusable test client for pty-daemon integration tests. +// Speaks the daemon's wire protocol over a Unix socket. 
+
+import * as net from "node:net";
+import {
+	encodeFrame,
+	FrameDecoder,
+	type ServerMessage,
+} from "../../src/protocol/index.ts";
+
+/** Message-oriented handle over one connection to the daemon's socket. */
+export interface DaemonClient {
+	/** Underlying socket, exposed so tests can force-drop the connection. */
+	socket: net.Socket;
+	/** Every decoded message received so far, in arrival order. */
+	messages: ServerMessage[];
+	/** Encode `m` as a frame and write it; no-op once the socket is destroyed. */
+	send(m: unknown): void;
+	/**
+	 * Resolve with the first message (already received or future) matching
+	 * `predicate`; reject after `ms` milliseconds (default 5000).
+	 */
+	waitFor(
+		predicate: (m: ServerMessage) => boolean,
+		ms?: number,
+	): Promise<ServerMessage>;
+	/**
+	 * Wait `ms` milliseconds, then resolve with every message received so far
+	 * (including ones that arrived before the call) matching `predicate`.
+	 */
+	collect(
+		predicate: (m: ServerMessage) => boolean,
+		ms: number,
+	): Promise<ServerMessage[]>;
+	/** Write raw bytes, bypassing frame encoding; no-op once destroyed. */
+	sendRaw(buf: Buffer): void;
+	/** End the connection; force-destroys if it has not closed within 200ms. */
+	close(): Promise<void>;
+	/** True once the socket's `close` event has fired. */
+	closed(): boolean;
+	/** Run `cb` when the socket closes (immediately if it already has). */
+	onClose(cb: () => void): void;
+}
+
+/** A pending `waitFor` call. */
+interface Waiter {
+	predicate: (m: ServerMessage) => boolean;
+	resolve: (m: ServerMessage) => void;
+	reject: (e: Error) => void;
+	timer: NodeJS.Timeout;
+}
+
+/**
+ * Connect to the daemon socket at `socketPath` and resolve with a
+ * `DaemonClient` once the connection is established. Rejects if the
+ * connection attempt itself errors. No handshake is sent.
+ */
+export function connect(socketPath: string): Promise<DaemonClient> {
+	return new Promise<DaemonClient>((resolve, reject) => {
+		const socket = net.createConnection({ path: socketPath });
+		const decoder = new FrameDecoder();
+		const messages: ServerMessage[] = [];
+		const waiters: Waiter[] = [];
+		const closeCbs: Array<() => void> = [];
+		let isClosed = false;
+
+		// Reject every pending waiter with `err` and empty the queue.
+		const failWaiters = (err: unknown): void => {
+			const error = err instanceof Error ? err : new Error(String(err));
+			for (const w of waiters.splice(0)) {
+				clearTimeout(w.timer);
+				w.reject(error);
+			}
+		};
+
+		socket.on("data", (chunk) => {
+			try {
+				decoder.push(chunk);
+				for (const raw of decoder.drain()) {
+					const m = raw as ServerMessage;
+					messages.push(m);
+					// Walk backwards so splicing a satisfied waiter is index-safe.
+					for (let i = waiters.length - 1; i >= 0; i--) {
+						const w = waiters[i];
+						if (w?.predicate(m)) {
+							clearTimeout(w.timer);
+							waiters.splice(i, 1);
+							w.resolve(m);
+						}
+					}
+				}
+			} catch (err) {
+				// Surface frame errors to any pending waiter.
+				failWaiters(err);
+			}
+		});
+
+		socket.on("close", () => {
+			isClosed = true;
+			for (const cb of closeCbs) cb();
+		});
+		socket.once("error", reject);
+		socket.once("connect", () => {
+			socket.off("error", reject);
+			// Keep an 'error' listener for the socket's whole life: an
+			// EventEmitter with no 'error' listener throws, so a peer reset
+			// would otherwise crash the test process instead of failing the
+			// waiting test.
+			socket.on("error", failWaiters);
+			resolve({
+				socket,
+				messages,
+				send(m) {
+					if (!socket.destroyed) socket.write(encodeFrame(m));
+				},
+				sendRaw(buf) {
+					if (!socket.destroyed) socket.write(buf);
+				},
+				waitFor(predicate, ms = 5000) {
+					return new Promise<ServerMessage>((res, rej) => {
+						const found = messages.find(predicate);
+						if (found) return res(found);
+						const waiter: Waiter = {
+							predicate,
+							resolve: res,
+							reject: rej,
+							timer: setTimeout(() => {
+								// Remove this exact waiter, not merely one that
+								// shares the same predicate function.
+								const i = waiters.indexOf(waiter);
+								if (i >= 0) waiters.splice(i, 1);
+								rej(new Error(`waitFor timed out after ${ms}ms`));
+							}, ms),
+						};
+						waiters.push(waiter);
+					});
+				},
+				collect(predicate, ms) {
+					return new Promise<ServerMessage[]>((res) => {
+						// `messages` is appended to by the data handler above,
+						// so a single filter at the deadline yields every match
+						// exactly once, in arrival order.
+						setTimeout(() => res(messages.filter(predicate)), ms);
+					});
+				},
+				close() {
+					return new Promise<void>((res) => {
+						if (socket.destroyed) return res();
+						// Fall back: if the socket hasn't closed within 200ms, force.
+						const force = setTimeout(() => {
+							if (!socket.destroyed) socket.destroy();
+							res();
+						}, 200);
+						socket.once("close", () => clearTimeout(force));
+						socket.end(() => res());
+					});
+				},
+				closed() {
+					return isClosed;
+				},
+				onClose(cb) {
+					if (isClosed) cb();
+					else closeCbs.push(cb);
+				},
+			});
+		});
+	});
+}
+
+/** Convenience: connect and complete the v1 handshake.
*/ +export async function connectAndHello( + socketPath: string, +): Promise { + const c = await connect(socketPath); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); + return c; +} diff --git a/packages/pty-daemon/test/integration.test.ts b/packages/pty-daemon/test/integration.test.ts new file mode 100644 index 00000000000..4d4ee962f16 --- /dev/null +++ b/packages/pty-daemon/test/integration.test.ts @@ -0,0 +1,91 @@ +// Smoke / happy-path integration test for pty-daemon. +// +// Runs under Node (`node --experimental-strip-types --test`); see +// test/control-plane.test.ts for the exhaustive control-plane scenarios. + +import { strict as assert } from "node:assert"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, test } from "node:test"; +import { Server } from "../src/Server/index.ts"; +import { connect, connectAndHello } from "./helpers/client.ts"; + +const sockPath = path.join(os.tmpdir(), `pty-daemon-smoke-${process.pid}.sock`); +let server: Server; + +before(async () => { + server = new Server({ socketPath: sockPath, daemonVersion: "0.0.0-test" }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +test("handshake: hello → hello-ack", async () => { + const c = await connect(sockPath); + c.send({ type: "hello", protocols: [1] }); + const ack = await c.waitFor((m) => m.type === "hello-ack"); + assert.equal(ack.type, "hello-ack"); + if (ack.type === "hello-ack") { + assert.equal(ack.protocol, 1); + assert.equal(ack.daemonVersion, "0.0.0-test"); + } + await c.close(); +}); + +test("open → subscribe → output → exit lifecycle", async () => { + const c = await connectAndHello(sockPath); + c.send({ + type: "open", + id: "smoke-0", + meta: { + shell: "/bin/sh", + argv: ["-c", "echo daemon-smoke; sleep 0.2"], + cols: 80, + rows: 24, + }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === "smoke-0"); + c.send({ type: "subscribe", id: 
"smoke-0", replay: true }); + + await c.waitFor( + (m) => + m.type === "output" && + m.id === "smoke-0" && + Buffer.from(m.data, "base64").toString().includes("daemon-smoke"), + 3000, + ); + const exit = await c.waitFor( + (m) => m.type === "exit" && m.id === "smoke-0", + 3000, + ); + if (exit.type === "exit") assert.equal(exit.code, 0); + await c.close(); +}); + +test("input is forwarded and echoed via output", async () => { + const c = await connectAndHello(sockPath); + c.send({ + type: "open", + id: "smoke-1", + meta: { shell: "/bin/sh", argv: ["-i"], cols: 80, rows: 24 }, + }); + await c.waitFor((m) => m.type === "open-ok"); + c.send({ type: "subscribe", id: "smoke-1", replay: false }); + c.send({ + type: "input", + id: "smoke-1", + data: Buffer.from("echo abc-marker\n").toString("base64"), + }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === "smoke-1" && + Buffer.from(m.data, "base64").toString().includes("abc-marker"), + 3000, + ); + c.send({ type: "close", id: "smoke-1", signal: "SIGTERM" }); + await c.waitFor((m) => m.type === "closed" && m.id === "smoke-1"); + await c.close(); +}); diff --git a/packages/pty-daemon/test/signal-recovery.test.ts b/packages/pty-daemon/test/signal-recovery.test.ts new file mode 100644 index 00000000000..1bc9b0d4ef5 --- /dev/null +++ b/packages/pty-daemon/test/signal-recovery.test.ts @@ -0,0 +1,185 @@ +// Real-signal recovery tests: spawn the bundled daemon as a child process, +// then SIGKILL it (no graceful close events) and verify the client surfaces +// the disconnect cleanly. Different from the existing control-plane tests, +// which use Server.close() — that's the cooperative shutdown path. Real +// production crashes don't go through Server.close. +// +// Runs under Node (`node --experimental-strip-types --test`). 
+
+import { strict as assert } from "node:assert";
+import * as childProcess from "node:child_process";
+import * as fs from "node:fs";
+import * as net from "node:net";
+import * as os from "node:os";
+import * as path from "node:path";
+import { after, before, describe, test } from "node:test";
+import { fileURLToPath } from "node:url";
+import {
+	CURRENT_PROTOCOL_VERSION,
+	encodeFrame,
+	FrameDecoder,
+	type ServerMessage,
+} from "../src/protocol/index.ts";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DAEMON_BUNDLE = path.resolve(__dirname, "../dist/pty-daemon.js");
+const SOCK = path.join(os.tmpdir(), `pty-daemon-sigkill-${process.pid}.sock`);
+
+let daemonProcess: childProcess.ChildProcess | null = null;
+
+// Spawn the bundled daemon and block until its socket accepts a connection.
+before(async () => {
+	if (!fs.existsSync(DAEMON_BUNDLE)) {
+		throw new Error(
+			`Missing daemon bundle at ${DAEMON_BUNDLE}. Run \`bun run build:daemon\` first.`,
+		);
+	}
+
+	const child = childProcess.spawn(
+		process.execPath,
+		[DAEMON_BUNDLE, `--socket=${SOCK}`],
+		{
+			stdio: ["ignore", "pipe", "pipe"],
+			env: { ...process.env, NODE_ENV: "test" },
+		},
+	);
+	daemonProcess = child;
+	child.stderr?.on("data", (chunk) => {
+		process.stderr.write(`[daemon-stderr] ${chunk}`);
+	});
+	// stdout is piped but nothing reads it; drain it so the daemon cannot
+	// block on a full pipe buffer.
+	child.stdout?.resume();
+
+	// Wait for socket to become connectable.
+	const deadline = Date.now() + 5000;
+	let ready = false;
+	while (!ready && Date.now() < deadline) {
+		// Fail fast instead of polling for a daemon that is already dead.
+		if (child.exitCode !== null || child.signalCode !== null) {
+			throw new Error(
+				`Daemon exited during startup (code=${child.exitCode}, signal=${child.signalCode})`,
+			);
+		}
+		if (fs.existsSync(SOCK)) {
+			ready = await new Promise<boolean>((resolve) => {
+				const s = net.createConnection({ path: SOCK });
+				const t = setTimeout(() => {
+					s.destroy();
+					resolve(false);
+				}, 200);
+				s.once("connect", () => {
+					clearTimeout(t);
+					s.end();
+					resolve(true);
+				});
+				s.once("error", () => {
+					clearTimeout(t);
+					resolve(false);
+				});
+			});
+		}
+		if (!ready) await new Promise((r) => setTimeout(r, 50));
+	}
+	if (!ready) {
+		throw new Error(`Daemon socket ${SOCK} was not connectable within 5000ms`);
+	}
+});
+
+// Kill the daemon if it is still running and remove the socket file.
+after(async () => {
+	const child = daemonProcess;
+	// `killed` only records that a signal was sent. Check the exit state so we
+	// never wait for an `exit` event that has already fired.
+	if (child && child.exitCode === null && child.signalCode === null) {
+		const exited = new Promise((r) => child.once("exit", r));
+		child.kill("SIGKILL");
+		await exited;
+	}
+	try {
+		fs.unlinkSync(SOCK);
+	} catch {
+		// best-effort
+	}
+});
+
+describe("daemon SIGKILL recovery", () => {
+	test("clients receive close events when daemon dies via SIGKILL", async () => {
+		// Open a connection and complete the handshake to confirm health.
+		const client = await connect();
+		client.send({ type: "hello", protocols: [CURRENT_PROTOCOL_VERSION] });
+		await client.waitFor((m) => m.type === "hello-ack", 2000);
+
+		// Capture disconnect.
+		const disconnected = new Promise<void>((resolve) =>
+			client.socket.once("close", () => resolve()),
+		);
+
+		// Now SIGKILL the daemon. No graceful Server.close, no exit broadcast.
+		assert.ok(daemonProcess);
+		daemonProcess.kill("SIGKILL");
+		await new Promise((r) => daemonProcess?.once("exit", r));
+
+		// Client should see the socket close within reasonable time.
+		let disconnectTimer: NodeJS.Timeout | undefined;
+		try {
+			await Promise.race([
+				disconnected,
+				new Promise<never>((_, rej) => {
+					disconnectTimer = setTimeout(
+						() => rej(new Error("disconnect not surfaced")),
+						2000,
+					);
+				}),
+			]);
+		} finally {
+			// Don't leave the guard timer pending once the race has settled.
+			clearTimeout(disconnectTimer);
+		}
+
+		// Subsequent send fails synchronously (writable: false) or async.
+		// Either way, no hang.
+		try {
+			client.send({ type: "list" });
+		} catch {
+			// Either path is acceptable — just don't hang.
+		}
+
+		// Process is gone; ensure cleanup so `after` doesn't block.
+ daemonProcess = null; + try { + fs.unlinkSync(SOCK); + } catch { + // best-effort — daemon's atexit didn't run because of SIGKILL + } + }); +}); + +// ---------------- helpers ---------------- + +interface Client { + socket: net.Socket; + messages: ServerMessage[]; + send(m: unknown): void; + waitFor( + predicate: (m: ServerMessage) => boolean, + ms: number, + ): Promise; +} + +function connect(): Promise { + return new Promise((resolve, reject) => { + const socket = net.createConnection({ path: SOCK }); + const decoder = new FrameDecoder(); + const messages: ServerMessage[] = []; + + socket.on("data", (chunk) => { + decoder.push(chunk); + for (const raw of decoder.drain()) { + messages.push(raw as ServerMessage); + } + }); + + socket.once("error", reject); + socket.once("connect", () => { + resolve({ + socket, + messages, + send(m) { + if (!socket.destroyed) socket.write(encodeFrame(m)); + }, + waitFor(predicate, ms) { + return new Promise((res, rej) => { + const found = messages.find(predicate); + if (found) return res(found); + const onData = () => { + const m = messages.find(predicate); + if (m) { + socket.off("data", onData); + clearTimeout(t); + res(m); + } + }; + const t = setTimeout(() => { + socket.off("data", onData); + rej(new Error("waitFor timed out")); + }, ms); + socket.on("data", onData); + }); + }, + }); + }); + }); +} diff --git a/packages/pty-daemon/tsconfig.json b/packages/pty-daemon/tsconfig.json new file mode 100644 index 00000000000..2d1f2809f81 --- /dev/null +++ b/packages/pty-daemon/tsconfig.json @@ -0,0 +1,11 @@ +{ + "extends": "@superset/typescript/internal-package.json", + "compilerOptions": { + "types": ["bun-types", "node"], + "noUncheckedIndexedAccess": true, + "emitDeclarationOnly": false, + "noEmit": true + }, + "include": ["src", "test"], + "exclude": ["node_modules", "dist"] +} diff --git a/tooling/typescript/base.json b/tooling/typescript/base.json index b73be1ebd38..2d56a6cf542 100644 --- a/tooling/typescript/base.json +++ 
b/tooling/typescript/base.json @@ -20,7 +20,8 @@ "module": "Preserve", "moduleResolution": "Bundler", - "noEmit": true + "noEmit": true, + "allowImportingTsExtensions": true }, "exclude": ["node_modules", "build", "dist", ".next"] }