From 3e329bb59e2c41f96797a647709cac7dc0185f19 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 17:14:27 -0700 Subject: [PATCH 01/33] feat(pty-daemon): standalone PTY daemon package (Phase 1, skeleton) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New package @superset/pty-daemon implementing the long-lived PTY-owning process described in apps/desktop/plans/20260429-pty-daemon-implementation.md. This PR adds the daemon in isolation; host-service integration lands in a follow-up PR so both can be reviewed independently. What's in: - Versioned Unix-socket protocol (length-prefixed JSON frames; hello/ack handshake; open/input/resize/close/list/subscribe/unsubscribe ops). - Pty wrapper around node-pty with dim validation. - SessionStore: in-memory map + 64KB ring buffer per session. No persistence — explicitly out of scope per the v1 lessons. - Server: AF_UNIX SOCK_STREAM accept loop, file-mode 0600 auth boundary, per-connection subscription set, output/exit fan-out. - Handlers: pure functions over (store, conn, msg). Stateless from the client's perspective. - main.ts entrypoint: argv parsing, signal handling, graceful shutdown. Runtime: Node ≥ 20, not Bun. Verified during implementation that node-pty 1.1's master fd setup is incompatible with Bun 1.3's tty.ReadStream (onData/onExit silently never fire). Daemon ships as a Node script in the desktop app bundle; host-service stays on Bun. Tests: 24 unit tests under bun test (protocol framing, SessionStore, handlers with a fake spawn), 6 integration tests under node --test spawning real shells through real Unix sockets. All green. What's NOT in (separate PRs): - Host-service DaemonClient + terminal.ts refactor + manifest adoption. - Daemon-upgrade fd inheritance handoff (Phase 2). - Renderer / WS / tRPC changes (none required; the renderer is unchanged). 
--- bun.lock | 18 ++ packages/pty-daemon/README.md | 85 ++++++ packages/pty-daemon/package.json | 38 +++ packages/pty-daemon/src/Pty/Pty.test.ts | 34 +++ packages/pty-daemon/src/Pty/Pty.ts | 84 ++++++ packages/pty-daemon/src/Pty/index.ts | 2 + packages/pty-daemon/src/Server/Server.ts | 247 +++++++++++++++++ packages/pty-daemon/src/Server/index.ts | 1 + .../src/SessionStore/SessionStore.test.ts | 82 ++++++ .../src/SessionStore/SessionStore.ts | 104 ++++++++ packages/pty-daemon/src/SessionStore/index.ts | 2 + .../pty-daemon/src/handlers/handlers.test.ts | 216 +++++++++++++++ packages/pty-daemon/src/handlers/handlers.ts | 148 +++++++++++ packages/pty-daemon/src/handlers/index.ts | 10 + packages/pty-daemon/src/index.ts | 7 + packages/pty-daemon/src/main.ts | 77 ++++++ .../pty-daemon/src/protocol/framing.test.ts | 49 ++++ packages/pty-daemon/src/protocol/framing.ts | 56 ++++ packages/pty-daemon/src/protocol/index.ts | 26 ++ packages/pty-daemon/src/protocol/messages.ts | 140 ++++++++++ packages/pty-daemon/src/protocol/version.ts | 4 + packages/pty-daemon/test/integration.test.ts | 248 ++++++++++++++++++ packages/pty-daemon/tsconfig.json | 12 + 23 files changed, 1690 insertions(+) create mode 100644 packages/pty-daemon/README.md create mode 100644 packages/pty-daemon/package.json create mode 100644 packages/pty-daemon/src/Pty/Pty.test.ts create mode 100644 packages/pty-daemon/src/Pty/Pty.ts create mode 100644 packages/pty-daemon/src/Pty/index.ts create mode 100644 packages/pty-daemon/src/Server/Server.ts create mode 100644 packages/pty-daemon/src/Server/index.ts create mode 100644 packages/pty-daemon/src/SessionStore/SessionStore.test.ts create mode 100644 packages/pty-daemon/src/SessionStore/SessionStore.ts create mode 100644 packages/pty-daemon/src/SessionStore/index.ts create mode 100644 packages/pty-daemon/src/handlers/handlers.test.ts create mode 100644 packages/pty-daemon/src/handlers/handlers.ts create mode 100644 packages/pty-daemon/src/handlers/index.ts create 
mode 100644 packages/pty-daemon/src/index.ts create mode 100644 packages/pty-daemon/src/main.ts create mode 100644 packages/pty-daemon/src/protocol/framing.test.ts create mode 100644 packages/pty-daemon/src/protocol/framing.ts create mode 100644 packages/pty-daemon/src/protocol/index.ts create mode 100644 packages/pty-daemon/src/protocol/messages.ts create mode 100644 packages/pty-daemon/src/protocol/version.ts create mode 100644 packages/pty-daemon/test/integration.test.ts create mode 100644 packages/pty-daemon/tsconfig.json diff --git a/bun.lock b/bun.lock index e1f79454908..b58736baab0 100644 --- a/bun.lock +++ b/bun.lock @@ -868,6 +868,22 @@ "typescript": "^5.9.3", }, }, + "packages/pty-daemon": { + "name": "@superset/pty-daemon", + "version": "0.1.0", + "bin": { + "pty-daemon": "./src/main.ts", + }, + "dependencies": { + "node-pty": "1.1.0", + }, + "devDependencies": { + "@superset/typescript": "workspace:*", + "@types/node": "^24.9.1", + "bun-types": "^1.3.1", + "typescript": "^5.9.3", + }, + }, "packages/shared": { "name": "@superset/shared", "version": "0.1.0", @@ -2573,6 +2589,8 @@ "@superset/port-scanner": ["@superset/port-scanner@workspace:packages/port-scanner"], + "@superset/pty-daemon": ["@superset/pty-daemon@workspace:packages/pty-daemon"], + "@superset/relay": ["@superset/relay@workspace:apps/relay"], "@superset/shared": ["@superset/shared@workspace:packages/shared"], diff --git a/packages/pty-daemon/README.md b/packages/pty-daemon/README.md new file mode 100644 index 00000000000..97c10567904 --- /dev/null +++ b/packages/pty-daemon/README.md @@ -0,0 +1,85 @@ +# @superset/pty-daemon + +Long-lived PTY-owning process for the v2 desktop terminal. host-service is a +client over a Unix socket; routine host-service upgrades don't touch shells. + +Implements [Phase 1 of the daemon plan](../../apps/desktop/plans/20260429-pty-daemon-implementation.md). 
+This package is **standalone**: it does not import from `@superset/host-service` +or any other workspace package. Host-service consumes only the protocol types +via `@superset/pty-daemon/protocol`. + +## Runtime + +**Node ≥ 20**, not Bun. node-pty's master fd handling is incompatible with +Bun's `tty.ReadStream` (verified: Bun 1.3, node-pty 1.1 — onData/onExit +silently never fire). The daemon ships as a Node script in the desktop app +bundle; host-service can stay on Bun. + +## Layout + +``` +src/ +├── main.ts # Node entrypoint: argv → Server.listen() +├── index.ts # Public exports for host-service consumers +├── protocol/ # Wire schemas + length-prefixed framing +│ ├── version.ts # CURRENT_PROTOCOL_VERSION + supported list +│ ├── messages.ts # ClientMessage / ServerMessage unions +│ ├── framing.ts # encodeFrame / FrameDecoder (4-byte BE prefix) +│ └── index.ts +├── Pty/ # node-pty thin wrapper with dim validation +│ ├── Pty.ts +│ └── index.ts +├── SessionStore/ # in-memory map + 64KB ring buffer per session +│ ├── SessionStore.ts +│ └── index.ts +├── handlers/ # pure functions: open/input/resize/close/list/subscribe +│ ├── handlers.ts +│ └── index.ts +└── Server/ # AF_UNIX SOCK_STREAM accept loop, handshake, dispatch + ├── Server.ts + └── index.ts + +test/ +└── integration.test.ts # node --test: real shells, real socket +``` + +## Design notes + +- **Stateless from the client's perspective.** Every protocol call carries + full context. No client tracking, no session tombstones, no business + rules. Single design principle from + [the implementation plan](../../apps/desktop/plans/20260429-pty-daemon-implementation.md#the-single-design-principle). +- **Auth boundary = Unix socket file mode 0600.** No in-band tokens. The + daemon trusts whoever can open the socket. +- **Buffer is in-memory only.** Survives host-service restarts (because the + daemon does), but never persisted to disk. No SQLite, no scrollback files. 
+ v1's `HistoryManager` is explicitly out of scope. +- **Protocol versioned from day one.** Handshake (`hello` / `hello-ack`) + picks the highest mutually supported version. + +## Testing + +```sh +bun test # unit tests (protocol, handlers, SessionStore, Pty validation) +bun run test:integration # end-to-end via node --test (spawns real shells) +``` + +Why two runners? `bun test` is fast for pure-JS work. node-pty doesn't work +under Bun, so anything that spawns a real PTY runs under Node. + +## Running locally + +```sh +bun run start --socket=/tmp/pty-daemon.sock +``` + +Logs go to stderr; stdout stays empty (so the daemon can later be supervised +by host-service with stdout reserved for protocol or kept dark). + +## Out of scope (Phase 1) + +- Host-service integration (DaemonClient, terminal.ts refactor, manifest + adoption) — separate PR. +- Daemon-upgrade handoff via `child_process.spawn` `stdio` fd inheritance + — separate PR (Phase 2 of the plan). +- Windows ConPTY — not in v1 protocol; defer until Windows users justify it. 
diff --git a/packages/pty-daemon/package.json b/packages/pty-daemon/package.json new file mode 100644 index 00000000000..b5193c440ab --- /dev/null +++ b/packages/pty-daemon/package.json @@ -0,0 +1,38 @@ +{ + "name": "@superset/pty-daemon", + "version": "0.1.0", + "private": true, + "type": "module", + "exports": { + ".": { + "types": "./src/index.ts", + "default": "./src/index.ts" + }, + "./protocol": { + "types": "./src/protocol/index.ts", + "default": "./src/protocol/index.ts" + } + }, + "bin": { + "pty-daemon": "./src/main.ts" + }, + "engines": { + "node": ">=20" + }, + "scripts": { + "clean": "git clean -xdf .cache .turbo dist node_modules", + "start": "node --experimental-strip-types src/main.ts", + "typecheck": "tsc --noEmit --emitDeclarationOnly false", + "test": "bun test src/protocol src/SessionStore src/handlers src/Pty/Pty.test.ts", + "test:integration": "node --experimental-strip-types --test test/integration.test.ts" + }, + "dependencies": { + "node-pty": "1.1.0" + }, + "devDependencies": { + "@superset/typescript": "workspace:*", + "@types/node": "^24.9.1", + "bun-types": "^1.3.1", + "typescript": "^5.9.3" + } +} diff --git a/packages/pty-daemon/src/Pty/Pty.test.ts b/packages/pty-daemon/src/Pty/Pty.test.ts new file mode 100644 index 00000000000..0cbad51032b --- /dev/null +++ b/packages/pty-daemon/src/Pty/Pty.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, test } from "bun:test"; +import { spawn } from "./Pty.ts"; + +// node-pty's runtime requires Node (Bun's tty.ReadStream handling is +// incompatible with the master fd setup). The daemon ships running under +// node; integration spawn tests live in test/integration.test.ts and run via +// `bun run test:integration`. Here we only cover the synchronous validation +// logic that doesn't require spawning a real PTY. 
/** Callback receiving one chunk of raw PTY output. */
export type PtyOnData = (data: Buffer) => void;

/** Callback receiving the child's exit status; either field may be null. */
export type PtyOnExit = (info: {
	code: number | null;
	signal: number | null;
}) => void;

/**
 * The surface of a running pseudo-terminal that the rest of the daemon
 * depends on. Tests substitute a fake that implements this interface.
 */
export interface Pty {
	readonly pid: number;
	/** Spawn-time metadata; `cols` / `rows` track the latest successful resize. */
	readonly meta: SessionMeta;
	write(data: Buffer): void;
	resize(cols: number, rows: number): void;
	kill(signal?: NodeJS.Signals): void;
	onData(cb: PtyOnData): void;
	onExit(cb: PtyOnExit): void;
}

export interface SpawnOptions {
	meta: SessionMeta;
}

/**
 * Largest accepted terminal dimension. The kernel's `struct winsize` stores
 * columns and rows as unsigned 16-bit values, so anything larger would be
 * truncated rather than rejected further down the stack.
 */
const MAX_PTY_DIM = 0xffff;

/** Adapts a node-pty terminal to the daemon's `Pty` interface. */
class NodePtyAdapter implements Pty {
	readonly pid: number;
	meta: SessionMeta;
	private readonly term: nodePty.IPty;

	constructor(term: nodePty.IPty, meta: SessionMeta) {
		this.term = term;
		this.pid = term.pid;
		this.meta = meta;
	}

	/** Forward raw bytes to the terminal. */
	write(data: Buffer): void {
		// The buffer is passed through as-is (no string conversion) to keep
		// bytes intact; the cast only satisfies node-pty's declared signature.
		this.term.write(data as unknown as string);
	}

	/**
	 * Resize the terminal and record the new dimensions in `meta`.
	 *
	 * @throws Error if either dimension fails `validateDims`; `meta` is left
	 *   untouched in that case.
	 */
	resize(cols: number, rows: number): void {
		validateDims(cols, rows);
		this.term.resize(cols, rows);
		this.meta = { ...this.meta, cols, rows };
	}

	kill(signal?: NodeJS.Signals): void {
		this.term.kill(signal);
	}

	/** Register an output listener; string chunks are normalised to Buffers. */
	onData(cb: PtyOnData): void {
		this.term.onData((d) => {
			cb(typeof d === "string" ? Buffer.from(d, "utf8") : d);
		});
	}

	/** Register an exit listener; missing fields are reported as null. */
	onExit(cb: PtyOnExit): void {
		this.term.onExit(({ exitCode, signal }) => {
			cb({ code: exitCode ?? null, signal: signal ?? null });
		});
	}
}

/** True when `value` is an integer in the range 1..MAX_PTY_DIM. */
function isValidDim(value: number): boolean {
	return Number.isInteger(value) && value > 0 && value <= MAX_PTY_DIM;
}

/**
 * Validate terminal dimensions.
 *
 * @throws Error `invalid cols: <value>` or `invalid rows: <value>` when a
 *   dimension is not an integer in 1..65535. Columns are checked first.
 */
function validateDims(cols: number, rows: number): void {
	if (!isValidDim(cols)) {
		throw new Error(`invalid cols: ${cols}`);
	}
	if (!isValidDim(rows)) {
		throw new Error(`invalid rows: ${rows}`);
	}
}

/**
 * Spawn `meta.shell` with `meta.argv` in a new PTY sized `meta.cols` x
 * `meta.rows`, using `meta.cwd` and `meta.env` as given.
 *
 * @throws Error if the dimensions are invalid (checked before anything is
 *   spawned) or if node-pty fails to spawn.
 */
export function spawn({ meta }: SpawnOptions): Pty {
	validateDims(meta.cols, meta.rows);
	const term = nodePty.spawn(meta.shell, meta.argv, {
		name: "xterm-256color",
		cols: meta.cols,
		rows: meta.rows,
		cwd: meta.cwd,
		env: meta.env,
		// node-pty's encoding defaults to utf8; we want raw bytes for fidelity.
		encoding: null,
	});
	return new NodePtyAdapter(term, meta);
}
export interface ServerOptions {
	/** Filesystem path of the Unix socket to listen on. */
	socketPath: string;
	/** Reported to clients in the `hello-ack` frame. */
	daemonVersion: string;
	/** Forwarded to the SessionStore as its `bufferCap` option. */
	bufferCap?: number;
}

interface ConnState extends Conn {
	socket: net.Socket;
	decoder: FrameDecoder;
	/** Protocol version agreed in the handshake; null until `hello` is accepted. */
	negotiated: number | null;
}

/** umask that clears every group/other bit, so new files are owner-only. */
const OWNER_ONLY_UMASK = 0o177;

/**
 * Unix-socket front end of the daemon: accepts connections, performs the
 * hello / hello-ack handshake, dispatches client frames to the handlers and
 * fans PTY output / exit events out to subscribed connections.
 */
export class Server {
	private readonly server: net.Server;
	private readonly store: SessionStore;
	private readonly conns = new Set<ConnState>();
	private readonly opts: ServerOptions;

	constructor(opts: ServerOptions) {
		this.opts = opts;
		this.store = new SessionStore({ bufferCap: opts.bufferCap });
		this.server = net.createServer((socket) => this.onConnection(socket));
	}

	/**
	 * Create the socket's parent directory, remove any file already at the
	 * socket path, and start listening.
	 *
	 * @throws if the old socket file cannot be removed (other than ENOENT), or
	 *   if the server fails to bind.
	 */
	async listen(): Promise<void> {
		const dir = path.dirname(this.opts.socketPath);
		fs.mkdirSync(dir, { recursive: true });
		// Stale-socket cleanup: remove any prior socket file at this path.
		// NOTE(review): this is unconditional — it would also unlink the socket
		// of a daemon that is still running at the same path.
		try {
			fs.unlinkSync(this.opts.socketPath);
		} catch (err) {
			if ((err as NodeJS.ErrnoException).code !== "ENOENT") throw err;
		}
		await new Promise<void>((resolve, reject) => {
			this.server.once("error", reject);
			// The socket file IS the auth boundary, so it must never exist with
			// wider permissions, not even between bind and the chmod below.
			// Tighten the umask for the duration of the listen() call (the bind
			// is expected to happen synchronously inside it) and restore it
			// immediately afterwards.
			const previousUmask = process.umask(OWNER_ONLY_UMASK);
			try {
				this.server.listen(this.opts.socketPath, () => {
					this.server.off("error", reject);
					resolve();
				});
			} finally {
				process.umask(previousUmask);
			}
		});
		// Owner-only access. Kept as a second line of defence in case the
		// socket was created outside the umask window above.
		fs.chmodSync(this.opts.socketPath, 0o600);
	}

	/**
	 * Destroy every client connection, stop accepting new ones, and remove the
	 * socket file (best effort). Sessions in the store are not touched here.
	 */
	async close(): Promise<void> {
		for (const c of this.conns) c.socket.destroy();
		this.conns.clear();
		await new Promise<void>((resolve) => this.server.close(() => resolve()));
		try {
			fs.unlinkSync(this.opts.socketPath);
		} catch {
			// Best effort: the file may already be gone.
		}
	}

	/** Track a new connection and feed its bytes through the frame decoder. */
	private onConnection(socket: net.Socket): void {
		const conn: ConnState = {
			socket,
			decoder: new FrameDecoder(),
			negotiated: null,
			subscriptions: new Set<string>(),
			send: (msg) => writeMessage(socket, msg),
		};
		this.conns.add(conn);

		socket.on("data", (chunk) => {
			try {
				conn.decoder.push(chunk);
				for (const raw of conn.decoder.drain()) {
					this.dispatch(conn, raw as ClientMessage);
				}
			} catch (err) {
				// Anything thrown while decoding or dispatching is treated as a
				// protocol violation: report it and drop the connection.
				conn.send({
					type: "error",
					message: err instanceof Error ? err.message : String(err),
					code: "EPROTO",
				});
				socket.destroy();
			}
		});
		socket.on("close", () => {
			this.conns.delete(conn);
		});
		socket.on("error", () => {
			this.conns.delete(conn);
		});
	}

	/**
	 * Route one decoded client frame. Until the handshake completes only
	 * `hello` is accepted; anything else closes the connection.
	 */
	private dispatch(conn: ConnState, msg: ClientMessage): void {
		// Handshake must come first.
		if (conn.negotiated === null) {
			if (msg.type !== "hello") {
				conn.send({ type: "error", message: "expected hello", code: "EPROTO" });
				conn.socket.destroy();
				return;
			}
			const negotiated = pickProtocol(msg);
			if (negotiated === null) {
				conn.send({
					type: "error",
					message: `no compatible protocol; daemon supports ${SUPPORTED_PROTOCOL_VERSIONS.join(",")}`,
					code: "EVERSION",
				});
				conn.socket.destroy();
				return;
			}
			conn.negotiated = negotiated;
			conn.send({
				type: "hello-ack",
				protocol: negotiated,
				daemonVersion: this.opts.daemonVersion,
			});
			return;
		}

		const ctx = this.handlerCtx();
		switch (msg.type) {
			case "hello": {
				conn.send({
					type: "error",
					message: "duplicate hello",
					code: "EPROTO",
				});
				return;
			}
			case "open": {
				conn.send(handleOpen(ctx, msg));
				return;
			}
			case "input": {
				// input / resize only reply on failure.
				const reply = handleInput(ctx, msg);
				if (reply) conn.send(reply);
				return;
			}
			case "resize": {
				const reply = handleResize(ctx, msg);
				if (reply) conn.send(reply);
				return;
			}
			case "close": {
				conn.send(handleClose(ctx, msg));
				return;
			}
			case "list": {
				conn.send(handleList(ctx));
				return;
			}
			case "subscribe": {
				handleSubscribe(ctx, conn, msg);
				return;
			}
			case "unsubscribe": {
				handleUnsubscribe(conn, msg);
				return;
			}
			default: {
				// Frames are untrusted JSON, so a type outside the union can
				// still arrive at runtime.
				const t = (msg as { type: string }).type;
				conn.send({
					type: "error",
					message: `unknown op: ${t}`,
					code: "EPROTO",
				});
				return;
			}
		}
	}

	/** Build the context object handed to every handler call. */
	private handlerCtx(): HandlerCtx {
		return {
			store: this.store,
			wireSession: (session) => this.wireSession(session),
		};
	}

	/**
	 * Pipe the session's PTY events into the broadcast set: any connection
	 * subscribed to this session id receives the output / exit frames.
	 * Output is also appended to the session's replay buffer.
	 */
	private wireSession(session: Session): void {
		session.pty.onData((chunk) => {
			this.store.appendOutput(session, chunk);
			const out: ServerMessage = {
				type: "output",
				id: session.id,
				data: chunk.toString("base64"),
			};
			for (const c of this.conns) {
				if (c.subscriptions.has(session.id)) c.send(out);
			}
		});
		session.pty.onExit((info) => {
			session.exited = true;
			session.exitCode = info.code;
			session.exitSignal = info.signal;
			const ev: ServerMessage = {
				type: "exit",
				id: session.id,
				code: info.code,
				signal: info.signal,
			};
			for (const c of this.conns) {
				if (c.subscriptions.has(session.id)) c.send(ev);
			}
			// The session row is deliberately left in the store so a late
			// subscriber can still replay the buffer; this method never removes it.
		});
	}
}

/**
 * Pick the highest protocol version that both the client's `hello` and this
 * daemon support.
 *
 * @returns the chosen version, or null when there is no overlap or the
 *   `protocols` field is not an array.
 *
 * NOTE(review): this function no longer references CURRENT_PROTOCOL_VERSION
 * (its only previous use was a dead `? null : null` expression); drop it from
 * the import list if nothing else in the file needs it.
 */
function pickProtocol(hello: HelloMessage): number | null {
	const offered: unknown = hello.protocols;
	// The frame is untrusted JSON: a missing or non-array field means "nothing
	// offered" rather than a TypeError.
	if (!Array.isArray(offered)) return null;
	const supported = new Set<number>(SUPPORTED_PROTOCOL_VERSIONS);
	let best: number | null = null;
	for (const v of offered) {
		if (supported.has(v) && (best === null || v > best)) best = v;
	}
	return best;
}

/** Frame and write a message; silently skipped once the socket is unusable. */
function writeMessage(socket: net.Socket, msg: ServerMessage): void {
	if (socket.destroyed || !socket.writable) return;
	socket.write(encodeFrame(msg));
}
const DEFAULT_BUFFER_BYTES = 64 * 1024;

export interface Session {
	id: string;
	pty: Pty;
	/** Replay buffer for attach: chunks oldest-first; in-memory only, never persisted. */
	buffer: Buffer[];
	/** Sum of the byte lengths of the chunks in `buffer`. */
	bufferBytes: number;
	/** Upper bound for `bufferBytes`. */
	bufferCap: number;
	exited: boolean;
	exitCode: number | null;
	exitSignal: number | null;
}

export interface SessionStoreOptions {
	/** Per-session replay cap in bytes; defaults to 64 KiB. */
	bufferCap?: number;
}

/**
 * In-memory map of sessions. Daemon-local state; nothing is persisted.
 *
 * Each session carries a FIFO of output chunks capped by total byte size.
 * When new output would exceed the cap, the oldest chunks are dropped whole.
 * A single chunk that is at least as large as the cap is trimmed to its
 * trailing `bufferCap` bytes, so the newest output is always what is kept.
 */
export class SessionStore {
	private readonly sessions = new Map<string, Session>();
	private readonly bufferCap: number;

	constructor(opts: SessionStoreOptions = {}) {
		this.bufferCap = opts.bufferCap ?? DEFAULT_BUFFER_BYTES;
	}

	/**
	 * Register a new session with an empty replay buffer.
	 *
	 * @throws Error `session already exists: <id>` if the id is taken.
	 */
	add(id: string, pty: Pty): Session {
		if (this.sessions.has(id)) {
			throw new Error(`session already exists: ${id}`);
		}
		const session: Session = {
			id,
			pty,
			buffer: [],
			bufferBytes: 0,
			bufferCap: this.bufferCap,
			exited: false,
			exitCode: null,
			exitSignal: null,
		};
		this.sessions.set(id, session);
		return session;
	}

	get(id: string): Session | undefined {
		return this.sessions.get(id);
	}

	/** Remove a session row; returns false if the id was unknown. */
	delete(id: string): boolean {
		return this.sessions.delete(id);
	}

	/** Wire-level summary of every session, including exited ones (`alive: false`). */
	list(): SessionInfo[] {
		const out: SessionInfo[] = [];
		for (const s of this.sessions.values()) {
			out.push({
				id: s.id,
				pid: s.pty.pid,
				cols: s.pty.meta.cols,
				rows: s.pty.meta.rows,
				alive: !s.exited,
			});
		}
		return out;
	}

	all(): IterableIterator<Session> {
		return this.sessions.values();
	}

	size(): number {
		return this.sessions.size;
	}

	/**
	 * Append output to a session's replay buffer, keeping `bufferBytes` at or
	 * below `bufferCap`.
	 *
	 * - Empty chunks are ignored (they would otherwise pile up in `buffer`
	 *   without ever triggering eviction).
	 * - A chunk at least as large as the cap replaces the whole buffer with a
	 *   copy of its last `bufferCap` bytes.
	 * - Otherwise the chunk is appended and the oldest chunks are evicted
	 *   whole until the total fits.
	 */
	appendOutput(session: Session, chunk: Buffer): void {
		const size = chunk.byteLength;
		if (size === 0) return;

		const cap = session.bufferCap;
		if (size >= cap) {
			if (cap <= 0) {
				// Nothing can be retained with a non-positive cap.
				session.buffer = [];
				session.bufferBytes = 0;
				return;
			}
			// Copy the tail so the oversized chunk itself is not kept alive.
			const tail = Buffer.from(chunk.subarray(size - cap));
			session.buffer = [tail];
			session.bufferBytes = tail.byteLength;
			return;
		}

		session.buffer.push(chunk);
		session.bufferBytes += size;
		while (
			session.bufferBytes > session.bufferCap &&
			session.buffer.length > 0
		) {
			const head = session.buffer.shift();
			if (head) session.bufferBytes -= head.byteLength;
		}
	}

	/** Snapshot the buffered bytes for replay; doesn't clear the buffer. */
	snapshotBuffer(session: Session): Buffer {
		return Buffer.concat(session.buffer);
	}
}
=> {}, + onExit: () => {}, + }; +} + +function makeConn(): Conn & { sent: ServerMessage[] } { + const sent: ServerMessage[] = []; + return { + sent, + subscriptions: new Set(), + send: (m) => sent.push(m), + }; +} + +let nextPid = 1000; +let states: FakePtyState[] = []; +let wired: Pty[] = []; + +function makeCtx(): HandlerCtx & { + spawnedStates: FakePtyState[]; + wired: Pty[]; +} { + const store = new SessionStore(); + return { + store, + spawnedStates: states, + wired, + wireSession: (s) => { + wired.push(s.pty); + }, + spawnPty: (opts) => { + const state: FakePtyState = { + pid: nextPid++, + cols: opts.meta.cols, + rows: opts.meta.rows, + written: [], + killed: false, + }; + states.push(state); + return makeFakePty(state, opts.meta); + }, + }; +} + +beforeEach(() => { + nextPid = 1000; + states = []; + wired = []; +}); + +describe("handlers", () => { + test("open: spawns a session and replies open-ok", () => { + const ctx = makeCtx(); + const reply = handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + expect(reply.type).toBe("open-ok"); + if (reply.type === "open-ok") expect(reply.pid).toBe(1000); + expect(ctx.store.size()).toBe(1); + expect(ctx.wired).toHaveLength(1); + }); + + test("open: rejects duplicate ids", () => { + const ctx = makeCtx(); + const meta = { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }; + handleOpen(ctx, { type: "open", id: "s0", meta }); + const reply = handleOpen(ctx, { type: "open", id: "s0", meta }); + expect(reply.type).toBe("error"); + }); + + test("input writes bytes to the pty", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const result = handleInput(ctx, { + type: "input", + id: "s0", + data: Buffer.from("hello").toString("base64"), + }); + expect(result).toBeUndefined(); + expect(states[0]?.written.map((b) => b.toString())).toEqual(["hello"]); + }); + + 
test("input on missing session returns error", () => { + const ctx = makeCtx(); + const result = handleInput(ctx, { + type: "input", + id: "missing", + data: "", + }); + expect(result?.type).toBe("error"); + }); + + test("resize updates dims", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + expect( + handleResize(ctx, { type: "resize", id: "s0", cols: 100, rows: 30 }), + ).toBeUndefined(); + expect(states[0]?.cols).toBe(100); + expect(states[0]?.rows).toBe(30); + }); + + test("close kills the pty and replies closed", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const reply = handleClose(ctx, { type: "close", id: "s0" }); + expect(reply.type).toBe("closed"); + expect(states[0]?.killed).toBe(true); + }); + + test("list returns all sessions", () => { + const ctx = makeCtx(); + const meta = { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }; + handleOpen(ctx, { type: "open", id: "a", meta }); + handleOpen(ctx, { type: "open", id: "b", meta }); + const reply = handleList(ctx); + expect(reply.sessions).toHaveLength(2); + }); + + test("subscribe with replay sends buffered output", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const session = ctx.store.get("s0"); + if (!session) throw new Error("no session"); + ctx.store.appendOutput(session, Buffer.from("prior bytes")); + + const conn = makeConn(); + handleSubscribe(ctx, conn, { type: "subscribe", id: "s0", replay: true }); + expect(conn.subscriptions.has("s0")).toBe(true); + expect(conn.sent).toHaveLength(1); + const m = conn.sent[0]; + expect(m?.type).toBe("output"); + if (m?.type === "output") { + expect(Buffer.from(m.data, "base64").toString()).toBe("prior bytes"); + } + }); + + test("subscribe without replay does 
not send buffered output", () => { + const ctx = makeCtx(); + handleOpen(ctx, { + type: "open", + id: "s0", + meta: { shell: "/bin/sh", argv: [], cols: 80, rows: 24 }, + }); + const session = ctx.store.get("s0"); + if (!session) throw new Error("no session"); + ctx.store.appendOutput(session, Buffer.from("prior bytes")); + + const conn = makeConn(); + handleSubscribe(ctx, conn, { type: "subscribe", id: "s0", replay: false }); + expect(conn.subscriptions.has("s0")).toBe(true); + expect(conn.sent).toHaveLength(0); + }); + + test("unsubscribe removes from conn.subscriptions", () => { + const conn = makeConn(); + conn.subscriptions.add("s0"); + handleUnsubscribe(conn, { type: "unsubscribe", id: "s0" }); + expect(conn.subscriptions.has("s0")).toBe(false); + }); +}); diff --git a/packages/pty-daemon/src/handlers/handlers.ts b/packages/pty-daemon/src/handlers/handlers.ts new file mode 100644 index 00000000000..c752982493e --- /dev/null +++ b/packages/pty-daemon/src/handlers/handlers.ts @@ -0,0 +1,148 @@ +import { + spawn as defaultSpawn, + type Pty, + type SpawnOptions, +} from "../Pty/index.ts"; +import type { + CloseMessage, + InputMessage, + ListReplyMessage, + OpenMessage, + OpenOkMessage, + OutputMessage, + ResizeMessage, + ServerMessage, + SubscribeMessage, + UnsubscribeMessage, +} from "../protocol/index.ts"; +import type { Session, SessionStore } from "../SessionStore/index.ts"; + +/** + * Per-connection state owned by the Server. Handlers receive a Conn ref to + * read/write subscription membership and to send messages. + */ +export interface Conn { + subscriptions: Set; + send(message: ServerMessage): void; +} + +/** + * Wire a freshly-created session's PTY events into the broadcast pipeline. + * Called once at session-open time. The Server owns the broadcast set. 
+ */ +export type SessionWirer = (session: Session) => void; + +export interface HandlerCtx { + store: SessionStore; + wireSession: SessionWirer; + /** Pluggable spawn for testability; defaults to real node-pty in production. */ + spawnPty?: (opts: SpawnOptions) => Pty; +} + +export function handleOpen(ctx: HandlerCtx, msg: OpenMessage): ServerMessage { + if (ctx.store.get(msg.id)) { + return errorFor(msg.id, `session already exists: ${msg.id}`, "EEXIST"); + } + let session: Session; + const spawnFn = ctx.spawnPty ?? defaultSpawn; + try { + const pty = spawnFn({ meta: msg.meta }); + session = ctx.store.add(msg.id, pty); + } catch (err) { + return errorFor(msg.id, (err as Error).message, "ESPAWN"); + } + ctx.wireSession(session); + const reply: OpenOkMessage = { + type: "open-ok", + id: msg.id, + pid: session.pty.pid, + }; + return reply; +} + +export function handleInput( + ctx: HandlerCtx, + msg: InputMessage, +): ServerMessage | undefined { + const session = ctx.store.get(msg.id); + if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT"); + if (session.exited) + return errorFor(msg.id, `session exited: ${msg.id}`, "EEXITED"); + try { + session.pty.write(Buffer.from(msg.data, "base64")); + } catch (err) { + return errorFor(msg.id, (err as Error).message, "EWRITE"); + } + return undefined; +} + +export function handleResize( + ctx: HandlerCtx, + msg: ResizeMessage, +): ServerMessage | undefined { + const session = ctx.store.get(msg.id); + if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT"); + try { + session.pty.resize(msg.cols, msg.rows); + } catch (err) { + return errorFor(msg.id, (err as Error).message, "ERESIZE"); + } + return undefined; +} + +export function handleClose(ctx: HandlerCtx, msg: CloseMessage): ServerMessage { + const session = ctx.store.get(msg.id); + if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT"); + try { + session.pty.kill(msg.signal ?? 
"SIGTERM"); + } catch (err) { + return errorFor(msg.id, (err as Error).message, "EKILL"); + } + return { type: "closed", id: msg.id }; +} + +export function handleList(ctx: HandlerCtx): ListReplyMessage { + return { type: "list-reply", sessions: ctx.store.list() }; +} + +/** + * Subscribe the connection to a session. If `replay` is true, immediately + * send an `output` frame containing the buffered bytes before live streaming + * begins. Live streaming is the Server's job once `subscriptions` includes + * this session id. + */ +export function handleSubscribe( + ctx: HandlerCtx, + conn: Conn, + msg: SubscribeMessage, +): void { + const session = ctx.store.get(msg.id); + if (!session) { + conn.send(errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT")); + return; + } + conn.subscriptions.add(msg.id); + if (msg.replay) { + const snap = ctx.store.snapshotBuffer(session); + if (snap.byteLength > 0) { + const out: OutputMessage = { + type: "output", + id: msg.id, + data: snap.toString("base64"), + }; + conn.send(out); + } + } +} + +export function handleUnsubscribe(conn: Conn, msg: UnsubscribeMessage): void { + conn.subscriptions.delete(msg.id); +} + +function errorFor( + id: string | undefined, + message: string, + code?: string, +): ServerMessage { + return { type: "error", id, message, code }; +} diff --git a/packages/pty-daemon/src/handlers/index.ts b/packages/pty-daemon/src/handlers/index.ts new file mode 100644 index 00000000000..97b7935d62a --- /dev/null +++ b/packages/pty-daemon/src/handlers/index.ts @@ -0,0 +1,10 @@ +export type { Conn, HandlerCtx, SessionWirer } from "./handlers.ts"; +export { + handleClose, + handleInput, + handleList, + handleOpen, + handleResize, + handleSubscribe, + handleUnsubscribe, +} from "./handlers.ts"; diff --git a/packages/pty-daemon/src/index.ts b/packages/pty-daemon/src/index.ts new file mode 100644 index 00000000000..ec0b81b0324 --- /dev/null +++ b/packages/pty-daemon/src/index.ts @@ -0,0 +1,7 @@ +// Public package surface — 
host-service imports from "@superset/pty-daemon" or +// "@superset/pty-daemon/protocol". Daemon implementation runtime is Node; +// host-service is a CLIENT of the daemon (importing protocol types only), +// not a runtime peer. + +export { Server, type ServerOptions } from "./Server/index.ts"; +export type { Session } from "./SessionStore/index.ts"; diff --git a/packages/pty-daemon/src/main.ts b/packages/pty-daemon/src/main.ts new file mode 100644 index 00000000000..d40bd03fe1e --- /dev/null +++ b/packages/pty-daemon/src/main.ts @@ -0,0 +1,77 @@ +#!/usr/bin/env node +// pty-daemon entrypoint. Runs under Node (node-pty + Bun's tty.ReadStream +// don't get along; see the design doc). +// +// Usage: +// pty-daemon --socket=/path/to/sock [--buffer-bytes=65536] +// +// Logs go to stderr; nothing on stdout. + +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; +import { Server } from "./Server/index.ts"; + +interface CliArgs { + socket: string; + bufferBytes?: number; +} + +function parseArgs(argv: string[]): CliArgs { + const args: Partial<CliArgs> = {}; + for (const arg of argv) { + if (arg.startsWith("--socket=")) + args.socket = arg.slice("--socket=".length); + else if (arg.startsWith("--buffer-bytes=")) { + args.bufferBytes = Number.parseInt( + arg.slice("--buffer-bytes=".length), + 10, + ); + } + } + if (!args.socket) { + throw new Error("--socket=PATH is required"); + } + return args as CliArgs; +} + +async function main(): Promise<void> { + const args = parseArgs(process.argv.slice(2)); + const daemonVersion = readPackageVersion(); + const server = new Server({ + socketPath: args.socket, + daemonVersion, + bufferCap: args.bufferBytes, + }); + await server.listen(); + process.stderr.write( + `[pty-daemon] listening on ${args.socket} (v${daemonVersion}, host=${os.hostname()})\n`, + ); + + const shutdown = async (signal: NodeJS.Signals) => { + process.stderr.write(`[pty-daemon] received
${signal}, shutting down\n`); + await server.close(); + process.exit(0); + }; + process.on("SIGINT", () => void shutdown("SIGINT")); + process.on("SIGTERM", () => void shutdown("SIGTERM")); +} + +function readPackageVersion(): string { + try { + const here = path.dirname(fileURLToPath(import.meta.url)); + const pkgPath = path.resolve(here, "..", "package.json"); + const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf8")) as { + version?: string; + }; + return pkg.version ?? "0.0.0"; + } catch { + return "0.0.0"; + } +} + +main().catch((err) => { + process.stderr.write(`[pty-daemon] fatal: ${(err as Error).stack ?? err}\n`); + process.exit(1); +}); diff --git a/packages/pty-daemon/src/protocol/framing.test.ts b/packages/pty-daemon/src/protocol/framing.test.ts new file mode 100644 index 00000000000..0d3c947c766 --- /dev/null +++ b/packages/pty-daemon/src/protocol/framing.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, test } from "bun:test"; +import { decodeFrame, encodeFrame, FrameDecoder } from "./framing.ts"; + +describe("framing", () => { + test("round-trips a simple object", () => { + const msg = { type: "hello", protocols: [1] }; + const frame = encodeFrame(msg); + expect(decodeFrame(frame)).toEqual(msg); + }); + + test("round-trips through FrameDecoder", () => { + const a = { type: "open", id: "s0" }; + const b = { type: "input", id: "s0", data: "aGk=" }; + const dec = new FrameDecoder(); + dec.push(Buffer.concat([encodeFrame(a), encodeFrame(b)])); + expect(dec.drain()).toEqual([a, b]); + }); + + test("FrameDecoder buffers across chunks", () => { + const msg = { type: "open", id: "s0" }; + const full = encodeFrame(msg); + const dec = new FrameDecoder(); + dec.push(full.subarray(0, 2)); + expect(dec.drain()).toEqual([]); + dec.push(full.subarray(2, 6)); + expect(dec.drain()).toEqual([]); + dec.push(full.subarray(6)); + expect(dec.drain()).toEqual([msg]); + }); + + test("FrameDecoder handles partial frame after a complete one", () => { + const a = { type: 
"open", id: "s0" }; + const b = { type: "open", id: "s1" }; + const buf = Buffer.concat([encodeFrame(a), encodeFrame(b)]); + const dec = new FrameDecoder(); + dec.push(buf.subarray(0, encodeFrame(a).length + 3)); + expect(dec.drain()).toEqual([a]); + dec.push(buf.subarray(encodeFrame(a).length + 3)); + expect(dec.drain()).toEqual([b]); + }); + + test("rejects oversized frames", () => { + const bigHeader = Buffer.alloc(4); + bigHeader.writeUInt32BE(20 * 1024 * 1024, 0); // 20 MB + const dec = new FrameDecoder(); + dec.push(bigHeader); + expect(() => dec.drain()).toThrow(/frame too large/); + }); +}); diff --git a/packages/pty-daemon/src/protocol/framing.ts b/packages/pty-daemon/src/protocol/framing.ts new file mode 100644 index 00000000000..df678a0b76d --- /dev/null +++ b/packages/pty-daemon/src/protocol/framing.ts @@ -0,0 +1,56 @@ +// Length-prefixed binary frames over a SOCK_STREAM socket. +// +// Wire: [u32 BE length][JSON UTF-8 payload of that length] + +const HEADER_BYTES = 4; +const MAX_FRAME_BYTES = 8 * 1024 * 1024; // 8 MB hard cap; abort the connection above this. + +export function encodeFrame(message: unknown): Buffer { + const json = JSON.stringify(message); + const payload = Buffer.from(json, "utf8"); + const header = Buffer.alloc(HEADER_BYTES); + header.writeUInt32BE(payload.byteLength, 0); + return Buffer.concat([header, payload]); +} + +/** + * Streaming decoder. Feed bytes via `push`; iterate completed frames via `drain`. + * Throws on oversized frames so a malformed peer can't exhaust memory. + */ +export class FrameDecoder { + private buf: Buffer = Buffer.alloc(0); + + push(chunk: Buffer): void { + this.buf = this.buf.length === 0 ? 
chunk : Buffer.concat([this.buf, chunk]); + } + + drain(): unknown[] { + const out: unknown[] = []; + while (this.buf.length >= HEADER_BYTES) { + const len = this.buf.readUInt32BE(0); + if (len > MAX_FRAME_BYTES) { + throw new Error(`frame too large: ${len} bytes`); + } + if (this.buf.length < HEADER_BYTES + len) break; + const payload = this.buf.subarray(HEADER_BYTES, HEADER_BYTES + len); + out.push(JSON.parse(payload.toString("utf8"))); + this.buf = this.buf.subarray(HEADER_BYTES + len); + } + return out; + } +} + +/** + * One-shot decode of a buffer that contains exactly one complete frame. + * Used by tests; production reads use FrameDecoder. + */ +export function decodeFrame(buf: Buffer): unknown { + if (buf.length < HEADER_BYTES) throw new Error("short frame"); + const len = buf.readUInt32BE(0); + if (buf.length !== HEADER_BYTES + len) { + throw new Error( + `frame length mismatch: header=${len} buf=${buf.length - HEADER_BYTES}`, + ); + } + return JSON.parse(buf.subarray(HEADER_BYTES).toString("utf8")); +} diff --git a/packages/pty-daemon/src/protocol/index.ts b/packages/pty-daemon/src/protocol/index.ts new file mode 100644 index 00000000000..dd5bf0788e9 --- /dev/null +++ b/packages/pty-daemon/src/protocol/index.ts @@ -0,0 +1,26 @@ +export { decodeFrame, encodeFrame, FrameDecoder } from "./framing.ts"; +export type { + ClientMessage, + ClosedMessage, + CloseMessage, + ErrorMessage, + ExitMessage, + HelloAckMessage, + HelloMessage, + InputMessage, + ListMessage, + ListReplyMessage, + OpenMessage, + OpenOkMessage, + OutputMessage, + ResizeMessage, + ServerMessage, + SessionInfo, + SessionMeta, + SubscribeMessage, + UnsubscribeMessage, +} from "./messages.ts"; +export { + CURRENT_PROTOCOL_VERSION, + SUPPORTED_PROTOCOL_VERSIONS, +} from "./version.ts"; diff --git a/packages/pty-daemon/src/protocol/messages.ts b/packages/pty-daemon/src/protocol/messages.ts new file mode 100644 index 00000000000..dbb26d5f7e9 --- /dev/null +++ 
b/packages/pty-daemon/src/protocol/messages.ts @@ -0,0 +1,140 @@ +// Message schemas for the pty-daemon Unix socket protocol. +// +// Wire format: 4-byte big-endian length prefix + UTF-8 JSON payload. +// Binary data (PTY input/output) travels base64-encoded inside the JSON. +// See ../README.md and ../../../../apps/desktop/plans/20260429-pty-daemon-implementation.md + +export interface SessionMeta { + shell: string; + argv: string[]; + cwd?: string; + env?: Record<string, string>; + cols: number; + rows: number; +} + +export interface SessionInfo { + id: string; + pid: number; + cols: number; + rows: number; + alive: boolean; +} + +// ---------- Handshake ---------- + +export interface HelloMessage { + type: "hello"; + protocols: number[]; + clientVersion?: string; +} + +export interface HelloAckMessage { + type: "hello-ack"; + protocol: number; + daemonVersion: string; +} + +// ---------- Client -> Daemon ---------- + +export interface OpenMessage { + type: "open"; + id: string; + meta: SessionMeta; +} + +export interface InputMessage { + type: "input"; + id: string; + /** base64-encoded bytes */ + data: string; +} + +export interface ResizeMessage { + type: "resize"; + id: string; + cols: number; + rows: number; +} + +export interface CloseMessage { + type: "close"; + id: string; + signal?: "SIGINT" | "SIGTERM" | "SIGKILL" | "SIGHUP"; +} + +export interface ListMessage { + type: "list"; +} + +export interface SubscribeMessage { + type: "subscribe"; + id: string; + /** if true, replay buffered output before live streaming */ + replay: boolean; +} + +export interface UnsubscribeMessage { + type: "unsubscribe"; + id: string; +} + +// ---------- Daemon -> Client ---------- + +export interface OpenOkMessage { + type: "open-ok"; + id: string; + pid: number; +} + +export interface OutputMessage { + type: "output"; + id: string; + /** base64-encoded bytes */ + data: string; +} + +export interface ExitMessage { + type: "exit"; + id: string; + code: number | null; + signal: number |
null; +} + +export interface ClosedMessage { + type: "closed"; + id: string; +} + +export interface ListReplyMessage { + type: "list-reply"; + sessions: SessionInfo[]; +} + +export interface ErrorMessage { + type: "error"; + id?: string; + message: string; + code?: string; +} + +// ---------- Unions ---------- + +export type ClientMessage = + | HelloMessage + | OpenMessage + | InputMessage + | ResizeMessage + | CloseMessage + | ListMessage + | SubscribeMessage + | UnsubscribeMessage; + +export type ServerMessage = + | HelloAckMessage + | OpenOkMessage + | OutputMessage + | ExitMessage + | ClosedMessage + | ListReplyMessage + | ErrorMessage; diff --git a/packages/pty-daemon/src/protocol/version.ts b/packages/pty-daemon/src/protocol/version.ts new file mode 100644 index 00000000000..350a807f332 --- /dev/null +++ b/packages/pty-daemon/src/protocol/version.ts @@ -0,0 +1,4 @@ +// Protocol versioning. Increment on breaking changes; add to SUPPORTED list +// while we still need to interop with the previous major during rollouts. +export const CURRENT_PROTOCOL_VERSION = 1 as const; +export const SUPPORTED_PROTOCOL_VERSIONS: readonly number[] = [1]; diff --git a/packages/pty-daemon/test/integration.test.ts b/packages/pty-daemon/test/integration.test.ts new file mode 100644 index 00000000000..eb2d4a5889b --- /dev/null +++ b/packages/pty-daemon/test/integration.test.ts @@ -0,0 +1,248 @@ +// End-to-end integration test for pty-daemon. +// +// Runs under Node (`node --experimental-strip-types --test`), not Bun, because +// node-pty's master fd handling depends on Node's tty.ReadStream behavior. +// +// Spawns a daemon in-process, connects a TCP-style client to its Unix socket, +// runs through the protocol: hello → open → subscribe(replay) → input → +// receive output → close → exit. 
+ +import { strict as assert } from "node:assert"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, test } from "node:test"; +import { + encodeFrame, + FrameDecoder, + type ServerMessage, +} from "../src/protocol/index.ts"; +import { Server } from "../src/Server/index.ts"; + +const sockPath = path.join(os.tmpdir(), `pty-daemon-test-${process.pid}.sock`); +let server: Server; + +before(async () => { + server = new Server({ socketPath: sockPath, daemonVersion: "0.0.0-test" }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +interface Client { + socket: net.Socket; + messages: ServerMessage[]; + waitFor: ( + predicate: (m: ServerMessage) => boolean, + ms?: number, + ) => Promise<ServerMessage>; + send: (m: unknown) => void; + close: () => Promise<void>; +} + +function connect(): Promise<Client> { + return new Promise((resolve, reject) => { + const socket = net.createConnection({ path: sockPath }); + const decoder = new FrameDecoder(); + const messages: ServerMessage[] = []; + const waiters: Array<{ + predicate: (m: ServerMessage) => boolean; + resolve: (m: ServerMessage) => void; + reject: (e: Error) => void; + timer: NodeJS.Timeout; + }> = []; + + socket.on("data", (chunk) => { + decoder.push(chunk); + for (const raw of decoder.drain()) { + const m = raw as ServerMessage; + messages.push(m); + for (let i = waiters.length - 1; i >= 0; i--) { + const w = waiters[i]; + if (w?.predicate(m)) { + clearTimeout(w.timer); + waiters.splice(i, 1); + w.resolve(m); + } + } + } + }); + + socket.once("error", reject); + socket.once("connect", () => + resolve({ + socket, + messages, + waitFor(predicate, ms = 5000) { + return new Promise((res, rej) => { + const found = messages.find(predicate); + if (found) return res(found); + const timer = setTimeout(() => { + rej(new Error(`waitFor timed out after ${ms}ms`)); + }, ms); + waiters.push({ predicate, resolve: res, reject: rej, timer }); + }); + }, +
send(m) { + socket.write(encodeFrame(m)); + }, + close() { + return new Promise((res) => { + socket.end(() => res()); + }); + }, + }), + ); + }); +} + +test("handshake: hello → hello-ack", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [1] }); + const ack = await c.waitFor((m) => m.type === "hello-ack"); + assert.equal(ack.type, "hello-ack"); + if (ack.type === "hello-ack") { + assert.equal(ack.protocol, 1); + assert.equal(ack.daemonVersion, "0.0.0-test"); + } + await c.close(); +}); + +test("incompatible protocol → error and disconnect", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [99] }); + const err = await c.waitFor((m) => m.type === "error"); + assert.equal(err.type, "error"); + if (err.type === "error") assert.equal(err.code, "EVERSION"); + await c.close(); +}); + +test("open → subscribe → output → close lifecycle (real shell)", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); + + c.send({ + type: "open", + id: "s0", + meta: { + shell: "/bin/sh", + argv: ["-c", "echo daemon-integration; sleep 0.2"], + cols: 80, + rows: 24, + }, + }); + const opened = await c.waitFor((m) => m.type === "open-ok"); + assert.equal(opened.type, "open-ok"); + + c.send({ type: "subscribe", id: "s0", replay: true }); + + const output = await c.waitFor( + (m) => + m.type === "output" && + m.id === "s0" && + Buffer.from(m.data, "base64").toString().includes("daemon-integration"), + 3000, + ); + assert.ok(output.type === "output"); + + const exit = await c.waitFor((m) => m.type === "exit" && m.id === "s0", 3000); + assert.equal(exit.type, "exit"); + if (exit.type === "exit") assert.equal(exit.code, 0); + + await c.close(); +}); + +test("input is forwarded to the shell and echoed back via output", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); 
+ c.send({ + type: "open", + id: "s1", + meta: { shell: "/bin/sh", argv: ["-i"], cols: 80, rows: 24 }, + }); + await c.waitFor((m) => m.type === "open-ok"); + c.send({ type: "subscribe", id: "s1", replay: false }); + + // Send "echo abc-marker\n" — shell echoes the typed bytes back through the + // PTY (canonical mode), AND prints "abc-marker" as the command output. + c.send({ + type: "input", + id: "s1", + data: Buffer.from("echo abc-marker\n").toString("base64"), + }); + + await c.waitFor( + (m) => + m.type === "output" && + m.id === "s1" && + Buffer.from(m.data, "base64").toString().includes("abc-marker"), + 3000, + ); + + c.send({ type: "close", id: "s1", signal: "SIGTERM" }); + await c.waitFor((m) => m.type === "closed" && m.id === "s1"); + await c.close(); +}); + +test("subscribe with replay sends prior buffered output", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); + + c.send({ + type: "open", + id: "s2", + meta: { + shell: "/bin/sh", + argv: ["-c", "echo replay-test; sleep 1"], + cols: 80, + rows: 24, + }, + }); + await c.waitFor((m) => m.type === "open-ok"); + + // Wait a bit so the shell emits its initial output into the daemon's buffer + // without any subscriber yet. 
+ await new Promise((resolve) => setTimeout(resolve, 200)); + + c.send({ type: "subscribe", id: "s2", replay: true }); + const out = await c.waitFor( + (m) => + m.type === "output" && + m.id === "s2" && + Buffer.from(m.data, "base64").toString().includes("replay-test"), + 2000, + ); + assert.ok(out.type === "output"); + + c.send({ type: "close", id: "s2", signal: "SIGTERM" }); + await c.close(); +}); + +test("list returns active sessions", async () => { + const c = await connect(); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); + + c.send({ + type: "open", + id: "list-a", + meta: { shell: "/bin/sh", argv: ["-c", "sleep 5"], cols: 80, rows: 24 }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === "list-a"); + + c.send({ type: "list" }); + const list = await c.waitFor((m) => m.type === "list-reply"); + assert.equal(list.type, "list-reply"); + if (list.type === "list-reply") { + assert.ok(list.sessions.some((s) => s.id === "list-a")); + } + + c.send({ type: "close", id: "list-a", signal: "SIGTERM" }); + await c.waitFor((m) => m.type === "closed"); + await c.close(); +}); diff --git a/packages/pty-daemon/tsconfig.json b/packages/pty-daemon/tsconfig.json new file mode 100644 index 00000000000..555dc9e9cf9 --- /dev/null +++ b/packages/pty-daemon/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "@superset/typescript/internal-package.json", + "compilerOptions": { + "types": ["bun-types", "node"], + "noUncheckedIndexedAccess": true, + "allowImportingTsExtensions": true, + "emitDeclarationOnly": false, + "noEmit": true + }, + "include": ["src", "test"], + "exclude": ["node_modules", "dist"] +} From b8784ea722aa6c0efdac0ecfa0b50ce239959f3f Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 18:01:43 -0700 Subject: [PATCH 02/33] feat(pty-daemon): control-plane integration suite + production build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an exhaustive 
control-plane integration test that exercises every usage pattern host-service can throw at the daemon end-to-end (real shells, real Unix socket), plus the production build pipeline matching the host-service pattern. Test coverage (28 integration tests, all passing in ~2.5s): - Handshake variants (non-hello first, unsupported version, mutual picking, duplicate hello) - Session lifecycle (bad dims, duplicate id, ENOENT on missing, instant-exit, SIGKILL hung shell) - I/O patterns (resize during streaming, burst output, multi-byte UTF-8) - Multi-client fan-out (two subscribers, unsubscribe stops delivery, dropped subscriber doesn't crash) - Detach + reattach (late subscriber gets replay, full disconnect → new conn → continues live) - Hostile input (malformed frames, oversized frames, input on dead session) - Concurrency (20 sessions on one conn, 10 conns in parallel) - Server shutdown (in-flight clients disconnect cleanly) - Frame splitting across TCP chunks Reusable test client extracted to test/helpers/client.ts (waitFor, collect, sendRaw, onClose). Found and fixed during the suite: Server.close() now kills owned PTYs synchronously so the daemon process can actually exit (open master fds were keeping the event loop alive). Aligns with the v1-lessons "synchronous teardown only" rule. Production build: build.ts mirrors packages/host-service/build.ts — Bun.build target=node, externalizes node-pty, emits dist/pty-daemon.js that runs under Electron's bundled Node via process.execPath. No new runtime in the desktop bundle. Bun is build-only; same shape as host-service today. 
--- packages/pty-daemon/README.md | 48 +- packages/pty-daemon/build.ts | 32 + packages/pty-daemon/package.json | 3 +- packages/pty-daemon/src/Server/Server.ts | 10 + .../pty-daemon/test/control-plane.test.ts | 734 ++++++++++++++++++ packages/pty-daemon/test/helpers/client.ts | 151 ++++ packages/pty-daemon/test/integration.test.ts | 211 +---- 7 files changed, 997 insertions(+), 192 deletions(-) create mode 100644 packages/pty-daemon/build.ts create mode 100644 packages/pty-daemon/test/control-plane.test.ts create mode 100644 packages/pty-daemon/test/helpers/client.ts diff --git a/packages/pty-daemon/README.md b/packages/pty-daemon/README.md index 97c10567904..b2f169cdd85 100644 --- a/packages/pty-daemon/README.md +++ b/packages/pty-daemon/README.md @@ -10,10 +10,22 @@ via `@superset/pty-daemon/protocol`. ## Runtime -**Node ≥ 20**, not Bun. node-pty's master fd handling is incompatible with -Bun's `tty.ReadStream` (verified: Bun 1.3, node-pty 1.1 — onData/onExit -silently never fire). The daemon ships as a Node script in the desktop app -bundle; host-service can stay on Bun. +**Production: Node ≥ 20** (Electron's bundled Node), via +`process.execPath` — exactly the same pattern as `host-service` already +uses today (`packages/host-service/build.ts` → `dist/host-service.js`, +spawned by `apps/desktop/src/main/lib/host-service-coordinator.ts`). +Bun is the build tool, not a runtime. **No new runtime in the desktop +app bundle.** + +**Why not Bun at runtime:** verified during development that node-pty +1.1's master fd handling is incompatible with Bun 1.3 (`tty.ReadStream` +closes immediately, alternate `fs.createReadStream(null, { fd })` +returns EAGAIN with no recovery). The daemon needs a runtime where +node-pty actually works. + +**Dev:** unit tests run under Bun (`bun test`) for speed; integration +tests run under Node (`bun run test:integration`) since they touch real +PTYs. The daemon binary itself runs under Node in both dev and prod. 
## Layout @@ -40,7 +52,12 @@ src/ └── index.ts test/ -└── integration.test.ts # node --test: real shells, real socket +├── helpers/ +│ └── client.ts # reusable DaemonClient: connect, send, waitFor, collect +├── integration.test.ts # smoke / happy-path (3 tests) +└── control-plane.test.ts # exhaustive control-plane coverage (25 tests) + +build.ts # Bun bundler → dist/pty-daemon.js (target: node) ``` ## Design notes @@ -60,10 +77,27 @@ test/ ## Testing ```sh -bun test # unit tests (protocol, handlers, SessionStore, Pty validation) -bun run test:integration # end-to-end via node --test (spawns real shells) +bun test # 24 unit tests (protocol framing, handlers, SessionStore, Pty validation) +bun run test:integration # 28 integration tests under node --test: + # - test/integration.test.ts (smoke / happy-path, 3 tests) + # - test/control-plane.test.ts (every usage pattern, 25 tests) +bun run typecheck # tsc --noEmit +bun run build:daemon # bundle src/main.ts → dist/pty-daemon.js (target: node) ``` +**Control-plane coverage** (`test/control-plane.test.ts`): + +- Handshake: rejects non-hello first, picks highest mutual protocol, rejects unsupported, rejects duplicate hello. +- Session lifecycle: invalid dims, duplicate ids, ENOENT on missing, instant-exit shells, SIGKILL on hung shells. +- I/O patterns: resize during running shell, burst output (200 lines), multi-byte UTF-8 (🚀). +- Multi-client fan-out: two subscribers see same output, unsubscribe stops further delivery, dropped subscriber doesn't crash daemon. +- Detach + reattach (the headline feature): late subscriber gets replay, full reattach cycle continues live after disconnect. +- list reflects active sessions with cols/rows/alive. +- Hostile input: malformed frames disconnect cleanly, oversized frames are rejected, input on exited session returns EEXITED. +- Concurrency: 20 sessions in parallel from one connection, 10 connections opening sessions in parallel. 
+- Server shutdown: in-flight clients disconnect cleanly, owned PTYs are killed. +- Framing: tolerates split frames across multiple TCP chunks. + Why two runners? `bun test` is fast for pure-JS work. node-pty doesn't work under Bun, so anything that spawns a real PTY runs under Node. diff --git a/packages/pty-daemon/build.ts b/packages/pty-daemon/build.ts new file mode 100644 index 00000000000..ff7613a8b13 --- /dev/null +++ b/packages/pty-daemon/build.ts @@ -0,0 +1,32 @@ +/** + * Bundles the pty-daemon entry point into a single JS file executable by a + * standalone Node.js runtime (matches packages/host-service/build.ts). Native + * addons (node-pty) are marked external and resolved from the desktop app's + * lib/native/ at runtime. + * + * Production: Electron spawns the daemon via process.execPath (its bundled + * Node), exactly like host-service. No Bun in the production bundle. + */ +import { existsSync, mkdirSync } from "node:fs"; + +const outdir = "dist"; +if (!existsSync(outdir)) { + mkdirSync(outdir, { recursive: true }); +} + +const result = await Bun.build({ + entrypoints: ["src/main.ts"], + target: "node", + outdir, + naming: "pty-daemon.js", + format: "esm", + external: ["node-pty"], +}); + +if (!result.success) { + console.error("[pty-daemon] build failed:"); + for (const log of result.logs) { + console.error(log); + } + process.exit(1); +} diff --git a/packages/pty-daemon/package.json b/packages/pty-daemon/package.json index b5193c440ab..695a085aadb 100644 --- a/packages/pty-daemon/package.json +++ b/packages/pty-daemon/package.json @@ -22,9 +22,10 @@ "scripts": { "clean": "git clean -xdf .cache .turbo dist node_modules", "start": "node --experimental-strip-types src/main.ts", + "build:daemon": "bun run build.ts", "typecheck": "tsc --noEmit --emitDeclarationOnly false", "test": "bun test src/protocol src/SessionStore src/handlers src/Pty/Pty.test.ts", - "test:integration": "node --experimental-strip-types --test test/integration.test.ts" + 
"test:integration": "node --experimental-strip-types --test test/integration.test.ts test/control-plane.test.ts" }, "dependencies": { "node-pty": "1.1.0" diff --git a/packages/pty-daemon/src/Server/Server.ts b/packages/pty-daemon/src/Server/Server.ts index 635a5667acc..3395c54502e 100644 --- a/packages/pty-daemon/src/Server/Server.ts +++ b/packages/pty-daemon/src/Server/Server.ts @@ -70,6 +70,16 @@ export class Server { async close(): Promise { for (const c of this.conns) c.socket.destroy(); this.conns.clear(); + // Kill all owned PTYs so the daemon process can actually exit (open + // master fds keep the event loop alive). This is what the v1 lessons + // call "synchronous teardown only" — no setTimeout, no graceful drain. + for (const session of this.store.all()) { + try { + session.pty.kill("SIGKILL"); + } catch { + // already dead, ignore + } + } await new Promise((resolve) => this.server.close(() => resolve())); try { fs.unlinkSync(this.opts.socketPath); diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts new file mode 100644 index 00000000000..659ecbdb55a --- /dev/null +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -0,0 +1,734 @@ +// Comprehensive control-plane test for pty-daemon. Each test exercises a +// real daemon over a real Unix socket and walks through one usage pattern +// end-to-end. Together these cover every usage shape host-service can throw +// at the daemon: handshake variants, session lifecycle, I/O patterns, +// multi-client subscribe/replay/unsubscribe, detach+reattach, malformed +// input, late subscribers, concurrent N sessions, shutdown. +// +// Runs under Node (`node --experimental-strip-types --test`). 
+ +import { strict as assert } from "node:assert"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, describe, test } from "node:test"; +import { encodeFrame } from "../src/protocol/index.ts"; +import { Server } from "../src/Server/index.ts"; +import { connect, connectAndHello } from "./helpers/client.ts"; + +const sockPath = path.join( + os.tmpdir(), + `pty-daemon-control-${process.pid}.sock`, +); +let server: Server; + +before(async () => { + server = new Server({ + socketPath: sockPath, + daemonVersion: "0.0.0-control", + bufferCap: 8 * 1024, + }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +const SH = "/bin/sh"; +const baseMeta = { + shell: SH, + argv: ["-c", "echo ready; sleep 5"] as string[], + cols: 80, + rows: 24, +}; + +function uniqueId(prefix: string): string { + return `${prefix}-${Math.random().toString(36).slice(2, 8)}`; +} + +// ---------------- Handshake ---------------- + +describe("handshake", () => { + test("rejects non-hello first message", async () => { + const c = await connect(sockPath); + c.send({ type: "list" }); + const err = await c.waitFor((m) => m.type === "error", 1000); + assert.equal(err.type, "error"); + await c.close(); + }); + + test("rejects unsupported protocol versions", async () => { + const c = await connect(sockPath); + c.send({ type: "hello", protocols: [99, 100] }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "EVERSION"); + await c.close(); + }); + + test("picks highest mutual when multiple offered", async () => { + const c = await connect(sockPath); + c.send({ type: "hello", protocols: [1, 99] }); + const ack = await c.waitFor((m) => m.type === "hello-ack"); + if (ack.type === "hello-ack") assert.equal(ack.protocol, 1); + await c.close(); + }); + + test("rejects duplicate hello", async () => { + const c = await connectAndHello(sockPath); + c.send({ type: "hello", 
protocols: [1] }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") { + assert.match(err.message, /duplicate hello/); + } + await c.close(); + }); +}); + +// ---------------- Session lifecycle ---------------- + +describe("session lifecycle", () => { + test("rejects open with bad cols/rows", async () => { + const c = await connectAndHello(sockPath); + c.send({ + type: "open", + id: uniqueId("badspawn"), + meta: { ...baseMeta, cols: 0 }, + }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "ESPAWN"); + await c.close(); + }); + + test("rejects duplicate session id", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("dup"); + c.send({ type: "open", id, meta: baseMeta }); + await c.waitFor((m) => m.type === "open-ok"); + c.send({ type: "open", id, meta: baseMeta }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "EEXIST"); + c.send({ type: "close", id }); + await c.close(); + }); + + test("input/resize/close on missing session return ENOENT", async () => { + const c = await connectAndHello(sockPath); + const missing = "missing-no-such"; + + c.send({ type: "input", id: missing, data: "" }); + const e1 = await c.waitFor((m) => m.type === "error", 1000); + if (e1.type === "error") assert.equal(e1.code, "ENOENT"); + + c.send({ type: "resize", id: missing, cols: 80, rows: 24 }); + const e2 = await c.waitFor((m) => m.type === "error" && m !== e1, 1000); + if (e2.type === "error") assert.equal(e2.code, "ENOENT"); + + c.send({ type: "close", id: missing }); + const e3 = await c.waitFor( + (m) => m.type === "error" && m !== e1 && m !== e2, + 1000, + ); + if (e3.type === "error") assert.equal(e3.code, "ENOENT"); + await c.close(); + }); + + test("instant-exit shell still produces an exit message", async () => { + const c = await connectAndHello(sockPath); + const id = 
uniqueId("instant"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "true"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + const exit = await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + if (exit.type === "exit") assert.equal(exit.code, 0); + await c.close(); + }); + + test("close with SIGKILL terminates a hung shell", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("hung"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 60"] }, + }); + const ok = await c.waitFor((m) => m.type === "open-ok" && m.id === id); + if (ok.type !== "open-ok") throw new Error("no open-ok"); + + c.send({ type: "subscribe", id, replay: false }); + c.send({ type: "close", id, signal: "SIGKILL" }); + await c.waitFor((m) => m.type === "closed" && m.id === id); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); +}); + +// ---------------- I/O patterns ---------------- + +describe("I/O patterns", () => { + test("resize during a running shell does not break stream", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("resize"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + c.send({ type: "resize", id, cols: 120, rows: 40 }); + c.send({ + type: "input", + id, + data: Buffer.from("echo post-resize-marker\n").toString("base64"), + }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("post-resize-marker"), + 3000, + ); + + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }); + + test("burst output (high-rate stdout) is delivered and ring-capped", async () => { + const c = await connectAndHello(sockPath); + const id = 
uniqueId("burst"); + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: [ + "-c", + "for i in $(seq 1 200); do echo BURST:$i; done; sleep 0.5", + ], + }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + // Wait until we see the last marker, confirming live delivery. + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("BURST:200"), + 5000, + ); + await c.waitFor((m) => m.type === "exit" && m.id === id, 5000); + await c.close(); + }); + + test("multi-byte UTF-8 output round-trips", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("utf8"); + // 🚀 = 0xF0 0x9F 0x9A 0x80 + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "printf 'rocket: \\xf0\\x9f\\x9a\\x80\\n'; sleep 0.1"], + }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("🚀"), + 3000, + ); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); +}); + +// ---------------- Multi-client subscribe / fan-out ---------------- + +describe("multi-client fan-out", () => { + test("two subscribers both receive the same output", async () => { + const a = await connectAndHello(sockPath); + const b = await connectAndHello(sockPath); + const id = uniqueId("fanout"); + + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo fanout-marker; sleep 0.5"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + b.send({ type: "subscribe", id, replay: false }); + + await Promise.all([ + a.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, 
"base64").toString().includes("fanout-marker"), + 3000, + ), + b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("fanout-marker"), + 3000, + ), + ]); + + await Promise.all([a.close(), b.close()]); + }); + + test("unsubscribe stops further output to that connection", async () => { + const a = await connectAndHello(sockPath); + const b = await connectAndHello(sockPath); + const id = uniqueId("unsub"); + + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + b.send({ type: "subscribe", id, replay: false }); + + // First marker — both should see it. + a.send({ + type: "input", + id, + data: Buffer.from("echo first-marker\n").toString("base64"), + }); + await Promise.all([ + a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("first-marker"), + 3000, + ), + b.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("first-marker"), + 3000, + ), + ]); + + // b unsubscribes; a is still subscribed. + b.send({ type: "unsubscribe", id }); + // Small settle so the unsubscribe lands before the next emit. 
+ await new Promise((r) => setTimeout(r, 100)); + + const bAfterUnsub = b.collect( + (m) => m.type === "output" && m.id === id, + 500, + ); + + a.send({ + type: "input", + id, + data: Buffer.from("echo second-marker\n").toString("base64"), + }); + await a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("second-marker"), + 3000, + ); + + const bMessages = await bAfterUnsub; + const sawSecondOnB = bMessages.some( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("second-marker"), + ); + assert.equal(sawSecondOnB, false); + + a.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([a.close(), b.close()]); + }); + + test("subscriber connection drop doesn't crash daemon; other clients keep streaming", async () => { + const owner = await connectAndHello(sockPath); + const dropper = await connectAndHello(sockPath); + const observer = await connectAndHello(sockPath); + const id = uniqueId("dropcrash"); + + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + dropper.send({ type: "subscribe", id, replay: false }); + observer.send({ type: "subscribe", id, replay: false }); + + // Force-close the dropper without unsubscribing. 
+ dropper.socket.destroy(); + + owner.send({ + type: "input", + id, + data: Buffer.from("echo survives-drop\n").toString("base64"), + }); + await observer.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("survives-drop"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), observer.close()]); + }); +}); + +// ---------------- Detach + reattach (the headline feature) ---------------- + +describe("detach + reattach", () => { + test("late subscriber gets prior output via replay", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("late"); + + owner.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "echo early-marker; sleep 1"], + }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + // Wait for output to be buffered without any subscriber. + await new Promise((r) => setTimeout(r, 200)); + + const late = await connectAndHello(sockPath); + late.send({ type: "subscribe", id, replay: true }); + await late.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("early-marker"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), late.close()]); + }); + + test("reattach cycle: subscribe → disconnect → new conn subscribes-with-replay → continues live", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("reattach"); + + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + const first = await connectAndHello(sockPath); + first.send({ type: "subscribe", id, replay: false }); + + // Generate some output via input. 
+ owner.send({ + type: "input", + id, + data: Buffer.from("echo before-reattach\n").toString("base64"), + }); + await first.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("before-reattach"), + 3000, + ); + + // Disconnect the first client. PTY keeps running. + await first.close(); + + // New client connects, asks for replay, and sends another input. + const second = await connectAndHello(sockPath); + second.send({ type: "subscribe", id, replay: true }); + // Replay should arrive immediately containing the prior output. + await second.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("before-reattach"), + 2000, + ); + + owner.send({ + type: "input", + id, + data: Buffer.from("echo after-reattach\n").toString("base64"), + }); + await second.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("after-reattach"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([owner.close(), second.close()]); + }); +}); + +// ---------------- list ---------------- + +describe("list", () => { + test("reflects active sessions", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("listed"); + c.send({ type: "open", id, meta: baseMeta }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + + c.send({ type: "list" }); + const reply = await c.waitFor((m) => m.type === "list-reply"); + assert.equal(reply.type, "list-reply"); + if (reply.type === "list-reply") { + const found = reply.sessions.find((s) => s.id === id); + assert.ok(found, "session should appear in list"); + assert.equal(found?.cols, 80); + assert.equal(found?.rows, 24); + assert.equal(found?.alive, true); + } + + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }); +}); + +// ---------------- Malformed / abusive input ---------------- + +describe("hostile 
input", () => { + test("non-JSON in a frame disconnects the client; daemon survives", async () => { + const owner = await connectAndHello(sockPath); + const id = uniqueId("survive"); + owner.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await owner.waitFor((m) => m.type === "open-ok" && m.id === id); + + // Hostile client sends a length-prefixed buffer of garbage that isn't JSON. + const bad = await connect(sockPath); + const garbage = Buffer.from("\x00\x00\x00\x05NOT{}"); + bad.sendRaw(garbage); + // Server should disconnect this conn cleanly. + await new Promise((res) => bad.onClose(res)); + + // Owner is still functional. + owner.send({ type: "subscribe", id, replay: false }); + owner.send({ + type: "input", + id, + data: Buffer.from("echo still-alive\n").toString("base64"), + }); + await owner.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("still-alive"), + 3000, + ); + + owner.send({ type: "close", id, signal: "SIGTERM" }); + await owner.close(); + }); + + test("oversized frame header (> 8 MB cap) disconnects; daemon survives", async () => { + const bad = await connect(sockPath); + const hugeHeader = Buffer.alloc(4); + hugeHeader.writeUInt32BE(20 * 1024 * 1024, 0); + bad.sendRaw(hugeHeader); + await new Promise((res) => bad.onClose(res)); + + // Daemon is still accepting connections. 
+ const c = await connectAndHello(sockPath); + c.send({ type: "list" }); + await c.waitFor((m) => m.type === "list-reply", 1000); + await c.close(); + }); + + test("input on already-exited session returns EEXITED", async () => { + const c = await connectAndHello(sockPath); + const id = uniqueId("dead"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "true"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + + c.send({ + type: "input", + id, + data: Buffer.from("ignored").toString("base64"), + }); + const err = await c.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "EEXITED"); + await c.close(); + }); +}); + +// ---------------- Concurrency stress ---------------- + +describe("concurrency", () => { + test("20 sessions opened and streaming concurrently", async () => { + const c = await connectAndHello(sockPath); + const N = 20; + const ids = Array.from({ length: N }, (_, i) => uniqueId(`conc-${i}`)); + + // Open all sessions. Use a workload that runs long enough to outlast + // the open+subscribe round-trip on a busy machine — the spawns happen + // in parallel, but `subscribe replay:false` would race exits otherwise. + for (const id of ids) { + c.send({ + type: "open", + id, + meta: { + ...baseMeta, + argv: ["-c", "echo TICK:start; sleep 0.5; echo TICK:end"], + }, + }); + } + + // Wait for all open-oks. + const openIds = new Set(); + while (openIds.size < N) { + const m = await c.waitFor( + (m) => m.type === "open-ok" && !openIds.has(m.id), + 10_000, + ); + if (m.type === "open-ok") openIds.add(m.id); + } + assert.equal(openIds.size, N); + + // Subscribe with replay so even sessions whose first output landed before + // our subscribe arrives are still surfaced. 
+ for (const id of ids) c.send({ type: "subscribe", id, replay: true }); + + // Wait for the start marker from each session. + const seen = new Set(); + while (seen.size < N) { + const m = await c.waitFor( + (m) => + m.type === "output" && + !seen.has(m.id) && + ids.includes(m.id) && + Buffer.from(m.data, "base64").toString().includes("TICK:start"), + 10_000, + ); + if (m.type === "output") seen.add(m.id); + } + assert.equal(seen.size, N); + + // Wait for all to exit. + const exited = new Set(); + while (exited.size < N) { + const m = await c.waitFor( + (m) => m.type === "exit" && !exited.has(m.id) && ids.includes(m.id), + 10_000, + ); + if (m.type === "exit") exited.add(m.id); + } + + await c.close(); + }); + + test("multiple connections opening sessions in parallel", async () => { + const N = 10; + const conns = await Promise.all( + Array.from({ length: N }, () => connectAndHello(sockPath)), + ); + + await Promise.all( + conns.map(async (c, i) => { + const id = uniqueId(`parallel-${i}`); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", `echo CONN:${i}; sleep 0.2`] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id, 5000); + c.send({ type: "subscribe", id, replay: true }); + await c.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes(`CONN:${i}`), + 5000, + ); + c.send({ type: "close", id, signal: "SIGTERM" }); + await c.close(); + }), + ); + }); +}); + +// ---------------- Server shutdown ---------------- + +describe("server shutdown", () => { + test("disconnects active clients cleanly via close()", async () => { + // Use a *separate* short-lived server so we don't tear down the suite's main one. 
+ const localPath = path.join( + os.tmpdir(), + `pty-daemon-shutdown-${process.pid}-${Date.now()}.sock`, + ); + const local = new Server({ + socketPath: localPath, + daemonVersion: "0.0.0-local", + }); + await local.listen(); + + const c = await connectAndHello(localPath); + const id = uniqueId("shutdown"); + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 60"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + + const closeWaiter = new Promise((res) => c.onClose(res)); + await local.close(); + // Server.close() destroys all connections. + await closeWaiter; + assert.equal(c.closed(), true); + }); +}); + +// ---------------- Frame-level encoding sanity ---------------- + +describe("framing on the wire", () => { + test("server tolerates split frames across multiple TCP chunks", async () => { + const c = await connect(sockPath); + const hello = encodeFrame({ type: "hello", protocols: [1] }); + // Send the hello in 3-byte chunks to force the decoder to buffer. + for (let i = 0; i < hello.length; i += 3) { + c.sendRaw(hello.subarray(i, Math.min(i + 3, hello.length))); + await new Promise((r) => setTimeout(r, 1)); + } + await c.waitFor((m) => m.type === "hello-ack", 1000); + await c.close(); + }); +}); diff --git a/packages/pty-daemon/test/helpers/client.ts b/packages/pty-daemon/test/helpers/client.ts new file mode 100644 index 00000000000..3a1b548bebe --- /dev/null +++ b/packages/pty-daemon/test/helpers/client.ts @@ -0,0 +1,151 @@ +// Reusable test client for pty-daemon integration tests. +// Speaks the daemon's wire protocol over a Unix socket. 
+ +import * as net from "node:net"; +import { + encodeFrame, + FrameDecoder, + type ServerMessage, +} from "../../src/protocol/index.ts"; + +export interface DaemonClient { + socket: net.Socket; + messages: ServerMessage[]; + send(m: unknown): void; + waitFor( + predicate: (m: ServerMessage) => boolean, + ms?: number, + ): Promise; + collect( + predicate: (m: ServerMessage) => boolean, + ms: number, + ): Promise; + sendRaw(buf: Buffer): void; + close(): Promise; + closed(): boolean; + onClose(cb: () => void): void; +} + +interface Waiter { + predicate: (m: ServerMessage) => boolean; + resolve: (m: ServerMessage) => void; + reject: (e: Error) => void; + timer: NodeJS.Timeout; +} + +export function connect(socketPath: string): Promise { + return new Promise((resolve, reject) => { + const socket = net.createConnection({ path: socketPath }); + const decoder = new FrameDecoder(); + const messages: ServerMessage[] = []; + const waiters: Waiter[] = []; + const closeCbs: Array<() => void> = []; + let isClosed = false; + + socket.on("data", (chunk) => { + try { + decoder.push(chunk); + for (const raw of decoder.drain()) { + const m = raw as ServerMessage; + messages.push(m); + for (let i = waiters.length - 1; i >= 0; i--) { + const w = waiters[i]; + if (w?.predicate(m)) { + clearTimeout(w.timer); + waiters.splice(i, 1); + w.resolve(m); + } + } + } + } catch (err) { + // Surface frame errors to any pending waiter. 
+ for (const w of waiters) { + clearTimeout(w.timer); + w.reject(err as Error); + } + waiters.length = 0; + } + }); + + socket.on("close", () => { + isClosed = true; + for (const cb of closeCbs) cb(); + }); + socket.once("error", reject); + socket.once("connect", () => { + socket.off("error", reject); + resolve({ + socket, + messages, + send(m) { + if (!socket.destroyed) socket.write(encodeFrame(m)); + }, + sendRaw(buf) { + if (!socket.destroyed) socket.write(buf); + }, + waitFor(predicate, ms = 5000) { + return new Promise((res, rej) => { + const found = messages.find(predicate); + if (found) return res(found); + const timer = setTimeout(() => { + const i = waiters.findIndex((w) => w.predicate === predicate); + if (i >= 0) waiters.splice(i, 1); + rej(new Error(`waitFor timed out after ${ms}ms`)); + }, ms); + waiters.push({ predicate, resolve: res, reject: rej, timer }); + }); + }, + collect(predicate, ms) { + return new Promise((res) => { + const collected: ServerMessage[] = messages.filter(predicate); + const onMsg = (chunk: Buffer) => { + void chunk; + for (let i = collected.length; i < messages.length; i++) { + const m = messages[i]; + if (m && predicate(m)) collected.push(m); + } + }; + socket.on("data", onMsg); + setTimeout(() => { + socket.off("data", onMsg); + // Final sweep in case of late drains. + for (let i = collected.length; i < messages.length; i++) { + const m = messages[i]; + if (m && predicate(m)) collected.push(m); + } + res(collected); + }, ms); + }); + }, + close() { + return new Promise((res) => { + if (socket.destroyed) return res(); + socket.end(() => res()); + // Fall back: if `end` doesn't fire close within 200ms, force. + setTimeout(() => { + if (!socket.destroyed) socket.destroy(); + res(); + }, 200); + }); + }, + closed() { + return isClosed; + }, + onClose(cb) { + if (isClosed) cb(); + else closeCbs.push(cb); + }, + }); + }); + }); +} + +/** Convenience: connect and complete the v1 handshake. 
*/ +export async function connectAndHello( + socketPath: string, +): Promise { + const c = await connect(socketPath); + c.send({ type: "hello", protocols: [1] }); + await c.waitFor((m) => m.type === "hello-ack"); + return c; +} diff --git a/packages/pty-daemon/test/integration.test.ts b/packages/pty-daemon/test/integration.test.ts index eb2d4a5889b..4d4ee962f16 100644 --- a/packages/pty-daemon/test/integration.test.ts +++ b/packages/pty-daemon/test/integration.test.ts @@ -1,25 +1,16 @@ -// End-to-end integration test for pty-daemon. +// Smoke / happy-path integration test for pty-daemon. // -// Runs under Node (`node --experimental-strip-types --test`), not Bun, because -// node-pty's master fd handling depends on Node's tty.ReadStream behavior. -// -// Spawns a daemon in-process, connects a TCP-style client to its Unix socket, -// runs through the protocol: hello → open → subscribe(replay) → input → -// receive output → close → exit. +// Runs under Node (`node --experimental-strip-types --test`); see +// test/control-plane.test.ts for the exhaustive control-plane scenarios. 
import { strict as assert } from "node:assert"; -import * as net from "node:net"; import * as os from "node:os"; import * as path from "node:path"; import { after, before, test } from "node:test"; -import { - encodeFrame, - FrameDecoder, - type ServerMessage, -} from "../src/protocol/index.ts"; import { Server } from "../src/Server/index.ts"; +import { connect, connectAndHello } from "./helpers/client.ts"; -const sockPath = path.join(os.tmpdir(), `pty-daemon-test-${process.pid}.sock`); +const sockPath = path.join(os.tmpdir(), `pty-daemon-smoke-${process.pid}.sock`); let server: Server; before(async () => { @@ -31,75 +22,8 @@ after(async () => { await server.close(); }); -interface Client { - socket: net.Socket; - messages: ServerMessage[]; - waitFor: ( - predicate: (m: ServerMessage) => boolean, - ms?: number, - ) => Promise; - send: (m: unknown) => void; - close: () => Promise; -} - -function connect(): Promise { - return new Promise((resolve, reject) => { - const socket = net.createConnection({ path: sockPath }); - const decoder = new FrameDecoder(); - const messages: ServerMessage[] = []; - const waiters: Array<{ - predicate: (m: ServerMessage) => boolean; - resolve: (m: ServerMessage) => void; - reject: (e: Error) => void; - timer: NodeJS.Timeout; - }> = []; - - socket.on("data", (chunk) => { - decoder.push(chunk); - for (const raw of decoder.drain()) { - const m = raw as ServerMessage; - messages.push(m); - for (let i = waiters.length - 1; i >= 0; i--) { - const w = waiters[i]; - if (w?.predicate(m)) { - clearTimeout(w.timer); - waiters.splice(i, 1); - w.resolve(m); - } - } - } - }); - - socket.once("error", reject); - socket.once("connect", () => - resolve({ - socket, - messages, - waitFor(predicate, ms = 5000) { - return new Promise((res, rej) => { - const found = messages.find(predicate); - if (found) return res(found); - const timer = setTimeout(() => { - rej(new Error(`waitFor timed out after ${ms}ms`)); - }, ms); - waiters.push({ predicate, resolve: res, 
reject: rej, timer }); - }); - }, - send(m) { - socket.write(encodeFrame(m)); - }, - close() { - return new Promise((res) => { - socket.end(() => res()); - }); - }, - }), - ); - }); -} - test("handshake: hello → hello-ack", async () => { - const c = await connect(); + const c = await connect(sockPath); c.send({ type: "hello", protocols: [1] }); const ack = await c.waitFor((m) => m.type === "hello-ack"); assert.equal(ack.type, "hello-ack"); @@ -110,139 +34,58 @@ test("handshake: hello → hello-ack", async () => { await c.close(); }); -test("incompatible protocol → error and disconnect", async () => { - const c = await connect(); - c.send({ type: "hello", protocols: [99] }); - const err = await c.waitFor((m) => m.type === "error"); - assert.equal(err.type, "error"); - if (err.type === "error") assert.equal(err.code, "EVERSION"); - await c.close(); -}); - -test("open → subscribe → output → close lifecycle (real shell)", async () => { - const c = await connect(); - c.send({ type: "hello", protocols: [1] }); - await c.waitFor((m) => m.type === "hello-ack"); - +test("open → subscribe → output → exit lifecycle", async () => { + const c = await connectAndHello(sockPath); c.send({ type: "open", - id: "s0", + id: "smoke-0", meta: { shell: "/bin/sh", - argv: ["-c", "echo daemon-integration; sleep 0.2"], + argv: ["-c", "echo daemon-smoke; sleep 0.2"], cols: 80, rows: 24, }, }); - const opened = await c.waitFor((m) => m.type === "open-ok"); - assert.equal(opened.type, "open-ok"); - - c.send({ type: "subscribe", id: "s0", replay: true }); + await c.waitFor((m) => m.type === "open-ok" && m.id === "smoke-0"); + c.send({ type: "subscribe", id: "smoke-0", replay: true }); - const output = await c.waitFor( + await c.waitFor( (m) => m.type === "output" && - m.id === "s0" && - Buffer.from(m.data, "base64").toString().includes("daemon-integration"), + m.id === "smoke-0" && + Buffer.from(m.data, "base64").toString().includes("daemon-smoke"), + 3000, + ); + const exit = await c.waitFor( + 
(m) => m.type === "exit" && m.id === "smoke-0", 3000, ); - assert.ok(output.type === "output"); - - const exit = await c.waitFor((m) => m.type === "exit" && m.id === "s0", 3000); - assert.equal(exit.type, "exit"); if (exit.type === "exit") assert.equal(exit.code, 0); - await c.close(); }); -test("input is forwarded to the shell and echoed back via output", async () => { - const c = await connect(); - c.send({ type: "hello", protocols: [1] }); - await c.waitFor((m) => m.type === "hello-ack"); +test("input is forwarded and echoed via output", async () => { + const c = await connectAndHello(sockPath); c.send({ type: "open", - id: "s1", + id: "smoke-1", meta: { shell: "/bin/sh", argv: ["-i"], cols: 80, rows: 24 }, }); await c.waitFor((m) => m.type === "open-ok"); - c.send({ type: "subscribe", id: "s1", replay: false }); - - // Send "echo abc-marker\n" — shell echoes the typed bytes back through the - // PTY (canonical mode), AND prints "abc-marker" as the command output. + c.send({ type: "subscribe", id: "smoke-1", replay: false }); c.send({ type: "input", - id: "s1", + id: "smoke-1", data: Buffer.from("echo abc-marker\n").toString("base64"), }); - await c.waitFor( (m) => m.type === "output" && - m.id === "s1" && + m.id === "smoke-1" && Buffer.from(m.data, "base64").toString().includes("abc-marker"), 3000, ); - - c.send({ type: "close", id: "s1", signal: "SIGTERM" }); - await c.waitFor((m) => m.type === "closed" && m.id === "s1"); - await c.close(); -}); - -test("subscribe with replay sends prior buffered output", async () => { - const c = await connect(); - c.send({ type: "hello", protocols: [1] }); - await c.waitFor((m) => m.type === "hello-ack"); - - c.send({ - type: "open", - id: "s2", - meta: { - shell: "/bin/sh", - argv: ["-c", "echo replay-test; sleep 1"], - cols: 80, - rows: 24, - }, - }); - await c.waitFor((m) => m.type === "open-ok"); - - // Wait a bit so the shell emits its initial output into the daemon's buffer - // without any subscriber yet. 
- await new Promise((resolve) => setTimeout(resolve, 200)); - - c.send({ type: "subscribe", id: "s2", replay: true }); - const out = await c.waitFor( - (m) => - m.type === "output" && - m.id === "s2" && - Buffer.from(m.data, "base64").toString().includes("replay-test"), - 2000, - ); - assert.ok(out.type === "output"); - - c.send({ type: "close", id: "s2", signal: "SIGTERM" }); - await c.close(); -}); - -test("list returns active sessions", async () => { - const c = await connect(); - c.send({ type: "hello", protocols: [1] }); - await c.waitFor((m) => m.type === "hello-ack"); - - c.send({ - type: "open", - id: "list-a", - meta: { shell: "/bin/sh", argv: ["-c", "sleep 5"], cols: 80, rows: 24 }, - }); - await c.waitFor((m) => m.type === "open-ok" && m.id === "list-a"); - - c.send({ type: "list" }); - const list = await c.waitFor((m) => m.type === "list-reply"); - assert.equal(list.type, "list-reply"); - if (list.type === "list-reply") { - assert.ok(list.sessions.some((s) => s.id === "list-a")); - } - - c.send({ type: "close", id: "list-a", signal: "SIGTERM" }); - await c.waitFor((m) => m.type === "closed"); + c.send({ type: "close", id: "smoke-1", signal: "SIGTERM" }); + await c.waitFor((m) => m.type === "closed" && m.id === "smoke-1"); await c.close(); }); From 9bdbf7b8500651473ca48d13ccfc82eb5b5d1257 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 20:38:13 -0700 Subject: [PATCH 03/33] =?UTF-8?q?feat(host-service):=20DaemonClient=20?= =?UTF-8?q?=E2=80=94=20Unix-socket=20client=20for=20pty-daemon?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New module packages/host-service/src/terminal/DaemonClient/. 
Single long-lived connection to pty-daemon, typed protocol API: - connect() + handshake, exposing version + protocol number - open / close / list as request/response promises - input / resize as fire-and-forget - subscribe(id, { replay }, { onOutput, onExit }) with multi-local-subscriber fan-out from one wire subscription; unsubscribe returned - onDisconnect(cb) for daemon-crash signaling - dispose() for clean shutdown Failure model is intentionally dumb: connection-level errors surface via onDisconnect; the desktop coordinator is responsible for respawning the daemon and host-service can reconnect by constructing a new DaemonClient. No in-band reconnect logic. Adds @superset/pty-daemon as a workspace dependency (host-service was already on node-pty 1.1; this layers the daemon protocol on top). Enables allowImportingTsExtensions in host-service tsconfig because the pty-daemon package's exports map points at .ts source files (Node ESM requires explicit extensions). Tests: 5 integration tests against a real Server (node --test): - connect + handshake exposes daemon version - open + subscribe + receive output + close - input forwarded; resize updates dims - multiple local subscribers fan out from one wire subscription - disconnect callback fires when daemon goes away Avoids parameter property shorthand in the constructor — Node's --experimental-strip-types doesn't allow it. Doesn't touch terminal.ts yet — that's the next commit on this branch. 
--- bun.lock | 1 + packages/host-service/package.json | 1 + .../DaemonClient/DaemonClient.test.ts | 187 +++++++++ .../src/terminal/DaemonClient/DaemonClient.ts | 369 ++++++++++++++++++ .../src/terminal/DaemonClient/index.ts | 8 + packages/host-service/tsconfig.json | 6 +- packages/pty-daemon/src/main.ts | 0 7 files changed, 571 insertions(+), 1 deletion(-) create mode 100644 packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts create mode 100644 packages/host-service/src/terminal/DaemonClient/DaemonClient.ts create mode 100644 packages/host-service/src/terminal/DaemonClient/index.ts mode change 100644 => 100755 packages/pty-daemon/src/main.ts diff --git a/bun.lock b/bun.lock index b58736baab0..3e0b456e1ff 100644 --- a/bun.lock +++ b/bun.lock @@ -770,6 +770,7 @@ "@octokit/rest": "^22.0.1", "@superset/chat": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/workspace-fs": "workspace:*", diff --git a/packages/host-service/package.json b/packages/host-service/package.json index c814f8c0eec..08f703587c1 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -48,6 +48,7 @@ "@octokit/rest": "^22.0.1", "@superset/chat": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/workspace-fs": "workspace:*", diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts new file mode 100644 index 00000000000..30b52c0be29 --- /dev/null +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts @@ -0,0 +1,187 @@ +// End-to-end test for DaemonClient against a real pty-daemon Server. 
+// Runs under Node (`node --experimental-strip-types --test`) because the +// daemon spawns real PTYs via node-pty. + +import { strict as assert } from "node:assert"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, test } from "node:test"; +import { Server } from "@superset/pty-daemon"; +import { DaemonClient } from "./DaemonClient.ts"; + +const sockPath = path.join( + os.tmpdir(), + `host-daemon-client-${process.pid}.sock`, +); +let server: Server; + +before(async () => { + server = new Server({ + socketPath: sockPath, + daemonVersion: "0.0.0-host-test", + }); + await server.listen(); +}); + +after(async () => { + await server.close(); +}); + +test("connect + handshake exposes daemon version", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + assert.equal(c.version, "0.0.0-host-test"); + assert.equal(c.protocol, 1); + assert.ok(c.isConnected); + await c.dispose(); +}); + +test("open + subscribe + receive output + close", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + + const id = "host-test-0"; + const result = await c.open(id, { + shell: "/bin/sh", + argv: ["-c", "echo from-daemon-client; sleep 0.2"], + cols: 80, + rows: 24, + }); + assert.ok(result.pid > 0); + + const chunks: Buffer[] = []; + const exitInfo: { code: number | null; signal: number | null }[] = []; + const unsubscribe = c.subscribe( + id, + { replay: true }, + { + onOutput: (b) => chunks.push(b), + onExit: (info) => exitInfo.push(info), + }, + ); + + await new Promise((r) => setTimeout(r, 600)); + const combined = Buffer.concat(chunks).toString("utf8"); + assert.ok( + combined.includes("from-daemon-client"), + `output missing marker: ${combined}`, + ); + assert.equal(exitInfo.length, 1); + assert.equal(exitInfo[0]?.code, 0); + + unsubscribe(); + await c.dispose(); +}); + +test("input is forwarded; resize updates dims", async () => { + const c = new DaemonClient({ 
socketPath: sockPath }); + await c.connect(); + + const id = "host-test-1"; + await c.open(id, { + shell: "/bin/sh", + argv: ["-i"], + cols: 80, + rows: 24, + }); + + const chunks: Buffer[] = []; + const unsubscribe = c.subscribe( + id, + { replay: false }, + { + onOutput: (b) => chunks.push(b), + onExit: () => {}, + }, + ); + + c.input(id, Buffer.from("echo input-marker\n")); + + await waitFor( + () => Buffer.concat(chunks).toString().includes("input-marker"), + 3000, + ); + + c.resize(id, 100, 30); + const list = await c.list(); + const me = list.find((s) => s.id === id); + assert.equal(me?.cols, 100); + assert.equal(me?.rows, 30); + + unsubscribe(); + await c.close(id, "SIGTERM"); + await c.dispose(); +}); + +test("multiple local subscribers get fanned out from one wire subscription", async () => { + const c = new DaemonClient({ socketPath: sockPath }); + await c.connect(); + + const id = "host-test-fanout"; + await c.open(id, { + shell: "/bin/sh", + argv: ["-c", "echo fanout; sleep 0.3"], + cols: 80, + rows: 24, + }); + + const a: Buffer[] = []; + const b: Buffer[] = []; + const unsubA = c.subscribe( + id, + { replay: true }, + { + onOutput: (buf) => a.push(buf), + onExit: () => {}, + }, + ); + const unsubB = c.subscribe( + id, + { replay: true }, + { + onOutput: (buf) => b.push(buf), + onExit: () => {}, + }, + ); + + await new Promise((r) => setTimeout(r, 500)); + assert.ok(Buffer.concat(a).toString().includes("fanout")); + assert.ok(Buffer.concat(b).toString().includes("fanout")); + + unsubA(); + unsubB(); + await c.dispose(); +}); + +test("disconnect callback fires when daemon goes away", async () => { + // Spin up a throw-away server we can shut down independently. 
+ const localPath = path.join( + os.tmpdir(), + `host-daemon-client-disc-${process.pid}.sock`, + ); + const local = new Server({ + socketPath: localPath, + daemonVersion: "0.0.0-disc", + }); + await local.listen(); + + const c = new DaemonClient({ socketPath: localPath }); + await c.connect(); + + const disc = new Promise((resolve) => { + c.onDisconnect(() => resolve()); + }); + + await local.close(); + await disc; + assert.equal(c.isConnected, false); + await c.dispose(); +}); + +async function waitFor(predicate: () => boolean, ms: number): Promise { + const start = Date.now(); + while (!predicate()) { + if (Date.now() - start > ms) throw new Error("waitFor timed out"); + await new Promise((r) => setTimeout(r, 25)); + } +} diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts new file mode 100644 index 00000000000..1e03eb8be76 --- /dev/null +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts @@ -0,0 +1,369 @@ +// Client for the pty-daemon Unix-socket protocol. +// +// host-service holds a single long-lived DaemonClient. PTYs are owned by the +// daemon; this client is purely a thin transport over the socket: send typed +// requests, receive typed events, route output/exit to per-session callbacks. +// +// Lifecycle: +// - connect() opens the socket and completes the handshake. +// - subscribe(sessionId) registers callbacks; you'll receive every output +// and exit frame the daemon emits for that session id. +// - dispose() closes the socket; the daemon keeps owning sessions. +// +// Failure model: connection-level errors (daemon crash, socket close) are +// surfaced via onDisconnect. The desktop coordinator is responsible for +// respawning the daemon and host-service can reconnect by constructing a new +// DaemonClient. There is no in-band reconnect logic here — keep it dumb. 
+ +import * as net from "node:net"; +import { + CURRENT_PROTOCOL_VERSION, + encodeFrame, + FrameDecoder, + type ServerMessage, + type SessionInfo, + type SessionMeta, +} from "@superset/pty-daemon/protocol"; + +export interface OpenResult { + id: string; + pid: number; +} + +export interface ExitInfo { + code: number | null; + signal: number | null; +} + +export type Signal = "SIGINT" | "SIGTERM" | "SIGKILL" | "SIGHUP"; + +export interface SubscribeCallbacks { + onOutput: (chunk: Buffer) => void; + onExit: (info: ExitInfo) => void; +} + +interface SessionCallbacks { + output: Set<(chunk: Buffer) => void>; + exit: Set<(info: ExitInfo) => void>; +} + +export interface DaemonClientOptions { + socketPath: string; + connectTimeoutMs?: number; +} + +export class DaemonClient { + private readonly opts: DaemonClientOptions; + private socket: net.Socket | null = null; + private decoder = new FrameDecoder(); + private readonly callbacks = new Map(); + private readonly disconnectCbs = new Set<(err?: Error) => void>(); + private daemonVersion = ""; + private negotiated: number | null = null; + private connected = false; + + constructor(opts: DaemonClientOptions) { + this.opts = opts; + } + + async connect(): Promise { + const socket = await openSocket(this.opts); + this.socket = socket; + socket.on("data", (chunk) => this.onData(chunk)); + socket.on("close", () => this.onClose()); + socket.on("error", (err) => this.onClose(err)); + await this.handshake(); + this.connected = true; + } + + get isConnected(): boolean { + return this.connected && this.socket !== null && !this.socket.destroyed; + } + + get version(): string { + return this.daemonVersion; + } + + get protocol(): number { + return this.negotiated ?? 
CURRENT_PROTOCOL_VERSION; + } + + onDisconnect(cb: (err?: Error) => void): () => void { + this.disconnectCbs.add(cb); + return () => { + this.disconnectCbs.delete(cb); + }; + } + + async open(id: string, meta: SessionMeta): Promise { + const reply = await this.requestSession(id, { type: "open", id, meta }); + if (reply.type === "open-ok") return { id, pid: reply.pid }; + if (reply.type === "error") throw new Error(`open ${id}: ${reply.message}`); + throw new Error(`open ${id}: unexpected reply ${reply.type}`); + } + + async close(id: string, signal: Signal = "SIGTERM"): Promise { + const reply = await this.requestSession(id, { type: "close", id, signal }); + if (reply.type === "closed") return; + if (reply.type === "error") + throw new Error(`close ${id}: ${reply.message}`); + throw new Error(`close ${id}: unexpected reply ${reply.type}`); + } + + async list(): Promise { + const reply = await this.requestNonSession({ type: "list" }, "list-reply"); + if (reply.type === "list-reply") return reply.sessions; + throw new Error(`list: unexpected reply ${reply.type}`); + } + + /** Fire-and-forget; bytes go straight to the PTY. */ + input(id: string, data: Buffer): void { + this.send({ + type: "input", + id, + data: data.toString("base64"), + }); + } + + /** Fire-and-forget; daemon validates dims. */ + resize(id: string, cols: number, rows: number): void { + this.send({ type: "resize", id, cols, rows }); + } + + /** + * Subscribe to a session's output + exit stream. Returns an unsubscribe + * function. With `replay: true` the daemon sends its current ring buffer + * before live streaming begins. Multiple subscribers per session are + * supported — the daemon fans output out to all of them. 
+ */ + subscribe( + id: string, + opts: { replay: boolean }, + cb: SubscribeCallbacks, + ): () => void { + let entry = this.callbacks.get(id); + const wasFirst = !entry; + if (!entry) { + entry = { output: new Set(), exit: new Set() }; + this.callbacks.set(id, entry); + } + entry.output.add(cb.onOutput); + entry.exit.add(cb.onExit); + // Only the first subscribe per session id sends the wire `subscribe`. + // Subsequent local callbacks just register into the existing entry. + if (wasFirst) { + this.send({ type: "subscribe", id, replay: opts.replay }); + } + return () => { + const e = this.callbacks.get(id); + if (!e) return; + e.output.delete(cb.onOutput); + e.exit.delete(cb.onExit); + if (e.output.size === 0 && e.exit.size === 0) { + this.callbacks.delete(id); + this.send({ type: "unsubscribe", id }); + } + }; + } + + async dispose(): Promise { + this.connected = false; + const sock = this.socket; + this.socket = null; + if (!sock || sock.destroyed) return; + await new Promise((resolve) => { + sock.end(() => resolve()); + setTimeout(() => { + if (!sock.destroyed) sock.destroy(); + resolve(); + }, 200); + }); + } + + // ---- Internals ---- + + private async handshake(): Promise { + this.send({ + type: "hello", + protocols: [CURRENT_PROTOCOL_VERSION], + }); + const ack = await this.waitForFrame( + (m) => m.type === "hello-ack" || m.type === "error", + 5000, + ); + if (ack.type === "error") { + throw new Error(`daemon handshake failed: ${ack.message}`); + } + if (ack.type !== "hello-ack") { + throw new Error(`daemon handshake unexpected reply: ${ack.type}`); + } + this.daemonVersion = ack.daemonVersion; + this.negotiated = ack.protocol; + } + + private requestSession( + id: string, + req: + | { type: "open"; id: string; meta: SessionMeta } + | { type: "close"; id: string; signal: Signal }, + ): Promise { + return new Promise((resolve, reject) => { + let resolved = false; + const settle = (m: ServerMessage) => { + if (resolved) return; + resolved = true; + cleanup(); 
+ resolve(m); + }; + const fail = (err: Error) => { + if (resolved) return; + resolved = true; + cleanup(); + reject(err); + }; + const off = this.on((m) => { + if (m.type === "error" && m.id === id) settle(m); + else if (req.type === "open" && m.type === "open-ok" && m.id === id) + settle(m); + else if (req.type === "close" && m.type === "closed" && m.id === id) + settle(m); + }); + const offDisc = this.onDisconnect((err) => + fail(err ?? new Error("daemon disconnected")), + ); + const cleanup = () => { + off(); + offDisc(); + }; + this.send(req); + }); + } + + private requestNonSession( + req: { type: "list" }, + expectType: "list-reply", + ): Promise { + return new Promise((resolve, reject) => { + let resolved = false; + const settle = (m: ServerMessage) => { + if (resolved) return; + resolved = true; + cleanup(); + resolve(m); + }; + const fail = (err: Error) => { + if (resolved) return; + resolved = true; + cleanup(); + reject(err); + }; + const off = this.on((m) => { + if (m.type === expectType || m.type === "error") settle(m); + }); + const offDisc = this.onDisconnect((err) => + fail(err ?? new Error("daemon disconnected")), + ); + const cleanup = () => { + off(); + offDisc(); + }; + this.send(req); + }); + } + + /** Register a one-shot listener. Returns an unsubscribe; called for every frame until disposed. 
*/ + private on(cb: (m: ServerMessage) => void): () => void { + this.adhocListeners.add(cb); + return () => { + this.adhocListeners.delete(cb); + }; + } + + private adhocListeners = new Set<(m: ServerMessage) => void>(); + + private waitForFrame( + predicate: (m: ServerMessage) => boolean, + timeoutMs: number, + ): Promise { + return new Promise((resolve, reject) => { + const off = this.on((m) => { + if (predicate(m)) { + off(); + clearTimeout(timer); + resolve(m); + } + }); + const timer = setTimeout(() => { + off(); + reject(new Error(`daemon: timed out after ${timeoutMs}ms`)); + }, timeoutMs); + }); + } + + private send(msg: unknown): void { + const sock = this.socket; + if (!sock || sock.destroyed) { + throw new Error("DaemonClient: socket not connected"); + } + sock.write(encodeFrame(msg)); + } + + private onData(chunk: Buffer): void { + this.decoder.push(chunk); + let frames: unknown[]; + try { + frames = this.decoder.drain(); + } catch (err) { + this.onClose(err as Error); + return; + } + for (const raw of frames) { + const msg = raw as ServerMessage; + // Route session-keyed events to subscriber callbacks. + if (msg.type === "output" && this.callbacks.has(msg.id)) { + const buf = Buffer.from(msg.data, "base64"); + for (const cb of this.callbacks.get(msg.id)?.output ?? []) { + cb(buf); + } + continue; + } + if (msg.type === "exit" && this.callbacks.has(msg.id)) { + const info: ExitInfo = { code: msg.code, signal: msg.signal }; + for (const cb of this.callbacks.get(msg.id)?.exit ?? []) { + cb(info); + } + continue; + } + // Everything else (open-ok, closed, error, hello-ack, list-reply) + // goes through the adhoc listener fan-out so request/response + // helpers can pick it up. 
+ for (const l of this.adhocListeners) l(msg); + } + } + + private onClose(err?: Error): void { + if (!this.connected && this.socket === null) return; + this.connected = false; + this.socket = null; + for (const cb of this.disconnectCbs) cb(err); + } +} + +function openSocket(opts: DaemonClientOptions): Promise { + const timeoutMs = opts.connectTimeoutMs ?? 5000; + return new Promise((resolve, reject) => { + const socket = net.createConnection({ path: opts.socketPath }); + const timer = setTimeout(() => { + socket.destroy(); + reject(new Error(`DaemonClient: connect timed out after ${timeoutMs}ms`)); + }, timeoutMs); + socket.once("connect", () => { + clearTimeout(timer); + resolve(socket); + }); + socket.once("error", (err) => { + clearTimeout(timer); + reject(err); + }); + }); +} diff --git a/packages/host-service/src/terminal/DaemonClient/index.ts b/packages/host-service/src/terminal/DaemonClient/index.ts new file mode 100644 index 00000000000..a98b4755a24 --- /dev/null +++ b/packages/host-service/src/terminal/DaemonClient/index.ts @@ -0,0 +1,8 @@ +export type { + DaemonClientOptions, + ExitInfo, + OpenResult, + Signal, + SubscribeCallbacks, +} from "./DaemonClient"; +export { DaemonClient } from "./DaemonClient"; diff --git a/packages/host-service/tsconfig.json b/packages/host-service/tsconfig.json index 1e480221828..c2b5b4f768c 100644 --- a/packages/host-service/tsconfig.json +++ b/packages/host-service/tsconfig.json @@ -2,7 +2,11 @@ "extends": "@superset/typescript/base.json", "compilerOptions": { "jsx": "react-jsx", - "types": ["bun-types"] + "types": ["bun-types"], + // Required because we depend on @superset/pty-daemon, whose package + // exports point at .ts source files (Node ESM requires explicit + // extensions in directory-style imports). 
+ "allowImportingTsExtensions": true }, "include": ["src"] } diff --git a/packages/pty-daemon/src/main.ts b/packages/pty-daemon/src/main.ts old mode 100644 new mode 100755 From b1eb105f0e828c435405e98df89f9d7aae7b1d02 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 20:47:33 -0700 Subject: [PATCH 04/33] feat(desktop): pty-daemon coordinator + manifest + main entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling of HostServiceCoordinator that spawns/adopts the long-lived pty-daemon and feeds its socket path to host-service via SUPERSET_PTY_DAEMON_SOCKET. PTYs now live in a process whose lifetime is decoupled from host-service, so host-service restarts don't kill user shells. Pieces: - apps/desktop/src/main/lib/pty-daemon-manifest.ts — sibling of host-service-manifest.ts. Manifest at $SUPERSET_HOME_DIR/host/{orgId}/pty-daemon-manifest.json with pid, socketPath, protocolVersions, daemonVersion, startedAt. - apps/desktop/src/main/lib/pty-daemon-coordinator.ts — ensure() spawns detached child or adopts existing daemon (PID alive AND socket connectable). Same spawn shape as host-service: process.execPath + bundled script, openRotatingLogFd for stdio, writes manifest after socket-ready check. - apps/desktop/src/main/pty-daemon/index.ts — Electron main entry that imports @superset/pty-daemon's Server and provides argv/signal glue. Sibling of src/main/host-service/index.ts. - electron.vite.config.ts: register pty-daemon as a main entry so it bundles to dist/main/pty-daemon.js next to host-service.js. - host-service-coordinator: instantiate PtyDaemonCoordinator, ensure daemon up before each host-service spawn, pass its socket path to host-service via env. buildEnv signature gains a ptyDaemonSocket parameter. - tsconfig: enable allowImportingTsExtensions in apps/desktop and packages/host-service so transitively imported pty-daemon source type-checks (Node ESM requires explicit .ts extensions). 
What works after this commit: - Daemon spawns and listens on a 0600 Unix socket per organization. - host-service receives SUPERSET_PTY_DAEMON_SOCKET in its env. - Adoption: if a previous daemon is alive + reachable, reuse it. - Stale daemons (PID alive, socket gone) get killed and respawned. What does NOT work yet (next commit on this branch): - terminal.ts in host-service still calls pty.spawn directly. The daemon spawns but its DaemonClient isn't wired into terminal session creation. That's the load-bearing refactor; landing it separately so the coordinator change above can be reviewed in isolation. --- apps/desktop/electron.vite.config.ts | 3 + apps/desktop/package.json | 1 + .../src/main/lib/host-service-coordinator.ts | 22 +- .../src/main/lib/pty-daemon-coordinator.ts | 262 ++++++++++++++++++ .../src/main/lib/pty-daemon-manifest.ts | 95 +++++++ apps/desktop/src/main/pty-daemon/index.ts | 57 ++++ apps/desktop/tsconfig.json | 5 + bun.lock | 1 + 8 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 apps/desktop/src/main/lib/pty-daemon-coordinator.ts create mode 100644 apps/desktop/src/main/lib/pty-daemon-manifest.ts create mode 100644 apps/desktop/src/main/pty-daemon/index.ts diff --git a/apps/desktop/electron.vite.config.ts b/apps/desktop/electron.vite.config.ts index 5f073935c45..f11759ceaa2 100644 --- a/apps/desktop/electron.vite.config.ts +++ b/apps/desktop/electron.vite.config.ts @@ -111,6 +111,9 @@ export default defineConfig({ "git-task-worker": resolve("src/main/git-task-worker.ts"), // Workspace service - local HTTP/tRPC server per org "host-service": resolve("src/main/host-service/index.ts"), + // pty-daemon - long-lived per-org Unix-socket server that owns PTYs. + // Spawned by PtyDaemonCoordinator; survives host-service restarts. 
+ "pty-daemon": resolve("src/main/pty-daemon/index.ts"), }, output: { dir: resolve(devPath, "main"), diff --git a/apps/desktop/package.json b/apps/desktop/package.json index 9af93ec9d16..4cbcb675255 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -90,6 +90,7 @@ "@superset/macos-process-metrics": "workspace:*", "@superset/panes": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/ui": "workspace:*", diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index 779627d3e5f..d43a8ba4bcb 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -27,6 +27,7 @@ import { pollHealthCheck, } from "./host-service-utils"; import { localDb } from "./local-db"; +import { PtyDaemonCoordinator } from "./pty-daemon-coordinator"; import { HOOK_PROTOCOL_VERSION } from "./terminal/env"; /** @@ -82,6 +83,13 @@ export class HostServiceCoordinator extends EventEmitter { private scriptPath = path.join(__dirname, "host-service.js"); private machineId = getHostId(); private devReloadWatcher: fs.FSWatcher | null = null; + // Sibling coordinator for the long-lived pty-daemon. Owns PTYs so that + // host-service restarts don't kill user shells. Its scriptPath sits next + // to ours after the build (apps/desktop/src/main + dist/host-service.js + + // dist/pty-daemon.js — see runtime-dependencies.ts for packaging). 
+ private ptyDaemon = new PtyDaemonCoordinator({ + scriptPath: path.join(__dirname, "pty-daemon.js"), + }); async start( organizationId: string, @@ -385,7 +393,17 @@ export class HostServiceCoordinator extends EventEmitter { this.instances.set(organizationId, instance); this.emitStatus(organizationId, "starting", null); - const childEnv = await this.buildEnv(organizationId, port, secret, config); + // Ensure the pty-daemon is up before host-service starts; host-service + // connects to it during boot for terminal ops. + const daemonInstance = await this.ptyDaemon.ensure(organizationId); + + const childEnv = await this.buildEnv( + organizationId, + port, + secret, + config, + daemonInstance.socketPath, + ); // Host-service owns v2 PTYs, so it must survive Electron restarts in // every environment. This mirrors the terminal-host daemon: detach the // child and back stdio with real files so parent teardown cannot close @@ -457,6 +475,7 @@ export class HostServiceCoordinator extends EventEmitter { port: number, secret: string, config: SpawnConfig, + ptyDaemonSocket: string, ): Promise> { const organizationDir = manifestDir(organizationId); const row = localDb.select().from(settings).get(); @@ -479,6 +498,7 @@ export class HostServiceCoordinator extends EventEmitter { SUPERSET_HOME_DIR: SUPERSET_HOME_DIR, SUPERSET_AGENT_HOOK_PORT: String(sharedEnv.DESKTOP_NOTIFICATIONS_PORT), SUPERSET_AGENT_HOOK_VERSION: HOOK_PROTOCOL_VERSION, + SUPERSET_PTY_DAEMON_SOCKET: ptyDaemonSocket, AUTH_TOKEN: config.authToken, CLOUD_API_URL: config.cloudApiUrl, }); diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts new file mode 100644 index 00000000000..acfaf4e2500 --- /dev/null +++ b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts @@ -0,0 +1,262 @@ +// PtyDaemonCoordinator — sibling of HostServiceCoordinator, owns the +// per-organization pty-daemon process. 
Spawns or adopts the daemon and +// returns its Unix-socket path. host-service is told the path via +// SUPERSET_PTY_DAEMON_SOCKET so its DaemonClient can connect. +// +// Why detached spawn (matches host-service's approach): the daemon must +// outlive the desktop main process and host-service restarts. PTY ownership +// lives here so the rest of the system can be killed/restarted freely +// without losing user shells. + +import * as childProcess from "node:child_process"; +import * as fs from "node:fs"; +import * as net from "node:net"; +import * as path from "node:path"; +import { SUPERSET_HOME_DIR } from "./app-environment"; +import { isProcessAlive } from "./host-service-manifest"; +import { MAX_HOST_LOG_BYTES, openRotatingLogFd } from "./host-service-utils"; +import { + type PtyDaemonManifest, + ptyDaemonManifestDir, + readPtyDaemonManifest, + removePtyDaemonManifest, + writePtyDaemonManifest, +} from "./pty-daemon-manifest"; + +interface DaemonInstance { + pid: number; + socketPath: string; + startedAt: number; +} + +const SOCKET_READY_TIMEOUT_MS = 5_000; + +/** + * Per-organization socket path. Owner-only directory inherits from the + * existing $SUPERSET_HOME_DIR/host/{orgId}/ tree. + */ +function ptyDaemonSocketPath(organizationId: string): string { + return path.join(ptyDaemonManifestDir(organizationId), "pty-daemon.sock"); +} + +export interface PtyDaemonCoordinatorOptions { + /** Path to the daemon entry script (e.g. dist/pty-daemon.js). */ + scriptPath: string; +} + +export class PtyDaemonCoordinator { + private readonly opts: PtyDaemonCoordinatorOptions; + private readonly instances = new Map(); + private readonly pendingStarts = new Map>(); + + constructor(opts: PtyDaemonCoordinatorOptions) { + this.opts = opts; + } + + /** + * Spawn the daemon if not already running for this organization, or + * adopt the running one. Returns the socket path host-service should + * connect to. 
+ */ + async ensure(organizationId: string): Promise { + const existing = this.instances.get(organizationId); + if (existing) return existing; + const pending = this.pendingStarts.get(organizationId); + if (pending) return pending; + + const startPromise = this.start(organizationId).finally(() => { + this.pendingStarts.delete(organizationId); + }); + this.pendingStarts.set(organizationId, startPromise); + return startPromise; + } + + getSocketPath(organizationId: string): string | null { + return this.instances.get(organizationId)?.socketPath ?? null; + } + + async stop(organizationId: string): Promise { + const instance = this.instances.get(organizationId); + this.instances.delete(organizationId); + if (!instance) return; + try { + process.kill(instance.pid, "SIGTERM"); + } catch { + // Already dead. + } + removePtyDaemonManifest(organizationId); + } + + private async start(organizationId: string): Promise { + // Try to adopt an existing daemon if its manifest is fresh and + // process is alive and the socket is connectable. + const adopted = await this.tryAdopt(organizationId); + if (adopted) { + this.instances.set(organizationId, adopted); + console.log( + `[pty-daemon:${organizationId}] adopted existing daemon pid=${adopted.pid}`, + ); + return adopted; + } + + // Otherwise spawn a fresh one. + return this.spawn(organizationId); + } + + private async tryAdopt( + organizationId: string, + ): Promise { + const manifest = readPtyDaemonManifest(organizationId); + if (!manifest) return null; + if (!isProcessAlive(manifest.pid)) { + removePtyDaemonManifest(organizationId); + return null; + } + const reachable = await isSocketConnectable(manifest.socketPath, 1000); + if (!reachable) { + // PID alive but socket gone — daemon is wedged. Kill and respawn. + try { + process.kill(manifest.pid, "SIGTERM"); + } catch { + // Already dead. 
+ } + removePtyDaemonManifest(organizationId); + return null; + } + return { + pid: manifest.pid, + socketPath: manifest.socketPath, + startedAt: manifest.startedAt, + }; + } + + private async spawn(organizationId: string): Promise { + const dir = ptyDaemonManifestDir(organizationId); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); + } + const socketPath = ptyDaemonSocketPath(organizationId); + + const logFd = openRotatingLogFd( + path.join(dir, "pty-daemon.log"), + MAX_HOST_LOG_BYTES, + ); + const stdio: childProcess.StdioOptions = + logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; + + const childEnv = { + ...(process.env as Record), + ELECTRON_RUN_AS_NODE: "1", + ORGANIZATION_ID: organizationId, + SUPERSET_HOME_DIR, + }; + + let child: ReturnType; + try { + child = childProcess.spawn( + process.execPath, + [this.opts.scriptPath, `--socket=${socketPath}`], + { + detached: true, + stdio, + env: childEnv, + windowsHide: true, + }, + ); + } finally { + if (logFd >= 0) { + try { + fs.closeSync(logFd); + } catch { + // best-effort + } + } + } + + const childPid = child.pid; + if (!childPid) { + throw new Error(`[pty-daemon:${organizationId}] failed to spawn`); + } + + // Wait for the socket file to appear AND become connectable. 
+ const ready = await waitForSocket(socketPath, SOCKET_READY_TIMEOUT_MS); + if (!ready) { + try { + child.kill("SIGTERM"); + } catch { + // best-effort + } + throw new Error( + `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms`, + ); + } + + child.unref(); + child.on("exit", (code) => { + console.log(`[pty-daemon:${organizationId}] exited with code ${code}`); + const current = this.instances.get(organizationId); + if (current?.pid === childPid) { + this.instances.delete(organizationId); + removePtyDaemonManifest(organizationId); + } + }); + + const startedAt = Date.now(); + const manifest: PtyDaemonManifest = { + pid: childPid, + socketPath, + protocolVersions: [1], + daemonVersion: "unknown", // filled in by hello-ack on first connect + startedAt, + organizationId, + }; + writePtyDaemonManifest(manifest); + + const instance: DaemonInstance = { + pid: childPid, + socketPath, + startedAt, + }; + this.instances.set(organizationId, instance); + console.log( + `[pty-daemon:${organizationId}] spawned pid=${childPid} socket=${socketPath}`, + ); + return instance; + } +} + +async function waitForSocket( + socketPath: string, + timeoutMs: number, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (fs.existsSync(socketPath)) { + if (await isSocketConnectable(socketPath, 200)) return true; + } + await new Promise((r) => setTimeout(r, 50)); + } + return false; +} + +function isSocketConnectable( + socketPath: string, + timeoutMs: number, +): Promise { + return new Promise((resolve) => { + const sock = net.createConnection({ path: socketPath }); + const timer = setTimeout(() => { + sock.destroy(); + resolve(false); + }, timeoutMs); + sock.once("connect", () => { + clearTimeout(timer); + sock.end(); + resolve(true); + }); + sock.once("error", () => { + clearTimeout(timer); + resolve(false); + }); + }); +} diff --git a/apps/desktop/src/main/lib/pty-daemon-manifest.ts 
b/apps/desktop/src/main/lib/pty-daemon-manifest.ts new file mode 100644 index 00000000000..9888bb1b09c --- /dev/null +++ b/apps/desktop/src/main/lib/pty-daemon-manifest.ts @@ -0,0 +1,95 @@ +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { join } from "node:path"; +import { SUPERSET_HOME_DIR } from "./app-environment"; + +/** + * Manifest for a running pty-daemon instance. Sibling of + * HostServiceManifest; lives in the same per-organization directory under + * $SUPERSET_HOME_DIR/host/{organizationId}/. Different lifecycles — the + * daemon outlives host-service restarts. + */ +export interface PtyDaemonManifest { + pid: number; + socketPath: string; + protocolVersions: number[]; + daemonVersion: string; + startedAt: number; + organizationId: string; +} + +export function ptyDaemonManifestDir(organizationId: string): string { + return join(SUPERSET_HOME_DIR, "host", organizationId); +} + +function ptyDaemonManifestPath(organizationId: string): string { + return join(ptyDaemonManifestDir(organizationId), "pty-daemon-manifest.json"); +} + +export function writePtyDaemonManifest(manifest: PtyDaemonManifest): void { + const dir = ptyDaemonManifestDir(manifest.organizationId); + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true, mode: 0o700 }); + } + writeFileSync( + ptyDaemonManifestPath(manifest.organizationId), + JSON.stringify(manifest), + { encoding: "utf-8", mode: 0o600 }, + ); +} + +export function readPtyDaemonManifest( + organizationId: string, +): PtyDaemonManifest | null { + const filePath = ptyDaemonManifestPath(organizationId); + if (!existsSync(filePath)) return null; + + try { + const raw = readFileSync(filePath, "utf-8"); + const data = JSON.parse(raw); + if ( + typeof data.pid !== "number" || + typeof data.socketPath !== "string" || + !Array.isArray(data.protocolVersions) || + typeof data.daemonVersion !== "string" || + typeof data.startedAt !== "number" || + typeof 
data.organizationId !== "string" + ) { + return null; + } + return data as PtyDaemonManifest; + } catch { + return null; + } +} + +export function listPtyDaemonManifests(): PtyDaemonManifest[] { + const hostDir = join(SUPERSET_HOME_DIR, "host"); + if (!existsSync(hostDir)) return []; + const manifests: PtyDaemonManifest[] = []; + try { + for (const entry of readdirSync(hostDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + const manifest = readPtyDaemonManifest(entry.name); + if (manifest) manifests.push(manifest); + } + } catch { + // Best-effort scan. + } + return manifests; +} + +export function removePtyDaemonManifest(organizationId: string): void { + const filePath = ptyDaemonManifestPath(organizationId); + try { + if (existsSync(filePath)) unlinkSync(filePath); + } catch { + // Best-effort removal. + } +} diff --git a/apps/desktop/src/main/pty-daemon/index.ts b/apps/desktop/src/main/pty-daemon/index.ts new file mode 100644 index 00000000000..c97755a175c --- /dev/null +++ b/apps/desktop/src/main/pty-daemon/index.ts @@ -0,0 +1,57 @@ +/** + * pty-daemon — Desktop Entry Point + * + * Long-lived process that owns all PTY sessions. host-service is a client + * over a Unix socket. The PtyDaemonCoordinator (sibling of + * HostServiceCoordinator) spawns this and passes --socket=PATH. + * + * Mirrors the host-service entry shape: imports from the workspace package + * and provides the bare runtime glue (argv parsing, signal handling). 
+ */ + +import { Server } from "@superset/pty-daemon"; + +interface CliArgs { + socket: string; +} + +function parseArgs(argv: string[]): CliArgs { + const args: Partial = {}; + for (const arg of argv) { + if (arg.startsWith("--socket=")) { + args.socket = arg.slice("--socket=".length); + } + } + if (!args.socket) { + throw new Error("--socket=PATH is required"); + } + return args as CliArgs; +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + const daemonVersion = process.env.SUPERSET_PTY_DAEMON_VERSION ?? "0.1.0"; + const server = new Server({ + socketPath: args.socket, + daemonVersion, + }); + await server.listen(); + process.stderr.write( + `[pty-daemon] listening on ${args.socket} (v${daemonVersion})\n`, + ); + + const shutdown = async (signal: NodeJS.Signals) => { + process.stderr.write(`[pty-daemon] received ${signal}, shutting down\n`); + await server.close(); + process.exit(0); + }; + process.on("SIGINT", () => void shutdown("SIGINT")); + process.on("SIGTERM", () => void shutdown("SIGTERM")); +} + +void main().catch((error) => { + process.stderr.write( + `[pty-daemon] failed to start: ${(error as Error).stack ?? error}\n`, + ); + process.exit(1); +}); diff --git a/apps/desktop/tsconfig.json b/apps/desktop/tsconfig.json index 32c89c5e403..d8ec40abcfa 100644 --- a/apps/desktop/tsconfig.json +++ b/apps/desktop/tsconfig.json @@ -3,6 +3,11 @@ "compilerOptions": { "types": ["bun"], "baseUrl": ".", + // Required because we depend on @superset/pty-daemon (transitively + // through @superset/host-service and directly via the daemon entry). + // Its package exports point at .ts source, and Node ESM requires + // explicit extensions in directory-style imports. 
+ "allowImportingTsExtensions": true, "paths": { "*": ["src/*"], "~/*": ["./*"] diff --git a/bun.lock b/bun.lock index 3e0b456e1ff..9ab19d94e0b 100644 --- a/bun.lock +++ b/bun.lock @@ -167,6 +167,7 @@ "@superset/macos-process-metrics": "workspace:*", "@superset/panes": "workspace:*", "@superset/port-scanner": "workspace:*", + "@superset/pty-daemon": "workspace:*", "@superset/shared": "workspace:*", "@superset/trpc": "workspace:*", "@superset/ui": "workspace:*", From 401e203fe93534b948ab484ca3040d6a258592e2 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 21:01:02 -0700 Subject: [PATCH 05/33] feat(host-service): route terminal sessions through pty-daemon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load-bearing change. terminal.ts no longer calls node-pty's spawn; PTY ownership lives in pty-daemon and host-service is a remote control. After this commit, killing host-service does not kill user shells — the daemon's session map and ring buffer survive the restart, and a fresh host-service connects to the existing daemon and re-subscribes with replay. What changed: - New: src/terminal/daemon-client-singleton.ts. Lazy-initialized DaemonClient pulling SUPERSET_PTY_DAEMON_SOCKET from env. Surfaces daemon-disconnect via console.error; the desktop coordinator is responsible for respawning the daemon and restarting host-service. - terminal.ts: replace pty.spawn / pty.onData / pty.onExit with daemon.open + daemon.subscribe(replay:true). The PTY field becomes a thin DaemonPty facade exposing write/resize/kill/onData/onExit unchanged for callers (teardown.ts, etc). - createTerminalSessionInternal becomes async (await daemon.open). All callers updated: trpc/router/terminal launchSession, workspace-creation/setup-terminal startSetupTerminalIfPresent, runtime/teardown runTeardownScript. - session.unsubscribeDaemon is called on disposeSession to release the primary subscription cleanly. 
- DaemonPty.onData / onExit register additional subscribers via daemon.subscribe; daemon's multi-subscriber fan-out makes this safe. Tests: - pty-daemon: 24 bun unit + 28 node integration → all green - host-service: 37 bun unit + 5 node DaemonClient integration → all green (existing terminal logic still passes its tests; the daemon is wired in but mocked out at the unit level) - Workspace-wide tsc clean across all 27 packages. Build/test plumbing: - DaemonClient.test.ts → DaemonClient.node-test.ts so bun test won't pick it up (node-pty doesn't work under Bun). packages/host-service: new "test:integration" script invokes node. - tooling/typescript/base.json: enable allowImportingTsExtensions globally. Multiple packages now transitively pull in @superset/pty-daemon, whose source uses .ts extension imports (Node ESM requires explicit extensions for directory-style resolution). Roll back the per-package opt-ins added earlier in this branch. What still needs verifying (separate commit/PR): - End-to-end smoke: launch desktop, open a terminal, kill host-service, observe shell survives and renderer reattaches via existing exponential-backoff WebSocket reconnect. The infrastructure is in place; this commit doesn't run the e2e itself. 
--- apps/desktop/tsconfig.json | 5 - packages/host-service/package.json | 3 +- .../src/runtime/teardown/teardown.ts | 2 +- ...ient.test.ts => DaemonClient.node-test.ts} | 0 .../src/terminal/daemon-client-singleton.ts | 53 ++++ .../host-service/src/terminal/terminal.ts | 265 +++++++++++++----- .../src/trpc/router/terminal/terminal.ts | 4 +- .../workspace-creation/procedures/create.ts | 2 +- .../shared/finish-checkout.ts | 2 +- .../shared/setup-terminal.ts | 9 +- packages/host-service/tsconfig.json | 6 +- packages/pty-daemon/tsconfig.json | 1 - tooling/typescript/base.json | 3 +- 13 files changed, 257 insertions(+), 98 deletions(-) rename packages/host-service/src/terminal/DaemonClient/{DaemonClient.test.ts => DaemonClient.node-test.ts} (100%) create mode 100644 packages/host-service/src/terminal/daemon-client-singleton.ts diff --git a/apps/desktop/tsconfig.json b/apps/desktop/tsconfig.json index d8ec40abcfa..32c89c5e403 100644 --- a/apps/desktop/tsconfig.json +++ b/apps/desktop/tsconfig.json @@ -3,11 +3,6 @@ "compilerOptions": { "types": ["bun"], "baseUrl": ".", - // Required because we depend on @superset/pty-daemon (transitively - // through @superset/host-service and directly via the daemon entry). - // Its package exports point at .ts source, and Node ESM requires - // explicit extensions in directory-style imports. 
- "allowImportingTsExtensions": true, "paths": { "*": ["src/*"], "~/*": ["./*"] diff --git a/packages/host-service/package.json b/packages/host-service/package.json index 08f703587c1..12676916c00 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -38,7 +38,8 @@ "dev": "bun run src/serve.ts", "build:host": "bun run build.ts", "generate": "drizzle-kit generate", - "typecheck": "tsc --noEmit --emitDeclarationOnly false" + "typecheck": "tsc --noEmit --emitDeclarationOnly false", + "test:integration": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts" }, "dependencies": { "@hono/node-server": "^1.14.1", diff --git a/packages/host-service/src/runtime/teardown/teardown.ts b/packages/host-service/src/runtime/teardown/teardown.ts index 2c428706f89..44c483aa94d 100644 --- a/packages/host-service/src/runtime/teardown/teardown.ts +++ b/packages/host-service/src/runtime/teardown/teardown.ts @@ -56,7 +56,7 @@ export async function runTeardown({ // Single-quoted so no shell interpolation is possible on the path. const initialCommand = `bash ${singleQuote(scriptPath)} ; exit $?`; - const session = createTerminalSessionInternal({ + const session = await createTerminalSessionInternal({ terminalId, workspaceId, db, diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts similarity index 100% rename from packages/host-service/src/terminal/DaemonClient/DaemonClient.test.ts rename to packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts diff --git a/packages/host-service/src/terminal/daemon-client-singleton.ts b/packages/host-service/src/terminal/daemon-client-singleton.ts new file mode 100644 index 00000000000..0fee7142a95 --- /dev/null +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -0,0 +1,53 @@ +// Lazy singleton DaemonClient for host-service. 
The desktop coordinator +// passes the daemon socket path via SUPERSET_PTY_DAEMON_SOCKET. We connect +// once on first use and reuse the connection for all sessions. +// +// On disconnect we surface via console.error and let the next caller fail — +// the desktop coordinator is responsible for respawning the daemon and +// host-service can be restarted to reconnect. There's no in-band reconnect +// here on purpose; see DaemonClient's "dumb" failure model. + +import { DaemonClient } from "./DaemonClient"; + +let cached: DaemonClient | null = null; +let connecting: Promise | null = null; + +export function ptyDaemonSocketPath(): string { + const path = process.env.SUPERSET_PTY_DAEMON_SOCKET; + if (!path) { + throw new Error( + "SUPERSET_PTY_DAEMON_SOCKET is not set; host-service requires the pty-daemon socket to be passed by the coordinator", + ); + } + return path; +} + +export async function getDaemonClient(): Promise { + if (cached?.isConnected) return cached; + if (connecting) return connecting; + const client = new DaemonClient({ socketPath: ptyDaemonSocketPath() }); + client.onDisconnect((err) => { + console.error( + "[host-service] pty-daemon disconnected:", + err?.message ?? "", + ); + if (cached === client) cached = null; + }); + connecting = client + .connect() + .then(() => { + cached = client; + return client; + }) + .finally(() => { + connecting = null; + }); + return connecting; +} + +/** For tests / shutdown only. 
*/ +export async function disposeDaemonClient(): Promise { + const c = cached; + cached = null; + if (c) await c.dispose(); +} diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index cac03519bcc..cb23c02c5c8 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -13,11 +13,12 @@ import { } from "@superset/shared/terminal-title-scanner"; import { and, eq, ne } from "drizzle-orm"; import type { Hono } from "hono"; -import { type IPty, spawn } from "node-pty"; import type { HostDb } from "../db"; import { projects, terminalSessions, workspaces } from "../db/schema"; import type { EventBus } from "../events"; import { portManager } from "../ports/port-manager"; +import type { DaemonClient } from "./DaemonClient"; +import { getDaemonClient } from "./daemon-client-singleton"; import { buildV2TerminalEnv, getShellLaunchArgs, @@ -25,6 +26,84 @@ import { resolveLaunchShell, } from "./env"; +/** + * Thin adapter exposing approximately the IPty surface that the rest of + * this file (and teardown.ts) was built against, so most of the call + * sites stay unchanged after the daemon extraction. The PTY itself lives + * in pty-daemon; this is a remote control. + * + * onData / onExit register additional subscribers on top of whatever the + * session's primary subscription is doing — daemon supports multi- + * subscriber fan-out per session, so layered observers work fine. 
+ */ +interface PtyDataDisposer { + dispose(): void; +} + +interface DaemonPty { + pid: number; + write(data: string): void; + resize(cols: number, rows: number): void; + kill(signal?: NodeJS.Signals): void; + onData(cb: (data: string) => void): PtyDataDisposer; + onExit( + cb: (info: { exitCode: number; signal: number }) => void, + ): PtyDataDisposer; +} + +function makeDaemonPty( + daemon: DaemonClient, + sessionId: string, + pid: number, +): DaemonPty { + return { + pid, + write(data) { + daemon.input(sessionId, Buffer.from(data, "utf8")); + }, + resize(cols, rows) { + try { + daemon.resize(sessionId, cols, rows); + } catch { + // Daemon may have disconnected; surface via the next op. + } + }, + kill(signal) { + daemon + .close( + sessionId, + (signal as "SIGTERM" | "SIGKILL" | "SIGINT" | "SIGHUP") ?? "SIGTERM", + ) + .catch(() => { + // Already gone or daemon disconnected — no-op. + }); + }, + onData(cb) { + const unsub = daemon.subscribe( + sessionId, + { replay: false }, + { + onOutput: (chunk) => cb(chunk.toString("utf8")), + onExit: () => {}, + }, + ); + return { dispose: unsub }; + }, + onExit(cb) { + const unsub = daemon.subscribe( + sessionId, + { replay: false }, + { + onOutput: () => {}, + onExit: ({ code, signal }) => + cb({ exitCode: code ?? 0, signal: signal ?? 0 }), + }, + ); + return { dispose: unsub }; + }, + }; +} + interface RegisterWorkspaceTerminalRouteOptions { app: Hono; db: HostDb; @@ -97,7 +176,9 @@ type ShellReadyState = "pending" | "ready" | "timed_out" | "unsupported"; interface TerminalSession { terminalId: string; workspaceId: string; - pty: IPty; + pty: DaemonPty; + /** Unsubscribe from the daemon's output/exit stream when disposed. */ + unsubscribeDaemon: (() => void) | null; sockets: Set; buffer: string[]; bufferBytes: number; @@ -293,6 +374,15 @@ export function disposeSession(terminalId: string, db: HostDb) { // PTY may already be dead } } + // Stop receiving daemon callbacks for this session. 
+ if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + session.unsubscribeDaemon = null; + } sessions.delete(terminalId); } @@ -348,7 +438,7 @@ interface CreateTerminalSessionOptions { listed?: boolean; } -export function createTerminalSessionInternal({ +export async function createTerminalSessionInternal({ terminalId, workspaceId, themeType, @@ -356,7 +446,7 @@ export function createTerminalSessionInternal({ eventBus, initialCommand, listed = true, -}: CreateTerminalSessionOptions): TerminalSession | { error: string } { +}: CreateTerminalSessionOptions): Promise { const existing = sessions.get(terminalId); if (existing) { if (listed) existing.listed = true; @@ -405,10 +495,13 @@ export function createTerminalSessionInternal({ hostAgentHookUrl: getHostAgentHookUrl(), }); - let pty: IPty; + let daemon: DaemonClient; + let openResult: { pid: number }; try { - pty = spawn(shell, shellArgs, { - name: "xterm-256color", + daemon = await getDaemonClient(); + openResult = await daemon.open(terminalId, { + shell, + argv: shellArgs, cwd, cols: 120, rows: 32, @@ -420,6 +513,7 @@ export function createTerminalSessionInternal({ error instanceof Error ? 
error.message : "Failed to start terminal", }; } + const pty: DaemonPty = makeDaemonPty(daemon, terminalId, openResult.pid); const createdAt = Date.now(); @@ -451,6 +545,7 @@ export function createTerminalSessionInternal({ terminalId, workspaceId, pty, + unsubscribeDaemon: null, sockets: new Set(), buffer: [], bufferBytes: 0, @@ -479,57 +574,69 @@ export function createTerminalSessionInternal({ }, SHELL_READY_TIMEOUT_MS); } - pty.onData((rawData) => { - const titleUpdates = scanForTerminalTitle(session.titleScanState, rawData); - for (const title of titleUpdates.updates) { - setSessionTitle(session, title); - } - - // Scan for OSC 133;A and strip it from output - let data = rawData; - if (session.shellReadyState === "pending") { - const result = scanForShellReady(session.scanState, rawData); - data = result.output; - if (result.matched) { - resolveShellReady(session, "ready"); - } - } - if (data.length === 0) return; - - portManager.checkOutputForHint(data); - - if (broadcastMessage(session, { type: "data", data }) === 0) { - bufferOutput(session, data); - } - }); - - pty.onExit(({ exitCode, signal }) => { - session.exited = true; - session.exitCode = exitCode ?? 0; - session.exitSignal = signal ?? 0; - - portManager.unregisterSession(terminalId); - - db.update(terminalSessions) - .set({ status: "exited", endedAt: Date.now() }) - .where(eq(terminalSessions.id, terminalId)) - .run(); - - broadcastMessage(session, { - type: "exit", - exitCode: session.exitCode, - signal: session.exitSignal, - }); - - eventBus?.broadcastTerminalLifecycle({ - workspaceId, - terminalId, - eventType: "exit", - exitCode: session.exitCode, - signal: session.exitSignal, - occurredAt: Date.now(), - }); - }); + // Subscribe to the daemon's output + exit stream for this session. We + // pass replay:true so a fresh host-service after a restart picks up + // whatever the daemon already had buffered for the session. 
+ session.unsubscribeDaemon = daemon.subscribe( + terminalId, + { replay: true }, + { + onOutput(chunk) { + const rawData = chunk.toString("utf8"); + const titleUpdates = scanForTerminalTitle( + session.titleScanState, + rawData, + ); + for (const title of titleUpdates.updates) { + setSessionTitle(session, title); + } + + // Scan for OSC 133;A and strip it from output. + let data = rawData; + if (session.shellReadyState === "pending") { + const result = scanForShellReady(session.scanState, rawData); + data = result.output; + if (result.matched) { + resolveShellReady(session, "ready"); + } + } + if (data.length === 0) return; + + portManager.checkOutputForHint(data); + + if (broadcastMessage(session, { type: "data", data }) === 0) { + bufferOutput(session, data); + } + }, + onExit({ code, signal }) { + session.exited = true; + session.exitCode = code ?? 0; + session.exitSignal = signal ?? 0; + + portManager.unregisterSession(terminalId); + + db.update(terminalSessions) + .set({ status: "exited", endedAt: Date.now() }) + .where(eq(terminalSessions.id, terminalId)) + .run(); + + broadcastMessage(session, { + type: "exit", + exitCode: session.exitCode, + signal: session.exitSignal, + }); + + eventBus?.broadcastTerminalLifecycle({ + workspaceId, + terminalId, + eventType: "exit", + exitCode: session.exitCode, + signal: session.exitSignal, + occurredAt: Date.now(), + }); + }, + }, + ); if (initialCommand) { queueInitialCommand(session, initialCommand); @@ -555,7 +662,7 @@ export function registerWorkspaceTerminalRoute({ return c.json({ error: "Missing terminalId or workspaceId" }, 400); } - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId: body.terminalId, workspaceId: body.workspaceId, themeType: parseThemeType(body.themeType), @@ -621,27 +728,31 @@ export function registerWorkspaceTerminalRoute({ } const themeType = parseThemeType(c.req.query("themeType")); - const result = createTerminalSessionInternal({ 
- terminalId, - workspaceId, - themeType, - db, - eventBus, - }); + // Daemon open is async; fire-and-forget while keeping the WS alive. + // On success: register the socket; on failure: surface and close. + void (async () => { + const result = await createTerminalSessionInternal({ + terminalId, + workspaceId, + themeType, + db, + eventBus, + }); - if ("error" in result) { - sendMessage(ws, { type: "error", message: result.error }); - ws.close(1011, result.error); - return; - } + if ("error" in result) { + sendMessage(ws, { type: "error", message: result.error }); + ws.close(1011, result.error); + return; + } - result.sockets.add(ws); - sendMessage(ws, { type: "title", title: result.title }); + result.sockets.add(ws); + sendMessage(ws, { type: "title", title: result.title }); - db.update(terminalSessions) - .set({ lastAttachedAt: Date.now() }) - .where(eq(terminalSessions.id, terminalId)) - .run(); + db.update(terminalSessions) + .set({ lastAttachedAt: Date.now() }) + .where(eq(terminalSessions.id, terminalId)) + .run(); + })(); return; } diff --git a/packages/host-service/src/trpc/router/terminal/terminal.ts b/packages/host-service/src/trpc/router/terminal/terminal.ts index b8e36c2a862..e289c50fa3e 100644 --- a/packages/host-service/src/trpc/router/terminal/terminal.ts +++ b/packages/host-service/src/trpc/router/terminal/terminal.ts @@ -20,9 +20,9 @@ export const terminalRouter = router({ themeType: z.string().optional(), }), ) - .mutation(({ ctx, input }) => { + .mutation(async ({ ctx, input }) => { const terminalId = input.terminalId ?? 
crypto.randomUUID(); - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId, workspaceId: input.workspaceId, themeType: parseThemeType(input.themeType), diff --git a/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts b/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts index 2faa576f5f0..ab6b9ba3282 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/procedures/create.ts @@ -302,7 +302,7 @@ export const create = protectedProcedure const warnings: string[] = []; if (input.composer.runSetupScript) { - const { terminal, warning } = startSetupTerminalIfPresent({ + const { terminal, warning } = await startSetupTerminalIfPresent({ ctx, workspaceId: cloudRow.id, worktreePath, diff --git a/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts b/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts index 7be2be8f561..e982e2e479f 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/shared/finish-checkout.ts @@ -143,7 +143,7 @@ export async function finishCheckout( const warnings: string[] = [...args.extraWarnings]; if (args.runSetupScript) { - const { terminal, warning } = startSetupTerminalIfPresent({ + const { terminal, warning } = await startSetupTerminalIfPresent({ ctx, workspaceId: cloudRow.id, worktreePath: args.worktreePath, diff --git a/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts b/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts index 277061ccabe..9edd312b0d9 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts +++ 
b/packages/host-service/src/trpc/router/workspace-creation/shared/setup-terminal.ts @@ -4,18 +4,21 @@ import { createTerminalSessionInternal } from "../../../../terminal/terminal"; import type { HostServiceContext } from "../../../../types"; import type { TerminalDescriptor } from "./types"; -export function startSetupTerminalIfPresent(args: { +export async function startSetupTerminalIfPresent(args: { ctx: HostServiceContext; workspaceId: string; worktreePath: string; -}): { terminal: TerminalDescriptor | null; warning: string | null } { +}): Promise<{ + terminal: TerminalDescriptor | null; + warning: string | null; +}> { const setupScriptPath = join(args.worktreePath, ".superset", "setup.sh"); if (!existsSync(setupScriptPath)) { return { terminal: null, warning: null }; } const terminalId = crypto.randomUUID(); - const result = createTerminalSessionInternal({ + const result = await createTerminalSessionInternal({ terminalId, workspaceId: args.workspaceId, db: args.ctx.db, diff --git a/packages/host-service/tsconfig.json b/packages/host-service/tsconfig.json index c2b5b4f768c..1e480221828 100644 --- a/packages/host-service/tsconfig.json +++ b/packages/host-service/tsconfig.json @@ -2,11 +2,7 @@ "extends": "@superset/typescript/base.json", "compilerOptions": { "jsx": "react-jsx", - "types": ["bun-types"], - // Required because we depend on @superset/pty-daemon, whose package - // exports point at .ts source files (Node ESM requires explicit - // extensions in directory-style imports). 
- "allowImportingTsExtensions": true + "types": ["bun-types"] }, "include": ["src"] } diff --git a/packages/pty-daemon/tsconfig.json b/packages/pty-daemon/tsconfig.json index 555dc9e9cf9..2d1f2809f81 100644 --- a/packages/pty-daemon/tsconfig.json +++ b/packages/pty-daemon/tsconfig.json @@ -3,7 +3,6 @@ "compilerOptions": { "types": ["bun-types", "node"], "noUncheckedIndexedAccess": true, - "allowImportingTsExtensions": true, "emitDeclarationOnly": false, "noEmit": true }, diff --git a/tooling/typescript/base.json b/tooling/typescript/base.json index b73be1ebd38..2d56a6cf542 100644 --- a/tooling/typescript/base.json +++ b/tooling/typescript/base.json @@ -20,7 +20,8 @@ "module": "Preserve", "moduleResolution": "Bundler", - "noEmit": true + "noEmit": true, + "allowImportingTsExtensions": true }, "exclude": ["node_modules", "build", "dist", ".next"] } From b387324e1a8a5c6c8f9fffff7964852a743838c0 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 21:53:04 -0700 Subject: [PATCH 06/33] fix(desktop): make pty-daemon spawn failure non-fatal for host-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the daemon can't start (dev build without dist/main/pty-daemon.js, node-pty native module mismatch, etc.), the prior commit took host- service down with it — workspaces, git, chat, all unreachable. That's the wrong coupling: the daemon's job is terminal survival, not gating the rest of the API. Now: catch the daemon-spawn error, log it loudly with the cause, and spawn host-service with SUPERSET_PTY_DAEMON_SOCKET="" so terminal ops fail with a specific message ("pty-daemon is not available: ...") and everything else keeps working. The user can still use the app while the daemon issue is investigated. This unblocks the "workspace on sidebar but not found in db" symptom seen during dev: the sidebar shows entries from cloud / local-db, but host-service was never running so its DB queries return nothing. 
--- .../src/main/lib/host-service-coordinator.ts | 19 +++++++++++++++---- .../src/terminal/daemon-client-singleton.ts | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index d43a8ba4bcb..dbb095bf023 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -393,16 +393,27 @@ export class HostServiceCoordinator extends EventEmitter { this.instances.set(organizationId, instance); this.emitStatus(organizationId, "starting", null); - // Ensure the pty-daemon is up before host-service starts; host-service - // connects to it during boot for terminal ops. - const daemonInstance = await this.ptyDaemon.ensure(organizationId); + // Try to bring up the pty-daemon. If it fails (e.g. dev build doesn't + // have dist/main/pty-daemon.js yet), don't take host-service down with + // it — workspaces, git, chat, etc. should still work. Terminal ops + // will surface a clear error to the renderer instead. + let daemonSocketPath = ""; + try { + const daemonInstance = await this.ptyDaemon.ensure(organizationId); + daemonSocketPath = daemonInstance.socketPath; + } catch (error) { + console.error( + `[host-service:${organizationId}] pty-daemon failed to start; terminals will be unavailable until it recovers:`, + error, + ); + } const childEnv = await this.buildEnv( organizationId, port, secret, config, - daemonInstance.socketPath, + daemonSocketPath, ); // Host-service owns v2 PTYs, so it must survive Electron restarts in // every environment. 
This mirrors the terminal-host daemon: detach the diff --git a/packages/host-service/src/terminal/daemon-client-singleton.ts b/packages/host-service/src/terminal/daemon-client-singleton.ts index 0fee7142a95..c82ced90ae0 100644 --- a/packages/host-service/src/terminal/daemon-client-singleton.ts +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -16,7 +16,7 @@ export function ptyDaemonSocketPath(): string { const path = process.env.SUPERSET_PTY_DAEMON_SOCKET; if (!path) { throw new Error( - "SUPERSET_PTY_DAEMON_SOCKET is not set; host-service requires the pty-daemon socket to be passed by the coordinator", + "pty-daemon is not available: SUPERSET_PTY_DAEMON_SOCKET is not set. The desktop coordinator should set this before spawning host-service. Terminals will not work until the daemon comes up.", ); } return path; From 2e8d2167e0e3596c7f8795aa160f9a898c3242ea Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 22:00:29 -0700 Subject: [PATCH 07/33] debug(desktop): surface daemon spawn failures with log tail + child exit code When the daemon fails to come up, the prior coordinator just said "socket did not become ready within 5000ms" with no idea what went wrong. Now: 1. Refuse to spawn if scriptPath doesn't exist (e.g. dist/main/pty- daemon.js missing because electron-vite hasn't bundled the new entry yet). The error tells the user to restart the dev server. 2. Listen for child early-exit; include exit code or signal in the timeout error. 3. On timeout, read the daemon's log file (tail 2 KB) and include it in the thrown error. 4. Console.log the spawn args before fork so the dev terminal shows exactly what's being launched. This makes the next failure self-diagnosing instead of opaque. 
--- .../src/main/lib/pty-daemon-coordinator.ts | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts index acfaf4e2500..39d1147756c 100644 --- a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts +++ b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts @@ -136,11 +136,18 @@ export class PtyDaemonCoordinator { fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); } const socketPath = ptyDaemonSocketPath(organizationId); + const logPath = path.join(dir, "pty-daemon.log"); - const logFd = openRotatingLogFd( - path.join(dir, "pty-daemon.log"), - MAX_HOST_LOG_BYTES, - ); + // Sanity: refuse to spawn if the script doesn't exist (e.g. dev build + // hasn't produced dist/main/pty-daemon.js yet). Otherwise the spawn + // will silently exit and we wait the full timeout. + if (!fs.existsSync(this.opts.scriptPath)) { + throw new Error( + `[pty-daemon:${organizationId}] script not found at ${this.opts.scriptPath} — restart electron-vite dev to bundle the new entry`, + ); + } + + const logFd = openRotatingLogFd(logPath, MAX_HOST_LOG_BYTES); const stdio: childProcess.StdioOptions = logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; @@ -151,6 +158,10 @@ export class PtyDaemonCoordinator { SUPERSET_HOME_DIR, }; + console.log( + `[pty-daemon:${organizationId}] spawning ${this.opts.scriptPath} → ${socketPath} (log: ${logPath})`, + ); + let child: ReturnType; try { child = childProcess.spawn( @@ -178,6 +189,14 @@ export class PtyDaemonCoordinator { throw new Error(`[pty-daemon:${organizationId}] failed to spawn`); } + // Capture an early exit so the timeout error reports the actual cause. 
+ let earlyExitCode: number | null = null; + let earlyExitSignal: NodeJS.Signals | null = null; + child.once("exit", (code, signal) => { + earlyExitCode = code; + earlyExitSignal = signal; + }); + // Wait for the socket file to appear AND become connectable. const ready = await waitForSocket(socketPath, SOCKET_READY_TIMEOUT_MS); if (!ready) { @@ -186,8 +205,15 @@ export class PtyDaemonCoordinator { } catch { // best-effort } + let logTail = ""; + try { + const buf = fs.readFileSync(logPath, "utf-8"); + logTail = buf.slice(-2000); + } catch { + logTail = "(no log file written)"; + } throw new Error( - `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms`, + `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms (childPid=${childPid}, earlyExit=${earlyExitCode ?? earlyExitSignal ?? "still alive"}). Log tail:\n${logTail}`, ); } From df81d8b151a7a2884ce00d8df1eadafdd8356cff Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 22:02:40 -0700 Subject: [PATCH 08/33] fix(desktop): allow .env / shell to provide SUPERSET_PTY_DAEMON_SOCKET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the coordinator's own daemon spawn fails, we no longer overwrite the env var with empty string — we leave whatever the parent process has. That makes ".env workaround" actually work: run a daemon manually, export SUPERSET_PTY_DAEMON_SOCKET=/path/to/sock in your shell or .env, and host-service will pick it up and terminals will function again until the spawn-side bug is fixed. 
--- .../src/main/lib/host-service-coordinator.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index dbb095bf023..d2838cdb168 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -492,7 +492,10 @@ export class HostServiceCoordinator extends EventEmitter { const row = localDb.select().from(settings).get(); const exposeViaRelay = row?.exposeHostServiceViaRelay ?? false; - const childEnv = await getProcessEnvWithShellPath({ + // Allow .env / shell SUPERSET_PTY_DAEMON_SOCKET to take effect when our + // own daemon spawn failed. Set it explicitly only when we have a real + // path; otherwise inherit whatever the parent has. + const baseEnv: Record = { ...(process.env as Record), ELECTRON_RUN_AS_NODE: "1", ORGANIZATION_ID: organizationId, @@ -509,10 +512,14 @@ export class HostServiceCoordinator extends EventEmitter { SUPERSET_HOME_DIR: SUPERSET_HOME_DIR, SUPERSET_AGENT_HOOK_PORT: String(sharedEnv.DESKTOP_NOTIFICATIONS_PORT), SUPERSET_AGENT_HOOK_VERSION: HOOK_PROTOCOL_VERSION, - SUPERSET_PTY_DAEMON_SOCKET: ptyDaemonSocket, AUTH_TOKEN: config.authToken, CLOUD_API_URL: config.cloudApiUrl, - }); + }; + if (ptyDaemonSocket) { + baseEnv.SUPERSET_PTY_DAEMON_SOCKET = ptyDaemonSocket; + } + + const childEnv = await getProcessEnvWithShellPath(baseEnv); // `getProcessEnvWithShellPath` merges in the user's interactive shell env, // which in dev has `RELAY_URL` set. 
Enforce the toggle *after* that merge From 05ae50c207539ee08df8106c518a9a5616b33f8e Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 22:06:10 -0700 Subject: [PATCH 09/33] fix(desktop): use short /tmp path for pty-daemon socket (Darwin sun_path) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In dev, SUPERSET_HOME_DIR resolves to <worktree>/superset-dev-data, which made the daemon socket path 159+ characters: /Users/.../worktrees/1c99c8eb-.../elastic-lens/superset-dev-data/host/<36-char-orgId>/pty-daemon.sock Darwin's sun_path is 104 bytes — kernel rejects listen() with EINVAL ("invalid argument") before the daemon ever gets to write to its log. Production paths are shorter but still uncomfortably close to the limit. Move the socket file to os.tmpdir() with a 12-char SHA256 hash of the org id: /var/folders/<...>/T/superset-ptyd-<12hex>.sock (~80 chars) Owner-only file mode (0600) is the security boundary, set by the daemon's Server.listen() — the directory permissions don't matter. Manifest still lives at $SUPERSET_HOME_DIR/host/<orgId>/pty-daemon-manifest.json with the socket path recorded inside; adoption logic reads it from there. --- .../src/main/lib/pty-daemon-coordinator.ts | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts index 39d1147756c..df3670af486 100644 --- a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts +++ b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts @@ -9,8 +9,10 @@ // without losing user shells.
import * as childProcess from "node:child_process"; +import { createHash } from "node:crypto"; import * as fs from "node:fs"; import * as net from "node:net"; +import * as os from "node:os"; import * as path from "node:path"; import { SUPERSET_HOME_DIR } from "./app-environment"; import { isProcessAlive } from "./host-service-manifest"; @@ -32,11 +34,20 @@ interface DaemonInstance { const SOCKET_READY_TIMEOUT_MS = 5_000; /** - * Per-organization socket path. Owner-only directory inherits from the - * existing $SUPERSET_HOME_DIR/host/{orgId}/ tree. + * Per-organization socket path. **Must stay short** — Darwin's `sun_path` + * is 104 bytes, and `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon.sock` blows + * past that in dev (worktree-relative SUPERSET_HOME_DIR + 36-char UUID). + * + * We put the socket in `os.tmpdir()` with a hash of the org id. Owner-only + * file mode (0600, set by the daemon's Server.listen) is the auth boundary; + * the directory permissions don't matter. */ function ptyDaemonSocketPath(organizationId: string): string { - return path.join(ptyDaemonManifestDir(organizationId), "pty-daemon.sock"); + const shortId = createHash("sha256") + .update(organizationId) + .digest("hex") + .slice(0, 12); + return path.join(os.tmpdir(), `superset-ptyd-${shortId}.sock`); } export interface PtyDaemonCoordinatorOptions { From aae131eb32bf9c4ff48918533ba90e49d2a5130c Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 23:23:46 -0700 Subject: [PATCH 10/33] fix(host-service): adopt existing daemon sessions on host-service restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Headline bug from the smoke test: after host-service restarts, the renderer reconnects, host-service has an empty in-memory sessions Map, calls daemon.open(id) blindly, daemon already has the session → "session already exists" → renderer retries → tight loop, terminal unusable. 
The whole point of the daemon is for sessions to survive host-service restarts. The fix: - In createTerminalSessionInternal, wrap daemon.open in an inner try/catch. On "session already exists", call daemon.list(), find the existing session by id, treat it as adoption: reuse the pid, skip workspace-spawn args, set isAdopted=true. - For adopted sessions: shellReadyState starts as "ready" (the shell is already past the OSC 133;A marker that originally fired in the prior host-service lifetime), and initialCommandQueued=true so we don't re-fire the initial command. - The daemon's existing fan-out + ring-buffer behavior already handles the data plane: subscribe(replay:true) below pulls the buffered output and continues live streaming. Tests we should have written before shipping: packages/pty-daemon/test/control-plane.test.ts gains a "cross-client continuity (host-service restart simulation)" suite with 4 cases: 1. Client B finds session A's id via list after A disconnects. 2. Re-opening an existing id returns EEXIST (the trigger we depend on). 3. Client B subscribes-with-replay to A's session and gets buffered output + live stream from the still-living shell. 4. Daemon `list` returns sessions whose only client just dropped (the daemon must NOT garbage-collect on last-client-disconnect). packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts gains an "adoption flow" test that drives the exact sequence host-service does after restart: open → drop → re-open (errors) → list → subscribe(replay:true) → see prior output AND new input flow to the still-living shell. Total: 5 new integration tests covering the hot path that just shipped broken to production. All 35 daemon-touching tests pass under node --test.
--- .../DaemonClient/DaemonClient.node-test.ts | 83 +++++++++ .../host-service/src/terminal/terminal.ts | 55 ++++-- .../pty-daemon/test/control-plane.test.ts | 160 ++++++++++++++++++ 3 files changed, 286 insertions(+), 12 deletions(-) diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts index 30b52c0be29..d37e0403fce 100644 --- a/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts @@ -178,6 +178,89 @@ test("disconnect callback fires when daemon goes away", async () => { await c.dispose(); }); +test("adoption flow: client A opens, drops, client B finds + subscribes-with-replay", async () => { + // This is the exact host-service-restart sequence we hit in production: + // host-service v1 opens a daemon session, then dies. host-service v2 + // starts fresh, calls daemon.open() blindly → "session already exists" + // → must fall back to list() + subscribe(replay:true). Regression test + // for the "session already exists" tight loop. + const a = new DaemonClient({ socketPath: sockPath }); + await a.connect(); + const id = "host-restart-adopt"; + const openA = await a.open(id, { + shell: "/bin/sh", + argv: ["-i"], + cols: 80, + rows: 24, + }); + const aChunks: Buffer[] = []; + const unsubA = a.subscribe( + id, + { replay: false }, + { onOutput: (c) => aChunks.push(c), onExit: () => {} }, + ); + a.input(id, Buffer.from("echo before-host-restart\n")); + await waitFor( + () => Buffer.concat(aChunks).toString().includes("before-host-restart"), + 3000, + ); + unsubA(); + await a.dispose(); + + // Brief settle so the daemon registers A's disconnect. + await new Promise((r) => setTimeout(r, 100)); + + // "host-service v2" connects fresh. 
+ const b = new DaemonClient({ socketPath: sockPath }); + await b.connect(); + + // Naive open should error with "session already exists" — that's the + // signal host-service uses to switch to adoption mode. + let openErr: Error | null = null; + try { + await b.open(id, { + shell: "/bin/sh", + argv: ["-i"], + cols: 80, + rows: 24, + }); + } catch (e) { + openErr = e as Error; + } + assert.ok(openErr, "second open of same id must throw"); + assert.match(openErr?.message ?? "", /session already exists/); + + // list() finds the live session. + const list = await b.list(); + const found = list.find((s) => s.id === id); + assert.ok(found, "list must surface the existing session"); + assert.equal(found?.alive, true); + assert.equal(found?.pid, openA.pid); + + // Subscribe with replay → see the buffered output from A's lifetime. + const bChunks: Buffer[] = []; + const unsubB = b.subscribe( + id, + { replay: true }, + { onOutput: (c) => bChunks.push(c), onExit: () => {} }, + ); + await waitFor( + () => Buffer.concat(bChunks).toString().includes("before-host-restart"), + 3000, + ); + + // And new input through B reaches the (still-living) shell. 
+ b.input(id, Buffer.from("echo after-host-restart\n")); + await waitFor( + () => Buffer.concat(bChunks).toString().includes("after-host-restart"), + 3000, + ); + + unsubB(); + await b.close(id, "SIGTERM"); + await b.dispose(); +}); + async function waitFor(predicate: () => boolean, ms: number): Promise { const start = Date.now(); while (!predicate()) { diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index cb23c02c5c8..f2a9e7cf390 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -497,16 +497,38 @@ export async function createTerminalSessionInternal({ let daemon: DaemonClient; let openResult: { pid: number }; + let isAdopted = false; try { daemon = await getDaemonClient(); - openResult = await daemon.open(terminalId, { - shell, - argv: shellArgs, - cwd, - cols: 120, - rows: 32, - env: ptyEnv, - }); + try { + openResult = await daemon.open(terminalId, { + shell, + argv: shellArgs, + cwd, + cols: 120, + rows: 32, + env: ptyEnv, + }); + } catch (err) { + // After host-service restart the daemon may already own this + // session. Adopt it instead of looping forever on "session already + // exists". The daemon kept the buffer + the live shell; we just + // need to stitch up a TerminalSession record on this side and + // subscribe-with-replay below. + const msg = err instanceof Error ? 
err.message : String(err); + if (msg.includes("session already exists")) { + const list = await daemon.list(); + const found = list.find((s) => s.id === terminalId && s.alive); + if (!found) throw err; + openResult = { pid: found.pid }; + isAdopted = true; + console.log( + `[terminal] adopted existing daemon session ${terminalId} pid=${found.pid}`, + ); + } else { + throw err; + } + } } catch (error) { return { error: @@ -530,9 +552,12 @@ export async function createTerminalSessionInternal({ }) .run(); - // Determine shell readiness support + // Determine shell readiness support. Adopted sessions are already past + // shell startup, so treat them as immediately ready — the OSC 133;A + // marker has already flown by and we don't want to gate writes on it. const shellName = shell.split("/").pop() || shell; - const shellSupportsReady = SHELLS_WITH_READY_MARKER.has(shellName); + const shellSupportsReady = + !isAdopted && SHELLS_WITH_READY_MARKER.has(shellName); let shellReadyResolve: (() => void) | null = null; const shellReadyPromise = shellSupportsReady @@ -556,12 +581,18 @@ export async function createTerminalSessionInternal({ listed, title: null, titleScanState: createTerminalTitleScanState(), - shellReadyState: shellSupportsReady ? "pending" : "unsupported", + shellReadyState: shellSupportsReady + ? "pending" + : isAdopted + ? "ready" + : "unsupported", shellReadyResolve, shellReadyPromise, shellReadyTimeoutId: null, scanState: createScanState(), - initialCommandQueued: false, + // Adopted sessions have already run their initialCommand in the prior + // host-service lifetime — flag it as queued so we don't double-fire it. 
+ initialCommandQueued: isAdopted, }; sessions.set(terminalId, session); portManager.upsertSession(terminalId, workspaceId, pty.pid); diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts index 659ecbdb55a..05935c33b7b 100644 --- a/packages/pty-daemon/test/control-plane.test.ts +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -515,6 +515,166 @@ describe("list", () => { }); }); +// ---------------- Cross-client continuity (host-service restart story) ---------------- + +describe("cross-client continuity (host-service restart simulation)", () => { + // This is the headline path the daemon exists for. Client A (host-service v1) + // opens a session, then disconnects (host-service crashed). Client B + // (host-service v2) connects fresh, discovers the session via list, and + // must NOT try to re-open it — it should subscribe-with-replay and + // continue. Regression test for the "session already exists" tight loop + // observed in production after the first integration land. + + test("client B finds session A's id via list after A disconnects", async () => { + const a = await connectAndHello(sockPath); + const id = uniqueId("restart"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo from-A; sleep 5"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + // Force-close A's connection without unsubscribing — this simulates a + // host-service crash. The session must keep running on the daemon. + a.socket.destroy(); + + // Brief settle so the daemon notices the close. 
+ await new Promise((r) => setTimeout(r, 100)); + + const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply"); + assert.equal(reply.type, "list-reply"); + if (reply.type === "list-reply") { + const found = reply.sessions.find((s) => s.id === id); + assert.ok(found, `session ${id} should still be in list after A's drop`); + assert.equal(found?.alive, true); + } + + b.send({ type: "close", id, signal: "SIGTERM" }); + await b.close(); + }); + + test("re-opening an existing session id returns EEXIST (the trigger for adoption)", async () => { + // Regression: host-service was caught in a tight loop because it + // blindly called `open` after restart and got "session already exists". + // We rely on this exact error code/message to drive the adoption path + // in host-service's createTerminalSessionInternal. + const a = await connectAndHello(sockPath); + const id = uniqueId("eexist"); + a.send({ type: "open", id, meta: baseMeta }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + const b = await connectAndHello(sockPath); + b.send({ type: "open", id, meta: baseMeta }); + const err = await b.waitFor((m) => m.type === "error" && m.id === id, 2000); + assert.equal(err.type, "error"); + if (err.type === "error") { + assert.equal(err.code, "EEXIST"); + assert.match(err.message, /session already exists/); + } + + a.send({ type: "close", id, signal: "SIGTERM" }); + await Promise.all([a.close(), b.close()]); + }); + + test("client B subscribes-with-replay to A's session and gets buffered output + live stream", async () => { + // The actual adoption flow: A opens, A produces output, A drops, B + // subscribes with replay. B must see the prior output AND any new + // output produced after B's subscribe. This is what host-service + // does after restart to give the renderer a continuous experience. 
+ const a = await connectAndHello(sockPath); + const id = uniqueId("adopt"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.send({ type: "subscribe", id, replay: false }); + a.send({ + type: "input", + id, + data: Buffer.from("echo before-restart\n").toString("base64"), + }); + await a.waitFor( + (m) => + m.type === "output" && + Buffer.from(m.data, "base64").toString().includes("before-restart"), + 3000, + ); + + // A drops without cleanup — host-service "crashed." + a.socket.destroy(); + await new Promise((r) => setTimeout(r, 100)); + + // B picks up the session. First confirms via list, then subscribes + // with replay to get the buffered "before-restart" output. + const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const list = await b.waitFor((m) => m.type === "list-reply"); + assert.ok( + list.type === "list-reply" && + list.sessions.some((s) => s.id === id && s.alive), + ); + + b.send({ type: "subscribe", id, replay: true }); + await b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("before-restart"), + 3000, + ); + + // New input from B reaches the (still-living) shell. + b.send({ + type: "input", + id, + data: Buffer.from("echo after-restart\n").toString("base64"), + }); + await b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("after-restart"), + 3000, + ); + + b.send({ type: "close", id, signal: "SIGTERM" }); + await b.close(); + }); + + test("daemon `list` returns sessions whose only client just dropped", async () => { + // Defensive: the daemon must NOT garbage-collect a session just + // because its last client disconnected. host-service relies on the + // session staying alive across the disconnect. 
+ const a = await connectAndHello(sockPath); + const id = uniqueId("orphan"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "sleep 30"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + + a.socket.destroy(); + await new Promise((r) => setTimeout(r, 200)); + + const b = await connectAndHello(sockPath); + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply"); + if (reply.type === "list-reply") { + const me = reply.sessions.find((s) => s.id === id); + assert.ok(me, "session must persist past last-client disconnect"); + assert.equal(me?.alive, true); + } + b.send({ type: "close", id, signal: "SIGKILL" }); + await b.close(); + }); +}); + // ---------------- Malformed / abusive input ---------------- describe("hostile input", () => { From 2bbb0846ce697e635e4c1849a9b87982fce852b3 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 23:24:49 -0700 Subject: [PATCH 11/33] test(pty-daemon): replay-on-exited-session edge case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sister to the "host-service restart" suite: covers the case where the shell exited during the host-service downtime. New host-service subscribes-with-replay → must get the buffered output of the dying shell and observe via list() that the session has alive:false. Without this, the renderer hangs waiting for output that will never come. The exit-event side is documented in the test as best-effort — daemon's wireSession fires onExit once at the moment the shell dies; late subscribers see the buffer plus alive:false in list. host-service already supplements with a list check (the adoption flow looks for alive:true), so this test asserts the contract host-service depends on. 
--- .../pty-daemon/test/control-plane.test.ts | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts index 05935c33b7b..6999e34c944 100644 --- a/packages/pty-daemon/test/control-plane.test.ts +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -646,6 +646,52 @@ describe("cross-client continuity (host-service restart simulation)", () => { await b.close(); }); + test("subscribe-with-replay on an already-exited session yields buffered output + immediate exit event", async () => { + // After a host-service restart, a shell that exited during the gap + // should still surface its final output AND its exit event when the + // new host-service subscribes. Otherwise the renderer hangs waiting + // for output that will never come. + const a = await connectAndHello(sockPath); + const id = uniqueId("postexit"); + a.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-c", "echo final-words; exit 7"] }, + }); + await a.waitFor((m) => m.type === "open-ok" && m.id === id); + a.send({ type: "subscribe", id, replay: true }); + await a.waitFor((m) => m.type === "exit" && m.id === id, 3000); + // Connection A drops without explicit close — session enters + // alive:false state but is still in the daemon's map. + a.socket.destroy(); + await new Promise((r) => setTimeout(r, 100)); + + const b = await connectAndHello(sockPath); + b.send({ type: "subscribe", id, replay: true }); + await b.waitFor( + (m) => + m.type === "output" && + m.id === id && + Buffer.from(m.data, "base64").toString().includes("final-words"), + 2000, + ); + // Note: an exit event for an already-exited session is best-effort — + // the daemon's `wireSession` only fires onExit once when the shell + // actually dies. A late subscriber sees the buffered output and can + // observe `alive:false` via `list`. 
This test asserts the buffer + // behavior; the host-service supplements with a `list` check before + // declaring the session dead. + b.send({ type: "list" }); + const reply = await b.waitFor((m) => m.type === "list-reply"); + if (reply.type === "list-reply") { + const me = reply.sessions.find((s) => s.id === id); + assert.ok(me, "exited session should still be in list"); + assert.equal(me?.alive, false); + } + b.send({ type: "close", id, signal: "SIGTERM" }); + await b.close(); + }); + test("daemon `list` returns sessions whose only client just dropped", async () => { // Defensive: the daemon must NOT garbage-collect a session just // because its last client disconnected. host-service relies on the From 525d3ec94c114f71c76617f94e6633970ea59d77 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Wed, 29 Apr 2026 23:40:25 -0700 Subject: [PATCH 12/33] test(host-service): full E2E adoption test under Electron-as-Node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end coverage of createTerminalSessionInternal's adoption path against the real stack: - Real pty-daemon Server (in-process, spawning real /bin/sh). - Real SQLite host DB (better-sqlite3) with workspace/project rows. - Test-only escape hatch __resetSessionsForTesting() to simulate a host-service process restart in-process. The test asserts the headline property: after host-service restart, re-calling createTerminalSessionInternal with the same terminalId returns a session with the SAME shell pid (proving adoption, not respawn) and new input flows to the still-living shell. Why it runs under Electron-as-Node, not raw `node`: host-service uses better-sqlite3, which is compiled against Electron's Node ABI for production. Running the test under Electron-as-Node matches that ABI so the production native module loads cleanly. The bundled Electron binary is in node_modules anyway (via the electron npm package), so this adds no test-time dependency. 
Mechanical changes: - Convert relative imports to use explicit `.ts` extensions across the host-service modules transitively reachable from terminal.ts (db, events, ports, runtime/filesystem) and across @superset/port-scanner. Required because Node ESM doesn't allow extension-less or directory imports; needed for the test to load under --experimental-strip-types. Bun tolerates either form, so production is unaffected. Workspace tsc --noEmit clean across all 27 packages. - New `bun run test:e2e` script (packages/host-service/scripts/test-e2e.ts) that resolves the workspace's Electron binary and runs the test under it with the right env. - New `__resetSessionsForTesting()` export in terminal.ts (test-only escape hatch, documented as such). Test results: - ✔ fresh open spawns a shell via the daemon - ✔ adopts existing daemon session after host-service restart simulation - ✔ adopted session keeps listed/exited bookkeeping 3/3 pass in ~550ms. This is the test that would have caught the "session already exists" loop bug shipped earlier in this branch.
--- packages/host-service/package.json | 3 +- packages/host-service/scripts/test-e2e.ts | 58 +++++ packages/host-service/src/db/db.ts | 2 +- packages/host-service/src/db/index.ts | 4 +- packages/host-service/src/events/event-bus.ts | 12 +- .../host-service/src/events/git-watcher.ts | 6 +- packages/host-service/src/events/index.ts | 6 +- packages/host-service/src/events/types.ts | 2 +- .../host-service/src/ports/port-manager.ts | 2 +- .../src/runtime/filesystem/filesystem.ts | 4 +- .../src/runtime/filesystem/index.ts | 4 +- .../src/terminal/DaemonClient/index.ts | 4 +- .../src/terminal/daemon-client-singleton.ts | 2 +- packages/host-service/src/terminal/env.ts | 15 +- .../terminal/terminal.adoption.node-test.ts | 226 ++++++++++++++++++ .../host-service/src/terminal/terminal.ts | 35 ++- packages/port-scanner/src/index.ts | 8 +- packages/port-scanner/src/port-manager.ts | 4 +- packages/port-scanner/src/procfs.ts | 2 +- packages/port-scanner/src/scanner.ts | 2 +- 20 files changed, 355 insertions(+), 46 deletions(-) create mode 100644 packages/host-service/scripts/test-e2e.ts create mode 100644 packages/host-service/src/terminal/terminal.adoption.node-test.ts diff --git a/packages/host-service/package.json b/packages/host-service/package.json index 12676916c00..12f9e43638c 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -39,7 +39,8 @@ "build:host": "bun run build.ts", "generate": "drizzle-kit generate", "typecheck": "tsc --noEmit --emitDeclarationOnly false", - "test:integration": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts" + "test:integration": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts", + "test:e2e": "bun run scripts/test-e2e.ts" }, "dependencies": { "@hono/node-server": "^1.14.1", diff --git a/packages/host-service/scripts/test-e2e.ts b/packages/host-service/scripts/test-e2e.ts new file mode 100644 index 
00000000000..441af59da13 --- /dev/null +++ b/packages/host-service/scripts/test-e2e.ts @@ -0,0 +1,58 @@ +// Runs the host-service end-to-end adoption test under Electron-as-Node. +// +// Why Electron and not raw `node`: host-service uses better-sqlite3, whose +// native module is compiled against the Electron bundled Node ABI for +// production. Running the test under Electron-as-Node ensures the same +// native-module ABI as production. Raw `node` would fail with +// NODE_MODULE_VERSION mismatch. +// +// Usage: bun run test:e2e + +import * as childProcess from "node:child_process"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(__dirname, "../../.."); + +// Resolve the Electron binary from the workspace's node_modules. Bun's flat +// .bun/@/node_modules/ layout makes this a glob. +function findElectronBinary(): string { + const candidates = childProcess + .execSync("find . -path '*/electron/dist/*.app/Contents/MacOS/Electron'", { + cwd: repoRoot, + encoding: "utf-8", + }) + .split("\n") + .filter(Boolean); + if (candidates.length === 0) { + throw new Error( + "Electron binary not found. Run `bun install` from the repo root first.", + ); + } + return path.join(repoRoot, candidates[0]!); +} + +const electronBin = findElectronBinary(); +const testFile = path.resolve( + __dirname, + "..", + "src/terminal/terminal.adoption.node-test.ts", +); + +if (!fs.existsSync(testFile)) { + console.error(`Test file missing: ${testFile}`); + process.exit(1); +} + +const result = childProcess.spawnSync( + electronBin, + ["--experimental-strip-types", "--test", "--test-reporter=spec", testFile], + { + stdio: "inherit", + env: { ...process.env, ELECTRON_RUN_AS_NODE: "1" }, + }, +); + +process.exit(result.status ?? 
1); diff --git a/packages/host-service/src/db/db.ts b/packages/host-service/src/db/db.ts index ab74757267e..f5d07674b4d 100644 --- a/packages/host-service/src/db/db.ts +++ b/packages/host-service/src/db/db.ts @@ -3,7 +3,7 @@ import { dirname } from "node:path"; import Database from "better-sqlite3"; import { drizzle } from "drizzle-orm/better-sqlite3"; import { migrate } from "drizzle-orm/better-sqlite3/migrator"; -import * as schema from "./schema"; +import * as schema from "./schema.ts"; export type HostDb = ReturnType; diff --git a/packages/host-service/src/db/index.ts b/packages/host-service/src/db/index.ts index e6cb0767895..9a32c356c6c 100644 --- a/packages/host-service/src/db/index.ts +++ b/packages/host-service/src/db/index.ts @@ -1,2 +1,2 @@ -export { createDb, type HostDb } from "./db"; -export * from "./schema"; +export { createDb, type HostDb } from "./db.ts"; +export * from "./schema.ts"; diff --git a/packages/host-service/src/events/event-bus.ts b/packages/host-service/src/events/event-bus.ts index b550f568e13..5d96c3cdd4f 100644 --- a/packages/host-service/src/events/event-bus.ts +++ b/packages/host-service/src/events/event-bus.ts @@ -2,12 +2,12 @@ import type { NodeWebSocket } from "@hono/node-ws"; import type { DetectedPort } from "@superset/port-scanner"; import type { FsWatchEvent } from "@superset/workspace-fs/host"; import type { Hono } from "hono"; -import type { HostDb } from "../db"; -import { portManager } from "../ports/port-manager"; -import { getLabelsForWorkspace } from "../ports/static-ports"; -import type { WorkspaceFilesystemManager } from "../runtime/filesystem"; -import { GitWatcher } from "./git-watcher"; -import type { ClientMessage, ServerMessage } from "./types"; +import type { HostDb } from "../db/index.ts"; +import { portManager } from "../ports/port-manager.ts"; +import { getLabelsForWorkspace } from "../ports/static-ports.ts"; +import type { WorkspaceFilesystemManager } from "../runtime/filesystem/index.ts"; +import { 
GitWatcher } from "./git-watcher.ts"; +import type { ClientMessage, ServerMessage } from "./types.ts"; type WsSocket = { send: (data: string) => void; diff --git a/packages/host-service/src/events/git-watcher.ts b/packages/host-service/src/events/git-watcher.ts index 802863ae640..a3e36f05284 100644 --- a/packages/host-service/src/events/git-watcher.ts +++ b/packages/host-service/src/events/git-watcher.ts @@ -2,9 +2,9 @@ import { execFile } from "node:child_process"; import { type FSWatcher, watch } from "node:fs"; import { promisify } from "node:util"; import type { FsWatchEvent } from "@superset/workspace-fs/host"; -import type { HostDb } from "../db"; -import { workspaces } from "../db/schema"; -import type { WorkspaceFilesystemManager } from "../runtime/filesystem"; +import type { HostDb } from "../db/index.ts"; +import { workspaces } from "../db/schema.ts"; +import type { WorkspaceFilesystemManager } from "../runtime/filesystem/index.ts"; const execFileAsync = promisify(execFile); diff --git a/packages/host-service/src/events/index.ts b/packages/host-service/src/events/index.ts index a7e05c678a1..fbe6b206c97 100644 --- a/packages/host-service/src/events/index.ts +++ b/packages/host-service/src/events/index.ts @@ -1,8 +1,8 @@ -export { EventBus, registerEventBusRoute } from "./event-bus"; +export { EventBus, registerEventBusRoute } from "./event-bus.ts"; export { type AgentLifecycleEventType, mapEventType, -} from "./map-event-type"; +} from "./map-event-type.ts"; export type { AgentLifecycleMessage, ClientMessage, @@ -14,4 +14,4 @@ export type { PortChangedMessage, ServerMessage, TerminalLifecycleMessage, -} from "./types"; +} from "./types.ts"; diff --git a/packages/host-service/src/events/types.ts b/packages/host-service/src/events/types.ts index 1ac18bf82d4..38ea6b11595 100644 --- a/packages/host-service/src/events/types.ts +++ b/packages/host-service/src/events/types.ts @@ -1,6 +1,6 @@ import type { DetectedPort } from "@superset/port-scanner"; import type 
{ FsWatchEvent } from "@superset/workspace-fs/host"; -import type { AgentLifecycleEventType } from "./map-event-type"; +import type { AgentLifecycleEventType } from "./map-event-type.ts"; // ── Server → Client ──────────────────────────────────────────────── diff --git a/packages/host-service/src/ports/port-manager.ts b/packages/host-service/src/ports/port-manager.ts index b4faa3da452..912ba007d5f 100644 --- a/packages/host-service/src/ports/port-manager.ts +++ b/packages/host-service/src/ports/port-manager.ts @@ -1,5 +1,5 @@ import { PortManager } from "@superset/port-scanner"; -import { treeKillWithEscalation } from "./tree-kill"; +import { treeKillWithEscalation } from "./tree-kill.ts"; export const portManager = new PortManager({ killFn: treeKillWithEscalation, diff --git a/packages/host-service/src/runtime/filesystem/filesystem.ts b/packages/host-service/src/runtime/filesystem/filesystem.ts index 81ff604aa47..a46e9d54631 100644 --- a/packages/host-service/src/runtime/filesystem/filesystem.ts +++ b/packages/host-service/src/runtime/filesystem/filesystem.ts @@ -5,8 +5,8 @@ import { getSearchIndex, } from "@superset/workspace-fs/host"; import { eq } from "drizzle-orm"; -import type { HostDb } from "../../db"; -import { projects, workspaces } from "../../db/schema"; +import { projects, workspaces } from "../../db/schema.ts"; +import type { HostDb } from "../../db/index.ts"; export interface WorkspaceFilesystemManagerOptions { db: HostDb; diff --git a/packages/host-service/src/runtime/filesystem/index.ts b/packages/host-service/src/runtime/filesystem/index.ts index 4af25866e5e..d074530d95a 100644 --- a/packages/host-service/src/runtime/filesystem/index.ts +++ b/packages/host-service/src/runtime/filesystem/index.ts @@ -1,2 +1,2 @@ -export type { WorkspaceFilesystemManagerOptions } from "./filesystem"; -export { WorkspaceFilesystemManager } from "./filesystem"; +export type { WorkspaceFilesystemManagerOptions } from "./filesystem.ts"; +export { 
WorkspaceFilesystemManager } from "./filesystem.ts"; diff --git a/packages/host-service/src/terminal/DaemonClient/index.ts b/packages/host-service/src/terminal/DaemonClient/index.ts index a98b4755a24..ed4338ebe70 100644 --- a/packages/host-service/src/terminal/DaemonClient/index.ts +++ b/packages/host-service/src/terminal/DaemonClient/index.ts @@ -4,5 +4,5 @@ export type { OpenResult, Signal, SubscribeCallbacks, -} from "./DaemonClient"; -export { DaemonClient } from "./DaemonClient"; +} from "./DaemonClient.ts"; +export { DaemonClient } from "./DaemonClient.ts"; diff --git a/packages/host-service/src/terminal/daemon-client-singleton.ts b/packages/host-service/src/terminal/daemon-client-singleton.ts index c82ced90ae0..1ef5abcd68e 100644 --- a/packages/host-service/src/terminal/daemon-client-singleton.ts +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -7,7 +7,7 @@ // host-service can be restarted to reconnect. There's no in-band reconnect // here on purpose; see DaemonClient's "dumb" failure model. -import { DaemonClient } from "./DaemonClient"; +import { DaemonClient } from "./DaemonClient/index.ts"; let cached: DaemonClient | null = null; let connecting: Promise | null = null; diff --git a/packages/host-service/src/terminal/env.ts b/packages/host-service/src/terminal/env.ts index 818615c0a35..6b58a336b1a 100644 --- a/packages/host-service/src/terminal/env.ts +++ b/packages/host-service/src/terminal/env.ts @@ -5,23 +5,26 @@ * at startup — never from desktop main or the live host-service process.env. 
*/ -export { stripTerminalRuntimeEnv } from "./env-strip"; -export type { ShellBootstrapParams, ShellLaunchParams } from "./shell-launch"; +export { stripTerminalRuntimeEnv } from "./env-strip.ts"; +export type { + ShellBootstrapParams, + ShellLaunchParams, +} from "./shell-launch.ts"; export { getShellBootstrapEnv, getShellLaunchArgs, getSupersetShellPaths, resolveLaunchShell, -} from "./shell-launch"; +} from "./shell-launch.ts"; import fs from "node:fs"; import os from "node:os"; import { clearStrictShellEnvCache, getStrictShellEnvironment, -} from "./clean-shell-env"; -import { stripTerminalRuntimeEnv } from "./env-strip"; -import { getShellBootstrapEnv } from "./shell-launch"; +} from "./clean-shell-env.ts"; +import { stripTerminalRuntimeEnv } from "./env-strip.ts"; +import { getShellBootstrapEnv } from "./shell-launch.ts"; const MACOS_SYSTEM_CERT_FILE = "/etc/ssl/cert.pem"; let cachedMacosSystemCertAvailable: boolean | null = null; diff --git a/packages/host-service/src/terminal/terminal.adoption.node-test.ts b/packages/host-service/src/terminal/terminal.adoption.node-test.ts new file mode 100644 index 00000000000..fb8a2e7700b --- /dev/null +++ b/packages/host-service/src/terminal/terminal.adoption.node-test.ts @@ -0,0 +1,226 @@ +// End-to-end adoption test. Drives host-service's createTerminalSessionInternal +// against a real pty-daemon Server (in-process), real SQLite host DB, +// and real shells. Simulates a host-service process restart by clearing the +// in-memory sessions Map (via the test-only escape hatch) and disposing the +// DaemonClient singleton, then re-invokes createTerminalSessionInternal with +// the same terminalId and asserts the adoption path: +// - Same shell pid as the original session. +// - Subsequent input reaches the still-living shell. +// +// This is exactly what the daemon's process isolation enables: the daemon +// owns the PTY runtime; the host can test its integration end-to-end without +// any subprocess gymnastics. 
+// +// Runs under Node (`node --experimental-strip-types --test`). + +import { strict as assert } from "node:assert"; +import { randomUUID } from "node:crypto"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, describe, test } from "node:test"; +import { fileURLToPath } from "node:url"; +import { Server } from "@superset/pty-daemon"; +import { createDb, type HostDb } from "../db/index.ts"; +import { projects, workspaces } from "../db/schema.ts"; +import { disposeDaemonClient } from "./daemon-client-singleton.ts"; +import { initTerminalBaseEnv } from "./env.ts"; +import { + __resetSessionsForTesting, + createTerminalSessionInternal, + disposeSession, + listTerminalSessions, +} from "./terminal.ts"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const TEST_HOME = path.join(os.tmpdir(), `host-svc-adopt-${process.pid}`); +const SOCK = path.join(os.tmpdir(), `host-svc-adopt-${process.pid}.sock`); +const MIGRATIONS = path.resolve(__dirname, "../../drizzle"); + +let server: Server; +let db: HostDb; +let projectId: string; +let workspaceId: string; +let worktreePath: string; + +before(async () => { + fs.mkdirSync(TEST_HOME, { recursive: true }); + worktreePath = path.join(TEST_HOME, "worktree"); + fs.mkdirSync(worktreePath, { recursive: true }); + + server = new Server({ + socketPath: SOCK, + daemonVersion: "0.0.0-adoption-e2e", + }); + await server.listen(); + + process.env.SUPERSET_PTY_DAEMON_SOCKET = SOCK; + process.env.SUPERSET_HOME_DIR = TEST_HOME; + process.env.HOST_SERVICE_VERSION = "0.0.0-adoption-e2e"; + process.env.NODE_ENV = "development"; + + initTerminalBaseEnv({ + PATH: process.env.PATH ?? "/usr/bin:/bin", + HOME: process.env.HOME ?? 
TEST_HOME, + SHELL: "/bin/sh", + }); + + db = createDb(path.join(TEST_HOME, "host.db"), MIGRATIONS); + + projectId = randomUUID(); + workspaceId = randomUUID(); + db.insert(projects).values({ id: projectId, repoPath: worktreePath }).run(); + db.insert(workspaces) + .values({ + id: workspaceId, + projectId, + worktreePath, + branch: "main", + }) + .run(); +}); + +after(async () => { + __resetSessionsForTesting(); + await disposeDaemonClient(); + await server.close(); + try { + fs.rmSync(TEST_HOME, { recursive: true, force: true }); + } catch { + // best-effort + } +}); + +describe("createTerminalSessionInternal — host-service restart adoption", () => { + test("fresh open spawns a shell via the daemon", async () => { + const terminalId = `e2e-fresh-${randomUUID().slice(0, 8)}`; + const result = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok( + !("error" in result), + `expected session, got error: ${JSON.stringify(result)}`, + ); + if ("error" in result) return; + + assert.equal(result.terminalId, terminalId); + assert.ok(result.pty.pid > 0, "pty pid should be populated"); + + const list = listTerminalSessions({ workspaceId }); + assert.ok( + list.find((s) => s.terminalId === terminalId), + "new session should be in listTerminalSessions", + ); + + disposeSession(terminalId, db); + }); + + test("adopts existing daemon session after host-service restart simulation", async () => { + const terminalId = `e2e-adopt-${randomUUID().slice(0, 8)}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + if ("error" in first) return; + const originalPid = first.pty.pid; + + first.pty.write("echo before-host-restart\n"); + await waitForOutput(first.pty, "before-host-restart", 3000); + + // Simulate host-service crash + restart. 
+ __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in second)); + if ("error" in second) return; + + assert.equal( + second.pty.pid, + originalPid, + "adopted session should have same shell pid", + ); + assert.equal(second.terminalId, terminalId); + + let buf = ""; + const disposer = second.pty.onData((d) => { + buf += d; + }); + second.pty.write("echo after-host-restart\n"); + await waitFor(() => buf.includes("after-host-restart"), 3000); + disposer.dispose(); + + disposeSession(terminalId, db); + }); + + test("adopted session keeps listed/exited bookkeeping", async () => { + const terminalId = `e2e-bookkeeping-${randomUUID().slice(0, 8)}`; + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + + __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in second)); + if ("error" in second) return; + + assert.equal(second.exited, false); + assert.equal(second.listed, true); + assert.ok( + listTerminalSessions({ workspaceId }).find( + (s) => s.terminalId === terminalId, + ), + ); + + disposeSession(terminalId, db); + }); +}); + +// ---------------- helpers ---------------- + +async function waitFor(predicate: () => boolean, ms: number): Promise<void> { + const start = Date.now(); + while (!predicate()) { + if (Date.now() - start > ms) throw new Error("waitFor timed out"); + await new Promise((r) => setTimeout(r, 25)); + } +} + +async function waitForOutput( + pty: { onData: (cb: (d: string) => void) => { dispose(): void } }, + marker: string, + ms: number, +): Promise<void> { + let buf = ""; + const disposer = pty.onData((d) => { + buf += d; + }); + try { + await waitFor(() => buf.includes(marker), ms); + }
finally { + disposer.dispose(); + } +} diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index f2a9e7cf390..2ea1b25ab1e 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -13,18 +13,18 @@ import { } from "@superset/shared/terminal-title-scanner"; import { and, eq, ne } from "drizzle-orm"; import type { Hono } from "hono"; -import type { HostDb } from "../db"; -import { projects, terminalSessions, workspaces } from "../db/schema"; -import type { EventBus } from "../events"; -import { portManager } from "../ports/port-manager"; -import type { DaemonClient } from "./DaemonClient"; -import { getDaemonClient } from "./daemon-client-singleton"; +import type { HostDb } from "../db/index.ts"; +import { projects, terminalSessions, workspaces } from "../db/schema.ts"; +import type { EventBus } from "../events/index.ts"; +import { portManager } from "../ports/port-manager.ts"; +import type { DaemonClient } from "./DaemonClient/index.ts"; +import { getDaemonClient } from "./daemon-client-singleton.ts"; import { buildV2TerminalEnv, getShellLaunchArgs, getTerminalBaseEnv, resolveLaunchShell, -} from "./env"; +} from "./env.ts"; /** * Thin adapter exposing approximately the IPty surface that the rest of @@ -202,6 +202,27 @@ interface TerminalSession { /** PTY lifetime is independent of socket lifetime — sockets detach/reattach freely. */ const sessions = new Map(); +/** + * Test-only escape hatch: simulates a host-service process restart by clearing + * the in-memory session map without touching the daemon. After calling this, + * createTerminalSessionInternal() is forced down the adoption-on-EEXIST path + * for any session id the daemon already owns. + * + * NEVER call this from production code paths. 
+ */ +export function __resetSessionsForTesting(): void { + for (const session of sessions.values()) { + if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + } + } + sessions.clear(); +} + function pruneAndCountOpenSockets(session: TerminalSession): number { let openSockets = 0; for (const socket of session.sockets) { diff --git a/packages/port-scanner/src/index.ts b/packages/port-scanner/src/index.ts index dc5bd82a156..6a7e6efe696 100644 --- a/packages/port-scanner/src/index.ts +++ b/packages/port-scanner/src/index.ts @@ -2,15 +2,15 @@ export { type KillFn, PortManager, type PortManagerOptions, -} from "./port-manager"; +} from "./port-manager.ts"; export { getListeningPortsForPids, getProcessTree, type PortInfo, -} from "./scanner"; +} from "./scanner.ts"; export { parseStaticPortsConfig, type StaticPortLabel, type StaticPortsParseResult, -} from "./static-ports"; -export type { DetectedPort } from "./types"; +} from "./static-ports.ts"; +export type { DetectedPort } from "./types.ts"; diff --git a/packages/port-scanner/src/port-manager.ts b/packages/port-scanner/src/port-manager.ts index 7ccc634cd1c..bbd5a3af2fd 100644 --- a/packages/port-scanner/src/port-manager.ts +++ b/packages/port-scanner/src/port-manager.ts @@ -3,8 +3,8 @@ import { getListeningPortsForPids, getProcessTree, type PortInfo, -} from "./scanner"; -import type { DetectedPort } from "./types"; +} from "./scanner.ts"; +import type { DetectedPort } from "./types.ts"; /** How often to poll for port changes (in ms) */ const SCAN_INTERVAL_MS = 2500; diff --git a/packages/port-scanner/src/procfs.ts b/packages/port-scanner/src/procfs.ts index 954c7303aa5..650d85aa552 100644 --- a/packages/port-scanner/src/procfs.ts +++ b/packages/port-scanner/src/procfs.ts @@ -1,5 +1,5 @@ import { promises as fs } from "node:fs"; -import type { PortInfo } from "./scanner"; +import type { PortInfo } from "./scanner.ts"; /** * Linux-only: resolve listening TCP ports 
for a set of PIDs by reading diff --git a/packages/port-scanner/src/scanner.ts b/packages/port-scanner/src/scanner.ts index abe0b603f50..75ab20681e5 100644 --- a/packages/port-scanner/src/scanner.ts +++ b/packages/port-scanner/src/scanner.ts @@ -2,7 +2,7 @@ import { execFile } from "node:child_process"; import os from "node:os"; import { promisify } from "node:util"; import pidtree from "pidtree"; -import { getListeningPortsLinuxProcfs } from "./procfs"; +import { getListeningPortsLinuxProcfs } from "./procfs.ts"; const execFileAsync = promisify(execFile); From a6f09d36a8968b8f3bade6bd8eeff2577e1dd45c Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:05:40 -0700 Subject: [PATCH 13/33] fix(pty-daemon) + test(host-service): three more edge cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found while expanding the e2e adoption suite to cover paths the inventory survey turned up: 1. **Daemon-side bug**: handleOpen was rejecting EEXIST on dead sessions (alive:false) too. dispose-then-recreate-with-same-id tight-looped because the daemon kept the session row around (kept for late-subscriber replay) but treated it as a collision. Fix: handleOpen now treats already-exited entries as recyclable — drops the dead entry and lets the spawn proceed. Live shells still get EEXIST so host-service drives the adoption-via-list path. handleClose stays as-is (the natural-exit replay path that late subscribers depend on still works). 2. **Test gap**: adopted session does NOT re-fire initialCommand. This would have been catastrophic for setup.sh terminals — every host-service restart would re-run setup. Verifies the initialCommandQueued: isAdopted shortcut from the original fix. Sentinel file + mtime check. 3. **Test gap**: adoption when the original workspace row is gone returns a clear error (not crash, not loop). 
Race: user deletes workspace cloud-side while host-service is down; daemon still has the live session; renderer reconnects. Must surface "Workspace worktree not found" cleanly. 4. **Test gap**: dispose then re-create with the same id works without zombie state. Catches the daemon-side bug above. Asserts the second create gets a different shell pid (real fresh spawn, not adoption of the dead session). Final test counts: - pty-daemon: 24 bun unit + 30 control-plane (node) - host-service: 5 DaemonClient (node) + 6 e2e adoption (Electron-as-Node) Total: 65 tests across 4 layers. --- .../src/runtime/filesystem/filesystem.ts | 2 +- .../terminal/terminal.adoption.node-test.ts | 158 ++++++++++++++++++ packages/pty-daemon/src/handlers/handlers.ts | 13 +- 3 files changed, 170 insertions(+), 3 deletions(-) diff --git a/packages/host-service/src/runtime/filesystem/filesystem.ts b/packages/host-service/src/runtime/filesystem/filesystem.ts index a46e9d54631..10ef8ec6078 100644 --- a/packages/host-service/src/runtime/filesystem/filesystem.ts +++ b/packages/host-service/src/runtime/filesystem/filesystem.ts @@ -5,8 +5,8 @@ import { getSearchIndex, } from "@superset/workspace-fs/host"; import { eq } from "drizzle-orm"; -import { projects, workspaces } from "../../db/schema.ts"; import type { HostDb } from "../../db/index.ts"; +import { projects, workspaces } from "../../db/schema.ts"; export interface WorkspaceFilesystemManagerOptions { db: HostDb; diff --git a/packages/host-service/src/terminal/terminal.adoption.node-test.ts b/packages/host-service/src/terminal/terminal.adoption.node-test.ts index fb8a2e7700b..b0fd3b9bc02 100644 --- a/packages/host-service/src/terminal/terminal.adoption.node-test.ts +++ b/packages/host-service/src/terminal/terminal.adoption.node-test.ts @@ -21,6 +21,7 @@ import * as path from "node:path"; import { after, before, describe, test } from "node:test"; import { fileURLToPath } from "node:url"; import { Server } from "@superset/pty-daemon"; +import { eq } 
from "drizzle-orm"; import { createDb, type HostDb } from "../db/index.ts"; import { projects, workspaces } from "../db/schema.ts"; import { disposeDaemonClient } from "./daemon-client-singleton.ts"; @@ -197,6 +198,163 @@ describe("createTerminalSessionInternal — host-service restart adoption", () = disposeSession(terminalId, db); }); + + test("adopted session does NOT re-fire initialCommand", async () => { + // Regression guard: setup.sh terminals pass an initialCommand. After + // host-service restart, adopting the same terminalId must NOT run + // the command a second time — that would re-execute setup.sh + // every host-service restart, which would be catastrophic. + const terminalId = `e2e-initcmd-${randomUUID().slice(0, 8)}`; + const sentinelFile = path.join(TEST_HOME, `initcmd-${terminalId}.sentinel`); + // Run on first lifetime: write a file. We then assert it isn't + // rewritten (would have a new mtime) on the second lifetime. + const initialCommand = `echo $$ > ${sentinelFile}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: false, + initialCommand, + }); + assert.ok(!("error" in first)); + + // Wait for sentinel file (proves initialCommand ran). + await waitFor(() => fs.existsSync(sentinelFile), 5000); + const firstMtime = fs.statSync(sentinelFile).mtimeMs; + + // Simulate host-service restart and adopt, passing the SAME + // initialCommand (host-service has no way to know it already ran). + __resetSessionsForTesting(); + await disposeDaemonClient(); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: false, + initialCommand, + }); + assert.ok(!("error" in second)); + + // Wait long enough for the command to have run if it were going to. + await new Promise((r) => setTimeout(r, 800)); + + // Sentinel mtime unchanged → initialCommand was suppressed on adopt. 
+ const secondMtime = fs.statSync(sentinelFile).mtimeMs; + assert.equal( + secondMtime, + firstMtime, + "initialCommand re-fired on adopted session — would re-run setup.sh on every host-service restart", + ); + + disposeSession(terminalId, db); + }); + + test("adoption when the original workspace row is gone returns a clear error", async () => { + // Race: host-service is down, user deletes the workspace cloud-side, + // the workspace row is removed from the host DB. Daemon still has + // the live session. host-service comes back, renderer reconnects + // with the same terminalId. createTerminalSessionInternal must + // surface a clean error (not crash, not loop). + const ghostWorkspaceId = randomUUID(); + const ghostWorktree = path.join(TEST_HOME, "ghost-worktree"); + fs.mkdirSync(ghostWorktree, { recursive: true }); + db.insert(projects) + .values({ id: randomUUID(), repoPath: ghostWorktree }) + .run(); + const ghostProject = randomUUID(); + db.insert(projects) + .values({ id: ghostProject, repoPath: ghostWorktree }) + .run(); + db.insert(workspaces) + .values({ + id: ghostWorkspaceId, + projectId: ghostProject, + worktreePath: ghostWorktree, + branch: "main", + }) + .run(); + + const terminalId = `e2e-ghost-${randomUUID().slice(0, 8)}`; + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId: ghostWorkspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + + // User deletes workspace mid-restart: row gone, worktree dir removed. 
+ __resetSessionsForTesting(); + await disposeDaemonClient(); + db.delete(workspaces).where(eq(workspaces.id, ghostWorkspaceId)).run(); + fs.rmSync(ghostWorktree, { recursive: true, force: true }); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId: ghostWorkspaceId, + db, + listed: true, + }); + assert.ok( + "error" in second, + "adoption with missing workspace must return error, not throw or loop", + ); + if ("error" in second) { + assert.match(second.error, /Workspace worktree not found/); + } + + // Daemon still has the orphan session — clean it up directly so the + // test suite leaves nothing behind. Production needs a periodic + // "orphan session sweep" but that's a separate cleanup concern. + disposeSession(terminalId, db); + }); + + test("dispose then re-create with the same id works (no zombie state)", async () => { + // Rapid lifecycle: user creates terminal, kills it, creates again + // with the same id. Daemon-side cleanup must be done by the time + // the second create runs, otherwise we'd hit "session already + // exists" without an alive shell to adopt. + const terminalId = `e2e-recycle-${randomUUID().slice(0, 8)}`; + + const first = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok(!("error" in first)); + const firstPid = "error" in first ? -1 : first.pty.pid; + + disposeSession(terminalId, db); + + // Wait for the daemon's onExit handler to mark the session exited + // (SIGTERM → shell exits → wireSession.onExit fires → session.exited + // flips to true → handleOpen can then recycle the id). + await new Promise((r) => setTimeout(r, 800)); + + const second = await createTerminalSessionInternal({ + terminalId, + workspaceId, + db, + listed: true, + }); + assert.ok( + !("error" in second), + `re-create after dispose failed: ${JSON.stringify(second)}`, + ); + if ("error" in second) return; + + // Different shell pid (real fresh spawn) — not adoption. 
+ assert.notEqual( + second.pty.pid, + firstPid, + "re-create after dispose should be a fresh spawn, not adoption of the dead session", + ); + + disposeSession(terminalId, db); + }); }); // ---------------- helpers ---------------- diff --git a/packages/pty-daemon/src/handlers/handlers.ts b/packages/pty-daemon/src/handlers/handlers.ts index c752982493e..145f9b08360 100644 --- a/packages/pty-daemon/src/handlers/handlers.ts +++ b/packages/pty-daemon/src/handlers/handlers.ts @@ -40,8 +40,17 @@ export interface HandlerCtx { } export function handleOpen(ctx: HandlerCtx, msg: OpenMessage): ServerMessage { - if (ctx.store.get(msg.id)) { - return errorFor(msg.id, `session already exists: ${msg.id}`, "EEXIST"); + const existing = ctx.store.get(msg.id); + if (existing) { + // If the existing entry is for an already-exited shell, treat the open + // as recycling the id: drop the dead entry and let the spawn proceed. + // Live shells still reject with EEXIST so host-service drives the + // adoption-via-list path. + if (existing.exited) { + ctx.store.delete(msg.id); + } else { + return errorFor(msg.id, `session already exists: ${msg.id}`, "EEXIST"); + } } let session: Session; const spawnFn = ctx.spawnPty ?? defaultSpawn; From faa76bf1059669266c589e5c85b1ee0db10a502a Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:19:10 -0700 Subject: [PATCH 14/33] docs(desktop): pty-daemon implementation report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concise audit of the pty-daemon-host-integration branch against the implementation plan. 
Calls out: - 18 plan decisions correctly implemented - 6 deviations from the plan (each with a DECISION marker for accept-as- written / revert-to-plan / defer) - 7 explicit plan items not yet done (telemetry, crash supervision, real kill-9 tests, Linux verification, disconnect → close WS, /tmp sweep) - 5 decisions I made that weren't in the plan (worth documenting) - 1 wrong claim from a prior summary corrected (app-quit handling — the daemon SHOULD outlive app quit, mirroring host-service) Ends with a 5-question summary the reviewer can answer in one pass to sign off this report and update the plan. --- ...260430-pty-daemon-implementation-report.md | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 apps/desktop/plans/20260430-pty-daemon-implementation-report.md diff --git a/apps/desktop/plans/20260430-pty-daemon-implementation-report.md b/apps/desktop/plans/20260430-pty-daemon-implementation-report.md new file mode 100644 index 00000000000..1149d81f5f9 --- /dev/null +++ b/apps/desktop/plans/20260430-pty-daemon-implementation-report.md @@ -0,0 +1,223 @@ +# pty-daemon Implementation Report + +**Status:** Phase 1 implemented; awaiting review. +**Date:** 2026-04-30 +**Branch:** `pty-daemon-host-integration` +**PR:** #3896 +**Plan:** `20260429-pty-daemon-implementation.md` + +Concise audit of every change against the plan. Each deviation has a +**DECISION** marker with the choice you need to make: accept the +deviation (and I'll update the plan), revert to plan (and I'll update +the code), or defer. + +## TL;DR + +- **Architecture:** as planned — daemon outlives host-service via + manifest-based adoption, identical lifetime model to host-service. +- **Tests:** 65 across 4 layers (24 daemon unit + 30 daemon + control-plane + 5 host-side DaemonClient + 6 host-service E2E). 
+- **Plan-compliance:** 18 decisions correctly implemented as specified; + 6 deviations from the plan (most are improvements or pragmatic + trade-offs); 7 explicit plan items not done; 1 wrong assertion of + mine that this report corrects. +- **Operationally ready:** the architecture is correct; the + observability and failure-mode hooks the plan called out (telemetry, + crash supervision) are not yet wired. + +## What shipped + +``` +packages/pty-daemon/ +├── src/ +│ ├── main.ts # Node entry: argv → Server.listen +│ ├── index.ts # public exports +│ ├── protocol/ +│ │ ├── version.ts # CURRENT_PROTOCOL_VERSION + supported list +│ │ ├── messages.ts # ClientMessage / ServerMessage unions +│ │ ├── framing.ts # encodeFrame / FrameDecoder +│ │ └── index.ts +│ ├── Pty/Pty.ts # node-pty wrapper + dim validation +│ ├── SessionStore/SessionStore.ts # in-memory map + ring buffer per session +│ ├── handlers/handlers.ts # open/input/resize/close/list/subscribe +│ └── Server/Server.ts # AF_UNIX accept loop, handshake, dispatch +├── test/ +│ ├── helpers/client.ts # reusable DaemonClient +│ ├── integration.test.ts # smoke (3 tests) +│ └── control-plane.test.ts # exhaustive (30 tests, 11 suites) +└── build.ts # Bun.build target=node → dist/pty-daemon.js + +packages/host-service/src/terminal/ +├── DaemonClient/ +│ ├── DaemonClient.ts # Unix-socket client w/ multi-subscriber fan-out +│ └── DaemonClient.node-test.ts # 5 integration tests under node:test +├── daemon-client-singleton.ts # lazy DaemonClient singleton +├── terminal.ts # refactored to use DaemonClient (was node-pty.spawn) +└── terminal.adoption.node-test.ts # 6 E2E tests under Electron-as-Node + +apps/desktop/src/main/ +├── lib/ +│ ├── pty-daemon-coordinator.ts # spawn/adopt; sibling of HostServiceCoordinator +│ └── pty-daemon-manifest.ts # manifest read/write helpers +└── pty-daemon/index.ts # main entry that registers in electron.vite.config.ts +``` + +Source: ~870 LOC daemon, ~270 LOC DaemonClient, ~250 LOC coordinator. 
+Tests: ~1100 LOC across 4 layers. + +## Plan-compliance audit + +### ✅ Correctly implemented as specified (18) + +| # | Plan decision | Verified by | +|---|---|---| +| 1 | Architecture E (daemon now, fd-handoff Phase 2 deferred) | Code structure | +| 2 | Daemon runtime: Node + node-pty | `build.ts`, `bin` field, `engines.node` | +| 3 | Daemon scope: pure PTY runtime, stateless from client perspective | No HTTP/auth/DB/business logic anywhere in `packages/pty-daemon/src` | +| 4 | Transport: AF_UNIX SOCK_STREAM + length-prefixed binary frames | `protocol/framing.ts`, `Server/Server.ts` | +| 5 | Auth: Unix socket file mode 0600 | `Server.listen()` chmod | +| 6 | In-memory ring buffer per session, ~64 KB | `SessionStore.ts` | +| 7 | All v1 anti-patterns omitted (HistoryWriter, cold restore, tombstones, EventEmitter, dedup, priority semaphore, ANSI parsing, sticky state, deferred-cleanup setTimeout) | Grep | +| 8 | Per-session snapshot on attach (pid, cols, rows, alive) | `open-ok` + `list-reply` messages | +| 9 | Resize bounds validation | `Pty.ts:validateDims` | +| 10 | Signal abstraction as strings | Protocol message types | +| 11 | Graceful shutdown ordering | `Server.close()` | +| 12 | Versioned handshake | `protocol/version.ts` + Server dispatch | +| 13 | Renderer code zero changes | No diffs in `apps/desktop/src/renderer` | +| 14 | PSK auth boundary unchanged at host-service | Hono WS upgrade unchanged | +| 15 | terminalSessions DB table unchanged; daemon never touches DB | Daemon has no `better-sqlite3` import | +| 16 | Daemon binary bundled via electron-vite alongside host-service | `electron.vite.config.ts:115` adds entry; outputs `dist/main/pty-daemon.js` | +| 17 | node-pty version pinned (1.1.0) | `package.json` | +| 18 | **Daemon outlives host-service restart and app quit; killed only on explicit restart (and dev-mode reload by HostServiceCoordinator's enableDevReload)** | `tryAdopt()` finds detached daemon at next launch; no `before-quit` hook | + +### ⚠️ 
Deviations from the plan (6) — DECISIONS NEEDED + +#### Deviation #1: Host-side ring buffer kept + +- **Plan:** "Move the ring buffer entirely to the daemon. host-service no longer holds replay state; it asks the daemon for replay-on-attach via `subscribe { replay: true }`." +- **What I did:** Kept the 64 KB host-side buffer (`terminal.ts:64,101-102,206-225`) for in-process fan-out to multiple WS subscribers. The daemon also has its own 64 KB buffer; that one is the cross-restart source of truth. +- **Why:** Removing the host buffer would require either (a) a separate daemon subscription per WS connection, or (b) buffer-aware replay logic that re-asks the daemon on each WS attach. Keeping the host buffer is the smallest, most behaviour-preserving change. +- **Trade-off:** Two layers of 64 KB buffers per session. Memory cost is negligible. The deviation removes one of the v1-bloat rationales (host should be stateless re PTY data plane), but only partially. +- **DECISION:** + - [ ] **A: Accept deviation** — update the plan to reflect "host-side fan-out buffer + daemon source-of-truth buffer." + - [ ] B: Revert to plan — remove host buffer, add per-WS daemon subscriptions in a follow-up. + - [ ] C: Defer to a cleanup PR; ship as-is. + +#### Deviation #2: Per-organization daemon (not per-workspace) + +- **Plan:** "Per-workspace daemon (mirrors current host-service-per-workspace)." +- **What I did:** Per-organization daemon, exactly mirroring `HostServiceCoordinator` which is keyed by `organizationId`. +- **Why:** The plan's parenthetical claim is wrong: host-service is per-organization, not per-workspace. I matched real host-service. +- **DECISION:** + - [ ] **A: Accept deviation** — fix the plan to say "Per-organization, mirroring host-service-per-organization." + - [ ] B: Revert to plan — refactor to per-workspace (no production reason to do this; would create N daemons per org). 
+ +#### Deviation #3: Manifest `startedAt` is epoch ms, not ISO 8601 string + +- **Plan:** `startedAt: string` ISO 8601. +- **What I did:** `startedAt: number` epoch ms, matching `HostServiceManifest`. +- **DECISION:** + - [ ] **A: Accept deviation** — keep epoch ms, fix the plan. + - [ ] B: Revert to plan — switch to ISO string. (Trivial change; no real impact either way.) + +#### Deviation #4: Protocol module split into 3 files + +- **Plan:** single `protocol/protocol.ts`. +- **What I did:** `protocol/version.ts`, `protocol/messages.ts`, `protocol/framing.ts`. +- **Why:** Cleaner separation of concerns; tests only need to import what they use. +- **DECISION:** + - [ ] **A: Accept deviation** — update the plan to show three files. + - [ ] B: Revert — collapse into one file. (No real benefit; current shape is more readable.) + +#### Deviation #5: Adoption check skips protocol-version verification + +- **Plan:** "If PID alive **and socket connectable and protocol version compatible** → adopt." +- **What I did:** Adoption checks PID alive + socket connectable; **does not** connect-and-handshake to verify protocol compatibility before adopting. +- **Why:** v1 is the only protocol; pure overhead today. The check matters when Phase 2 introduces a v2 binary alongside v1 daemons. +- **DECISION:** + - [ ] **A: Accept until Phase 2 lands** — flag this in the plan as deferred. + - [ ] B: Implement now — costs ~30 LOC; trivial. Adds a connect/handshake/disconnect cycle to every adoption. + +#### Deviation #6: `subscribe` / `unsubscribe` as explicit protocol ops + +- **Plan:** ops list mentioned `subscribe-output` (one op). +- **What I did:** `subscribe` (with `replay: bool`) and `unsubscribe` as separate ops; daemon supports multi-subscriber fan-out per session. +- **DECISION:** + - [ ] **A: Accept deviation** — update the plan to show both ops. + - [ ] B: Revert — collapse to one op. 
(No real benefit; current shape is the minimum needed for renderer reattach + observer mode.) + +### ❗ Plan items NOT done (7) + +| # | Plan item | Status | Risk if shipped without | +|---|---|---|---| +| 1 | **Telemetry: 6 events** (`pty_daemon_spawn/adopt/session_open/session_exit/crash`, `host_service_restart_sessions_preserved`) | None wired | **Can't measure success or detect crashes.** The headline metric of the entire project is unobservable. | +| 2 | Daemon crash supervision: "3 crashes in 60s → stop respawning, surface to user" (Open Decision #3 in plan) | Not implemented; coordinator doesn't even auto-respawn after exit | Daemon crashes mid-session = silent terminal death until host-service restart | +| 3 | host-service crash integration test (real `kill -9` + verify renderer reattaches) | Adoption tested via `__resetSessionsForTesting`, not real `kill -9` | Real-world signal handling (no graceful close events) untested | +| 4 | Daemon crash integration test | Not explicitly tested | Same gap | +| 5 | Linux + macOS x86_64 Phase 0 / Phase 1 verification | Not done; macOS arm64 only | Architecture is portable but unverified — defer until shipping to those platforms | +| 6 | Daemon-disconnect → close terminal WS streams | `daemon-client-singleton.ts` clears its cache but doesn't close ws sockets to the renderer | Renderer thinks the terminal is alive; input silently fails | +| 7 | `/tmp/superset-ptyd-*.sock` sweep on coordinator init | Not done | Cosmetic; `/tmp` accumulates over time | + +**DECISION:** for each, mark **before ship** / **after ship** / **never** to set scope: + - [ ] #1 telemetry — recommended **before ship** + - [ ] #2 crash supervision — recommended **before ship** + - [ ] #3, #4 crash tests — recommended **before ship** + - [ ] #5 Linux verification — recommended **before shipping to Linux** + - [ ] #6 disconnect → close WS — recommended **before ship** + - [ ] #7 `/tmp` sweep — recommended **after ship** or **never** (cosmetic) + +### ❗ 
Decisions I made that weren't in the plan (5) + +| # | What I decided | Why | Plan should mention? | +|---|---|---|---| +| 1 | Socket path in `os.tmpdir()/superset-ptyd-<12hex>.sock`, not `$SUPERSET_HOME_DIR/host/<orgId>/pty-daemon.sock` | Darwin's 104-byte `sun_path` limit; original path was 159+ chars in dev | **Yes** — add to plan as the reason for this path | +| 2 | Adoption-on-EEXIST path in `createTerminalSessionInternal` | Race: host-service restart finds daemon already has the session id; bare `daemon.open` errors with EEXIST → tight loop until adopted | **Yes** — add as a critical post-restart code path | +| 3 | `__resetSessionsForTesting()` test escape hatch exported from production `terminal.ts` | Needed for in-process e2e testing of the adoption path | **Maybe** — note the test-only contract | +| 4 | Daemon's `handleOpen` recycles already-`exited` session entries (drops dead entry, spawns fresh); live entries still EEXIST | Without this, dispose-then-recreate-with-same-id loops forever — late-subscriber replay needs the entry to stick around after exit, but a fresh `open` should not see it as a collision | **Yes** — small protocol semantic to document | +| 5 | Initial-command suppression on adoption (`initialCommandQueued: isAdopted`) | Without this, setup.sh would re-run on every host-service restart for setup terminals | **Yes** — document | + +**DECISION:** for each `Yes`, I'll update the plan if you accept. + +### ❌ Wrong assertion I made earlier + +In the prior "shippability" assessment I said: + +> "App-quit lifecycle for the daemon ⚠️ — should fix before ship: daemon should be killed when user quits app." + +This is **wrong**. The plan and `HOST_SERVICE_LIFECYCLE.md` specify the daemon **outlives app quit** (manifest-based adoption picks it up next launch — same model as host-service). Only `enableDevReload` in `HostServiceCoordinator` tears down running services for hot-reload during dev. 
+ +Action: do **not** add a `before-quit` hook; the current behavior is correct. + +## Five-question summary you can answer in one pass + +| # | Question | Recommendation | +|---|---|---| +| 1 | Accept all 6 plan deviations? (host-side buffer, per-org daemon, manifest format, protocol split, adoption proto-version skip, subscribe/unsubscribe ops) | **Yes**, update plan | +| 2 | Wire telemetry before ship? | **Yes** (~50 LOC, the project's headline metric is currently unobservable) | +| 3 | Wire daemon crash supervision before ship? | **Yes** (~80 LOC, agreed crash policy isn't actually implemented) | +| 4 | Wire daemon-disconnect → close WS streams before ship? | **Yes** (~30 LOC, otherwise silent terminal failure) | +| 5 | Add real `kill -9` integration tests? | **Yes** (~40 LOC of test code) | + +If you say yes to all five: ~200 LOC of additional production code + ~100 LOC of tests. Roughly half a day. Then this is genuinely shippable to users. + +## What's currently in PR #3896 + +11 commits on `pty-daemon-host-integration`: + +1. `9bdbf7b85` feat(host-service): DaemonClient — Unix-socket client for pty-daemon +2. `b1eb105f0` feat(desktop): pty-daemon coordinator + manifest + main entry +3. `401e203fe` feat(host-service): route terminal sessions through pty-daemon +4. `b387324e1` fix(desktop): make pty-daemon spawn failure non-fatal for host-service +5. `df81d8b15` fix(desktop): allow .env / shell to provide SUPERSET_PTY_DAEMON_SOCKET +6. `2e8d2167e` debug(desktop): surface daemon spawn failures with log tail + child exit code +7. `05ae50c20` fix(desktop): use short /tmp path for pty-daemon socket (Darwin sun_path) +8. `aae131eb3` fix(host-service): adopt existing daemon sessions on host-service restart +9. `2bbb0846c` test(pty-daemon): replay-on-exited-session edge case +10. `525d3ec94` test(host-service): full E2E adoption test under Electron-as-Node +11. 
`a6f09d36a` fix(pty-daemon) + test(host-service): three more edge cases + +## Status flag + +Once you've made the five decisions above, this report becomes +**signed-off** and I update the implementation plan to match the +final accepted state. From 9a6c8fc326436a10f786b45a99a978f234448f6d Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:40:57 -0700 Subject: [PATCH 15/33] fix(host-service): close terminal WS streams on daemon disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, a daemon crash leaves the renderer's WebSocket sockets open while host-service's DaemonClient is dead. Input/resize silently fail, and the renderer thinks the terminal is alive. Now: daemon-client-singleton emits an onDaemonDisconnect event; terminal listens and closes every WS socket with code 1011. The renderer's existing exponential-backoff reconnect kicks in. On reconnect host-service rebuilds the DaemonClient (next getDaemonClient call), and the adoption-via-list path re-attaches to live sessions on the respawned daemon. Three related drive-bys from PR review: - daemon-client-singleton catches connect failure to dispose the partially-initialized client (was leaking on connect failure). - disposeDaemonClient now also handles the in-flight connecting promise. - Fire-and-forget WS handler checks ws.readyState before adding to session set after the daemon-open await — prevents adding a closed WS to broadcast. 
--- .../src/terminal/daemon-client-singleton.ts | 44 ++++++++++++++++-- .../host-service/src/terminal/terminal.ts | 45 ++++++++++++++++++- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/packages/host-service/src/terminal/daemon-client-singleton.ts b/packages/host-service/src/terminal/daemon-client-singleton.ts index 1ef5abcd68e..f60dc6f4d3b 100644 --- a/packages/host-service/src/terminal/daemon-client-singleton.ts +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -2,16 +2,31 @@ // passes the daemon socket path via SUPERSET_PTY_DAEMON_SOCKET. We connect // once on first use and reuse the connection for all sessions. // -// On disconnect we surface via console.error and let the next caller fail — -// the desktop coordinator is responsible for respawning the daemon and -// host-service can be restarted to reconnect. There's no in-band reconnect -// here on purpose; see DaemonClient's "dumb" failure model. +// On disconnect we surface via console.error, notify subscribers (terminal.ts +// uses this to close WS sockets so the renderer reconnects against the +// respawned daemon), and let the next caller's getDaemonClient() rebuild +// the client. There's no in-band reconnect here — see DaemonClient's "dumb" +// failure model. import { DaemonClient } from "./DaemonClient/index.ts"; let cached: DaemonClient | null = null; let connecting: Promise | null = null; +/** + * Subscribers notified whenever the active DaemonClient disconnects. + * terminal.ts hooks this to close WS sockets and clear in-memory session + * state — without it, sockets stay open and input/resize silently fails. 
+ */ +const disconnectListeners = new Set<(err?: Error) => void>(); + +export function onDaemonDisconnect(cb: (err?: Error) => void): () => void { + disconnectListeners.add(cb); + return () => { + disconnectListeners.delete(cb); + }; +} + export function ptyDaemonSocketPath(): string { const path = process.env.SUPERSET_PTY_DAEMON_SOCKET; if (!path) { @@ -32,6 +47,16 @@ export async function getDaemonClient(): Promise { err?.message ?? "", ); if (cached === client) cached = null; + for (const listener of disconnectListeners) { + try { + listener(err); + } catch (cbErr) { + console.error( + "[host-service] daemon-disconnect listener threw:", + cbErr, + ); + } + } }); connecting = client .connect() @@ -39,6 +64,11 @@ export async function getDaemonClient(): Promise { cached = client; return client; }) + .catch(async (error) => { + // Failed connect — clean up the partially initialized client. + await client.dispose().catch(() => {}); + throw error; + }) .finally(() => { connecting = null; }); @@ -48,6 +78,12 @@ export async function getDaemonClient(): Promise { /** For tests / shutdown only. 
*/ export async function disposeDaemonClient(): Promise { const c = cached; + const inFlight = connecting; cached = null; + connecting = null; if (c) await c.dispose(); + if (inFlight) { + const client = await inFlight.catch(() => null); + if (client) await client.dispose(); + } } diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index 2ea1b25ab1e..79695539750 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -18,7 +18,10 @@ import { projects, terminalSessions, workspaces } from "../db/schema.ts"; import type { EventBus } from "../events/index.ts"; import { portManager } from "../ports/port-manager.ts"; import type { DaemonClient } from "./DaemonClient/index.ts"; -import { getDaemonClient } from "./daemon-client-singleton.ts"; +import { + getDaemonClient, + onDaemonDisconnect, +} from "./daemon-client-singleton.ts"; import { buildV2TerminalEnv, getShellLaunchArgs, @@ -202,6 +205,42 @@ interface TerminalSession { /** PTY lifetime is independent of socket lifetime — sockets detach/reattach freely. */ const sessions = new Map(); +// When the daemon disconnects, close every WS socket so the renderer's +// existing exponential-backoff reconnect kicks in. On reconnect, host-service +// rebuilds the DaemonClient (next getDaemonClient() call), and the adoption- +// via-list path re-attaches to live sessions on the respawned daemon. Without +// this, sockets stay open and input/resize silently fail because the daemon +// reference is dead. +// +// We also clear the in-memory sessions map so a stale subscription closure +// doesn't keep firing for sessions that no longer match daemon state. +onDaemonDisconnect((err) => { + const sessionCount = sessions.size; + if (sessionCount === 0) return; + console.warn( + `[terminal] pty-daemon disconnected (${err?.message ?? 
"no message"}); closing ${sessionCount} terminal WS socket(s) to trigger renderer reconnect`, + ); + for (const session of sessions.values()) { + for (const socket of session.sockets) { + try { + socket.close(1011, "pty-daemon disconnected"); + } catch { + // best-effort + } + } + session.sockets.clear(); + if (session.unsubscribeDaemon) { + try { + session.unsubscribeDaemon(); + } catch { + // best-effort + } + session.unsubscribeDaemon = null; + } + } + sessions.clear(); +}); + /** * Test-only escape hatch: simulates a host-service process restart by clearing * the in-memory session map without touching the daemon. After calling this, @@ -797,6 +836,10 @@ export function registerWorkspaceTerminalRoute({ return; } + // WS may have closed during the daemon-open await; don't + // register a dead socket into the session's broadcast set. + if (ws.readyState !== SOCKET_OPEN) return; + result.sockets.add(ws); sendMessage(ws, { type: "title", title: result.title }); From 0286beb95893ad5ad2305de447a754f90988d42a Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:42:49 -0700 Subject: [PATCH 16/33] feat(desktop): pty-daemon crash supervision (3-in-60s circuit breaker) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements Open Decision #3 from the implementation plan: detect crash loops and stop respawning the daemon when something is fundamentally broken, instead of burning CPU on a forever-loop respawn. Behavior: - Daemon exits we initiated (coordinator.stop) don't count toward the crash counter — tracked via a `stopping` set. - Unexpected exits add a timestamp to the per-org crashTimes list. Older-than-60s timestamps are dropped on each accounting. - Up to 3 crashes within 60s → auto-respawn. - The 4th crash within the window → circuit OPEN. No more respawns until clearCrashCircuit(orgId) is called from the UI's "retry" affordance, or the desktop app restarts. 
- ensure() fails fast with a clear error message when the circuit is open, instead of trying to spawn-and-time-out repeatedly. Plumbing for the UI surface (telemetry + retry affordance) lands in a follow-up commit. --- .../src/main/lib/pty-daemon-coordinator.ts | 79 ++++++++++++++++++- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts index df3670af486..a49a3b6ff79 100644 --- a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts +++ b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts @@ -33,6 +33,15 @@ interface DaemonInstance { const SOCKET_READY_TIMEOUT_MS = 5_000; +/** + * Crash supervision parameters. If the daemon for an organization crashes + * more than CRASH_BUDGET times within CRASH_WINDOW_MS, we stop respawning + * and surface a hard error — repeated crashes are a bug, not transient + * recovery. Per the implementation plan's Open Decision #3. + */ +const CRASH_BUDGET = 3; +const CRASH_WINDOW_MS = 60_000; + /** * Per-organization socket path. **Must stay short** — Darwin's `sun_path` * is 104 bytes, and `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon.sock` blows @@ -59,17 +68,45 @@ export class PtyDaemonCoordinator { private readonly opts: PtyDaemonCoordinatorOptions; private readonly instances = new Map(); private readonly pendingStarts = new Map>(); + /** Recent crash timestamps per orgId, for the circuit breaker. */ + private readonly crashTimes = new Map(); + /** Orgs we've explicitly stopped — exit isn't a crash, don't respawn. */ + private readonly stopping = new Set(); + /** Orgs that tripped the circuit breaker — refuse respawn until cleared. */ + private readonly circuitOpen = new Set(); constructor(opts: PtyDaemonCoordinatorOptions) { this.opts = opts; } + /** + * Has the org tripped the crash circuit breaker? Once tripped, ensure() + * fails fast with a clear error until clearCrashCircuit() is called. 
+ */ + isCircuitOpen(organizationId: string): boolean { + return this.circuitOpen.has(organizationId); + } + + /** + * Reset the crash counter and close the circuit. Call this from a UI + * "retry" action after surfacing the error to the user. + */ + clearCrashCircuit(organizationId: string): void { + this.circuitOpen.delete(organizationId); + this.crashTimes.delete(organizationId); + } + /** * Spawn the daemon if not already running for this organization, or * adopt the running one. Returns the socket path host-service should * connect to. */ async ensure(organizationId: string): Promise { + if (this.circuitOpen.has(organizationId)) { + throw new Error( + `[pty-daemon:${organizationId}] crash circuit open: ${CRASH_BUDGET} crashes within ${CRASH_WINDOW_MS / 1000}s. Restart the desktop app to retry.`, + ); + } const existing = this.instances.get(organizationId); if (existing) return existing; const pending = this.pendingStarts.get(organizationId); @@ -90,6 +127,9 @@ export class PtyDaemonCoordinator { const instance = this.instances.get(organizationId); this.instances.delete(organizationId); if (!instance) return; + // Mark this exit as intentional so the on-exit handler doesn't count + // it toward the crash circuit breaker. + this.stopping.add(organizationId); try { process.kill(instance.pid, "SIGTERM"); } catch { @@ -232,10 +272,43 @@ export class PtyDaemonCoordinator { child.on("exit", (code) => { console.log(`[pty-daemon:${organizationId}] exited with code ${code}`); const current = this.instances.get(organizationId); - if (current?.pid === childPid) { - this.instances.delete(organizationId); - removePtyDaemonManifest(organizationId); + if (current?.pid !== childPid) return; + this.instances.delete(organizationId); + removePtyDaemonManifest(organizationId); + + // Was this exit intentional (we called stop)? If so, no crash + // accounting and no respawn. 
+ if (this.stopping.has(organizationId)) { + this.stopping.delete(organizationId); + return; } + + // Unexpected exit — record the crash and decide whether to + // auto-respawn or trip the circuit breaker. + const now = Date.now(); + const recent = (this.crashTimes.get(organizationId) ?? []).filter( + (t) => now - t < CRASH_WINDOW_MS, + ); + recent.push(now); + this.crashTimes.set(organizationId, recent); + + if (recent.length > CRASH_BUDGET) { + this.circuitOpen.add(organizationId); + console.error( + `[pty-daemon:${organizationId}] crash circuit OPEN — ${recent.length} crashes in ${CRASH_WINDOW_MS / 1000}s; refusing further respawns until clearCrashCircuit() is called`, + ); + return; + } + + console.warn( + `[pty-daemon:${organizationId}] auto-respawning after unexpected exit (${recent.length}/${CRASH_BUDGET} in window)`, + ); + void this.ensure(organizationId).catch((err) => { + console.error( + `[pty-daemon:${organizationId}] auto-respawn failed:`, + err, + ); + }); }); const startedAt = Date.now(); From 42f0bc93a92a662972a45d001e9ff25b0b0c8958 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:45:12 -0700 Subject: [PATCH 17/33] feat(desktop): pty-daemon telemetry events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the coordinator-side events the implementation plan called out. Uses the existing main/lib/analytics track() helper that already feeds PostHog with telemetry consent gating. 
Events emitted: - pty_daemon_spawn { organizationId, pid, socketPath } - pty_daemon_adopt { organizationId, pid, ageSeconds } - pty_daemon_spawn_failed { organizationId, reason, timeoutMs, earlyExitCode, earlyExitSignal } - pty_daemon_crash { organizationId, exitCode, crashesInWindow, windowSeconds, ageSeconds } - pty_daemon_circuit_open { organizationId, crashesInWindow } Known gap (not in this commit): the host-service-side events from the plan — pty_daemon_session_open, pty_daemon_session_exit, host_service_restart_sessions_preserved (the headline metric) — need host-service → desktop-main IPC since host-service runs as a separate Node process with no PostHog client of its own. Tracked separately; doesn't block the operational signals (spawn/adopt/crash/circuit-open) from being available for monitoring. --- .../src/main/lib/pty-daemon-coordinator.ts | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts index a49a3b6ff79..fb7251fb663 100644 --- a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts +++ b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts @@ -14,6 +14,7 @@ import * as fs from "node:fs"; import * as net from "node:net"; import * as os from "node:os"; import * as path from "node:path"; +import { track } from "./analytics"; import { SUPERSET_HOME_DIR } from "./app-environment"; import { isProcessAlive } from "./host-service-manifest"; import { MAX_HOST_LOG_BYTES, openRotatingLogFd } from "./host-service-utils"; @@ -147,11 +148,22 @@ export class PtyDaemonCoordinator { console.log( `[pty-daemon:${organizationId}] adopted existing daemon pid=${adopted.pid}`, ); + track("pty_daemon_adopt", { + organizationId, + pid: adopted.pid, + ageSeconds: Math.round((Date.now() - adopted.startedAt) / 1000), + }); return adopted; } // Otherwise spawn a fresh one. 
- return this.spawn(organizationId); + const instance = await this.spawn(organizationId); + track("pty_daemon_spawn", { + organizationId, + pid: instance.pid, + socketPath: instance.socketPath, + }); + return instance; } private async tryAdopt( @@ -263,6 +275,13 @@ export class PtyDaemonCoordinator { } catch { logTail = "(no log file written)"; } + track("pty_daemon_spawn_failed", { + organizationId, + reason: "socket-not-ready", + timeoutMs: SOCKET_READY_TIMEOUT_MS, + earlyExitCode, + earlyExitSignal, + }); throw new Error( `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms (childPid=${childPid}, earlyExit=${earlyExitCode ?? earlyExitSignal ?? "still alive"}). Log tail:\n${logTail}`, ); @@ -292,11 +311,23 @@ export class PtyDaemonCoordinator { recent.push(now); this.crashTimes.set(organizationId, recent); + track("pty_daemon_crash", { + organizationId, + exitCode: code, + crashesInWindow: recent.length, + windowSeconds: CRASH_WINDOW_MS / 1000, + ageSeconds: Math.round((now - current.startedAt) / 1000), + }); + if (recent.length > CRASH_BUDGET) { this.circuitOpen.add(organizationId); console.error( `[pty-daemon:${organizationId}] crash circuit OPEN — ${recent.length} crashes in ${CRASH_WINDOW_MS / 1000}s; refusing further respawns until clearCrashCircuit() is called`, ); + track("pty_daemon_circuit_open", { + organizationId, + crashesInWindow: recent.length, + }); return; } From a928c164ad5e3c3eadde440b73838c170bb80f5e Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 00:47:11 -0700 Subject: [PATCH 18/33] test(pty-daemon): real SIGKILL recovery test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a test that spawns the bundled daemon as a child process, sends it a real SIGKILL (no Server.close, no graceful shutdown, no exit event broadcast), and asserts that connected clients see the socket close cleanly without hanging. 
Different from the existing control-plane Server.close test, which exercises the cooperative shutdown path. Real production crashes don't go through Server.close — this test covers the actual path. Wired into `bun run test:integration` script. --- packages/pty-daemon/package.json | 2 +- .../pty-daemon/test/signal-recovery.test.ts | 185 ++++++++++++++++++ 2 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 packages/pty-daemon/test/signal-recovery.test.ts diff --git a/packages/pty-daemon/package.json b/packages/pty-daemon/package.json index 695a085aadb..1a116d060f5 100644 --- a/packages/pty-daemon/package.json +++ b/packages/pty-daemon/package.json @@ -25,7 +25,7 @@ "build:daemon": "bun run build.ts", "typecheck": "tsc --noEmit --emitDeclarationOnly false", "test": "bun test src/protocol src/SessionStore src/handlers src/Pty/Pty.test.ts", - "test:integration": "node --experimental-strip-types --test test/integration.test.ts test/control-plane.test.ts" + "test:integration": "node --experimental-strip-types --test test/integration.test.ts test/control-plane.test.ts test/signal-recovery.test.ts" }, "dependencies": { "node-pty": "1.1.0" diff --git a/packages/pty-daemon/test/signal-recovery.test.ts b/packages/pty-daemon/test/signal-recovery.test.ts new file mode 100644 index 00000000000..1bc9b0d4ef5 --- /dev/null +++ b/packages/pty-daemon/test/signal-recovery.test.ts @@ -0,0 +1,185 @@ +// Real-signal recovery tests: spawn the bundled daemon as a child process, +// then SIGKILL it (no graceful close events) and verify the client surfaces +// the disconnect cleanly. Different from the existing control-plane tests, +// which use Server.close() — that's the cooperative shutdown path. Real +// production crashes don't go through Server.close. +// +// Runs under Node (`node --experimental-strip-types --test`). 
+ +import { strict as assert } from "node:assert"; +import * as childProcess from "node:child_process"; +import * as fs from "node:fs"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { after, before, describe, test } from "node:test"; +import { fileURLToPath } from "node:url"; +import { + CURRENT_PROTOCOL_VERSION, + encodeFrame, + FrameDecoder, + type ServerMessage, +} from "../src/protocol/index.ts"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const DAEMON_BUNDLE = path.resolve(__dirname, "../dist/pty-daemon.js"); +const SOCK = path.join(os.tmpdir(), `pty-daemon-sigkill-${process.pid}.sock`); + +let daemonProcess: childProcess.ChildProcess | null = null; + +before(async () => { + if (!fs.existsSync(DAEMON_BUNDLE)) { + throw new Error( + `Missing daemon bundle at ${DAEMON_BUNDLE}. Run \`bun run build:daemon\` first.`, + ); + } + + daemonProcess = childProcess.spawn( + process.execPath, + [DAEMON_BUNDLE, `--socket=${SOCK}`], + { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, NODE_ENV: "test" }, + }, + ); + daemonProcess.stderr?.on("data", (chunk) => { + process.stderr.write(`[daemon-stderr] ${chunk}`); + }); + + // Wait for socket to become connectable. 
+ const deadline = Date.now() + 5000; + while (Date.now() < deadline) { + if (fs.existsSync(SOCK)) { + const ok = await new Promise<boolean>((resolve) => { + const s = net.createConnection({ path: SOCK }); + const t = setTimeout(() => { + s.destroy(); + resolve(false); + }, 200); + s.once("connect", () => { + clearTimeout(t); + s.end(); + resolve(true); + }); + s.once("error", () => { + clearTimeout(t); + resolve(false); + }); + }); + if (ok) break; + } + await new Promise((r) => setTimeout(r, 50)); + } +}); + +after(async () => { + if (daemonProcess && !daemonProcess.killed) { + daemonProcess.kill("SIGKILL"); + await new Promise((r) => daemonProcess?.once("exit", r)); + } + try { + fs.unlinkSync(SOCK); + } catch { + // best-effort + } +}); + +describe("daemon SIGKILL recovery", () => { + test("clients receive close events when daemon dies via SIGKILL", async () => { + // Open a connection, complete handshake, send a list to confirm health. + const client = await connect(); + client.send({ type: "hello", protocols: [CURRENT_PROTOCOL_VERSION] }); + await client.waitFor((m) => m.type === "hello-ack", 2000); + + // Capture disconnect. + const disconnected = new Promise<void>((resolve) => + client.socket.once("close", () => resolve()), + ); + + // Now SIGKILL the daemon. No graceful Server.close, no exit broadcast. + assert.ok(daemonProcess); + daemonProcess.kill("SIGKILL"); + await new Promise((r) => daemonProcess?.once("exit", r)); + + // Client should see the socket close within reasonable time. + await Promise.race([ + disconnected, + new Promise((_, rej) => + setTimeout(() => rej(new Error("disconnect not surfaced")), 2000), + ), + ]); + + // Subsequent send fails synchronously (writable: false) or async. + // Either way, no hang. + try { + client.send({ type: "list" }); + } catch { + // Either path is acceptable — just don't hang. + } + + // Process is gone; ensure cleanup so `after` doesn't block. 
+ daemonProcess = null; + try { + fs.unlinkSync(SOCK); + } catch { + // best-effort — daemon's atexit didn't run because of SIGKILL + } + }); +}); + +// ---------------- helpers ---------------- + +interface Client { + socket: net.Socket; + messages: ServerMessage[]; + send(m: unknown): void; + waitFor( + predicate: (m: ServerMessage) => boolean, + ms: number, + ): Promise<ServerMessage>; +} + +function connect(): Promise<Client> { + return new Promise((resolve, reject) => { + const socket = net.createConnection({ path: SOCK }); + const decoder = new FrameDecoder(); + const messages: ServerMessage[] = []; + + socket.on("data", (chunk) => { + decoder.push(chunk); + for (const raw of decoder.drain()) { + messages.push(raw as ServerMessage); + } + }); + + socket.once("error", reject); + socket.once("connect", () => { + resolve({ + socket, + messages, + send(m) { + if (!socket.destroyed) socket.write(encodeFrame(m)); + }, + waitFor(predicate, ms) { + return new Promise((res, rej) => { + const found = messages.find(predicate); + if (found) return res(found); + const onData = () => { + const m = messages.find(predicate); + if (m) { + socket.off("data", onData); + clearTimeout(t); + res(m); + } + }; + const t = setTimeout(() => { + socket.off("data", onData); + rej(new Error("waitFor timed out")); + }, ms); + socket.on("data", onData); + }); + }, + }); + }); + }); +} From d4b9a4b86edb2c8cc8d3cb77ccb256d43ac83da7 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 12:25:32 -0700 Subject: [PATCH 19/33] refactor(host-service): own pty-daemon supervision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the pty-daemon supervisor (spawn / adopt / restart / version-detect / crash-circuit / manifest) from the desktop main process into host-service. The daemon is supervised by host-service so it can be deployed independently of Electron — that's the v2 thesis. Daemon outlives host-service crashes via detached spawn + manifest adoption (unchanged). 
Renderer reads daemon state through `workspaceTrpc.terminal.daemon.*` instead of `electronTrpc.ptyDaemon.*`. Telemetry track() calls become structured `console.log` lines (JSON with `component: "pty-daemon-supervisor"`) — host-service has no PostHog plumbing yet. Boot is fire-and-track: host-service kicks off `ensureDaemon(orgId)` at startup without awaiting; terminal request handlers `await waitForDaemonReady()` before using the supervisor's socket path. Non-terminal ops are unaffected if the daemon takes time to come up. The `SUPERSET_PTY_DAEMON_SOCKET` env-var contract from desktop → host-service goes away in production. Kept as a test escape hatch for the in-process adoption integration test. Tests: 21 supervisor unit tests moved to host-service. The 3 desktop real-spawn version-roundtrip tests are dropped — equivalent coverage already lives at the daemon package boundary. Plan: apps/desktop/plans/20260430-pty-daemon-host-service-migration.md (in the dull-protocol design-doc branch). 
--- ...260430-pty-daemon-implementation-report.md | 223 ------ .../src/main/lib/host-service-coordinator.ts | 45 +- .../src/main/lib/pty-daemon-coordinator.ts | 403 ---------- apps/desktop/src/main/pty-daemon/index.ts | 57 -- .../TerminalSettings/TerminalSettings.tsx | 8 +- .../V2SessionsSection/V2SessionsSection.tsx | 251 +++++++ .../components/V2SessionsSection/index.ts | 1 + bun.lock | 2 + packages/host-service/package.json | 2 + .../src/daemon/DaemonSupervisor.test.ts | 450 ++++++++++++ .../src/daemon/DaemonSupervisor.ts | 686 ++++++++++++++++++ .../src/daemon/expected-version.ts | 16 + packages/host-service/src/daemon/index.ts | 13 + packages/host-service/src/daemon/log-fd.ts | 33 + .../host-service/src/daemon/manifest.ts | 35 +- packages/host-service/src/daemon/singleton.ts | 95 +++ packages/host-service/src/serve.ts | 9 + .../src/terminal/daemon-client-singleton.ts | 31 +- .../src/trpc/router/terminal/terminal.ts | 25 + packages/pty-daemon/src/main.ts | 6 +- 20 files changed, 1647 insertions(+), 744 deletions(-) delete mode 100644 apps/desktop/plans/20260430-pty-daemon-implementation-report.md delete mode 100644 apps/desktop/src/main/lib/pty-daemon-coordinator.ts delete mode 100644 apps/desktop/src/main/pty-daemon/index.ts create mode 100644 apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx create mode 100644 apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts create mode 100644 packages/host-service/src/daemon/DaemonSupervisor.test.ts create mode 100644 packages/host-service/src/daemon/DaemonSupervisor.ts create mode 100644 packages/host-service/src/daemon/expected-version.ts create mode 100644 packages/host-service/src/daemon/index.ts create mode 100644 packages/host-service/src/daemon/log-fd.ts rename apps/desktop/src/main/lib/pty-daemon-manifest.ts => 
packages/host-service/src/daemon/manifest.ts (75%) create mode 100644 packages/host-service/src/daemon/singleton.ts diff --git a/apps/desktop/plans/20260430-pty-daemon-implementation-report.md b/apps/desktop/plans/20260430-pty-daemon-implementation-report.md deleted file mode 100644 index 1149d81f5f9..00000000000 --- a/apps/desktop/plans/20260430-pty-daemon-implementation-report.md +++ /dev/null @@ -1,223 +0,0 @@ -# pty-daemon Implementation Report - -**Status:** Phase 1 implemented; awaiting review. -**Date:** 2026-04-30 -**Branch:** `pty-daemon-host-integration` -**PR:** #3896 -**Plan:** `20260429-pty-daemon-implementation.md` - -Concise audit of every change against the plan. Each deviation has a -**DECISION** marker with the choice you need to make: accept the -deviation (and I'll update the plan), revert to plan (and I'll update -the code), or defer. - -## TL;DR - -- **Architecture:** as planned — daemon outlives host-service via - manifest-based adoption, identical lifetime model to host-service. -- **Tests:** 65 across 4 layers (24 daemon unit + 30 daemon - control-plane + 5 host-side DaemonClient + 6 host-service E2E). -- **Plan-compliance:** 18 decisions correctly implemented as specified; - 6 deviations from the plan (most are improvements or pragmatic - trade-offs); 7 explicit plan items not done; 1 wrong assertion of - mine that this report corrects. -- **Operationally ready:** the architecture is correct; the - observability and failure-mode hooks the plan called out (telemetry, - crash supervision) are not yet wired. 
- -## What shipped - -``` -packages/pty-daemon/ -├── src/ -│ ├── main.ts # Node entry: argv → Server.listen -│ ├── index.ts # public exports -│ ├── protocol/ -│ │ ├── version.ts # CURRENT_PROTOCOL_VERSION + supported list -│ │ ├── messages.ts # ClientMessage / ServerMessage unions -│ │ ├── framing.ts # encodeFrame / FrameDecoder -│ │ └── index.ts -│ ├── Pty/Pty.ts # node-pty wrapper + dim validation -│ ├── SessionStore/SessionStore.ts # in-memory map + ring buffer per session -│ ├── handlers/handlers.ts # open/input/resize/close/list/subscribe -│ └── Server/Server.ts # AF_UNIX accept loop, handshake, dispatch -├── test/ -│ ├── helpers/client.ts # reusable DaemonClient -│ ├── integration.test.ts # smoke (3 tests) -│ └── control-plane.test.ts # exhaustive (30 tests, 11 suites) -└── build.ts # Bun.build target=node → dist/pty-daemon.js - -packages/host-service/src/terminal/ -├── DaemonClient/ -│ ├── DaemonClient.ts # Unix-socket client w/ multi-subscriber fan-out -│ └── DaemonClient.node-test.ts # 5 integration tests under node:test -├── daemon-client-singleton.ts # lazy DaemonClient singleton -├── terminal.ts # refactored to use DaemonClient (was node-pty.spawn) -└── terminal.adoption.node-test.ts # 6 E2E tests under Electron-as-Node - -apps/desktop/src/main/ -├── lib/ -│ ├── pty-daemon-coordinator.ts # spawn/adopt; sibling of HostServiceCoordinator -│ └── pty-daemon-manifest.ts # manifest read/write helpers -└── pty-daemon/index.ts # main entry that registers in electron.vite.config.ts -``` - -Source: ~870 LOC daemon, ~270 LOC DaemonClient, ~250 LOC coordinator. -Tests: ~1100 LOC across 4 layers. 
- -## Plan-compliance audit - -### ✅ Correctly implemented as specified (18) - -| # | Plan decision | Verified by | -|---|---|---| -| 1 | Architecture E (daemon now, fd-handoff Phase 2 deferred) | Code structure | -| 2 | Daemon runtime: Node + node-pty | `build.ts`, `bin` field, `engines.node` | -| 3 | Daemon scope: pure PTY runtime, stateless from client perspective | No HTTP/auth/DB/business logic anywhere in `packages/pty-daemon/src` | -| 4 | Transport: AF_UNIX SOCK_STREAM + length-prefixed binary frames | `protocol/framing.ts`, `Server/Server.ts` | -| 5 | Auth: Unix socket file mode 0600 | `Server.listen()` chmod | -| 6 | In-memory ring buffer per session, ~64 KB | `SessionStore.ts` | -| 7 | All v1 anti-patterns omitted (HistoryWriter, cold restore, tombstones, EventEmitter, dedup, priority semaphore, ANSI parsing, sticky state, deferred-cleanup setTimeout) | Grep | -| 8 | Per-session snapshot on attach (pid, cols, rows, alive) | `open-ok` + `list-reply` messages | -| 9 | Resize bounds validation | `Pty.ts:validateDims` | -| 10 | Signal abstraction as strings | Protocol message types | -| 11 | Graceful shutdown ordering | `Server.close()` | -| 12 | Versioned handshake | `protocol/version.ts` + Server dispatch | -| 13 | Renderer code zero changes | No diffs in `apps/desktop/src/renderer` | -| 14 | PSK auth boundary unchanged at host-service | Hono WS upgrade unchanged | -| 15 | terminalSessions DB table unchanged; daemon never touches DB | Daemon has no `better-sqlite3` import | -| 16 | Daemon binary bundled via electron-vite alongside host-service | `electron.vite.config.ts:115` adds entry; outputs `dist/main/pty-daemon.js` | -| 17 | node-pty version pinned (1.1.0) | `package.json` | -| 18 | **Daemon outlives host-service restart and app quit; killed only on explicit restart (and dev-mode reload by HostServiceCoordinator's enableDevReload)** | `tryAdopt()` finds detached daemon at next launch; no `before-quit` hook | - -### ⚠️ Deviations from the plan (6) — 
DECISIONS NEEDED - -#### Deviation #1: Host-side ring buffer kept - -- **Plan:** "Move the ring buffer entirely to the daemon. host-service no longer holds replay state; it asks the daemon for replay-on-attach via `subscribe { replay: true }`." -- **What I did:** Kept the 64 KB host-side buffer (`terminal.ts:64,101-102,206-225`) for in-process fan-out to multiple WS subscribers. The daemon also has its own 64 KB buffer; that one is the cross-restart source of truth. -- **Why:** Removing the host buffer would require either (a) a separate daemon subscription per WS connection, or (b) buffer-aware replay logic that re-asks the daemon on each WS attach. Keeping the host buffer is the smallest, most behaviour-preserving change. -- **Trade-off:** Two layers of 64 KB buffers per session. Memory cost is negligible. The deviation removes one of the v1-bloat rationales (host should be stateless re PTY data plane), but only partially. -- **DECISION:** - - [ ] **A: Accept deviation** — update the plan to reflect "host-side fan-out buffer + daemon source-of-truth buffer." - - [ ] B: Revert to plan — remove host buffer, add per-WS daemon subscriptions in a follow-up. - - [ ] C: Defer to a cleanup PR; ship as-is. - -#### Deviation #2: Per-organization daemon (not per-workspace) - -- **Plan:** "Per-workspace daemon (mirrors current host-service-per-workspace)." -- **What I did:** Per-organization daemon, exactly mirroring `HostServiceCoordinator` which is keyed by `organizationId`. -- **Why:** The plan's parenthetical claim is wrong: host-service is per-organization, not per-workspace. I matched real host-service. -- **DECISION:** - - [ ] **A: Accept deviation** — fix the plan to say "Per-organization, mirroring host-service-per-organization." - - [ ] B: Revert to plan — refactor to per-workspace (no production reason to do this; would create N daemons per org). 
- -#### Deviation #3: Manifest `startedAt` is epoch ms, not ISO 8601 string - -- **Plan:** `startedAt: string` ISO 8601. -- **What I did:** `startedAt: number` epoch ms, matching `HostServiceManifest`. -- **DECISION:** - - [ ] **A: Accept deviation** — keep epoch ms, fix the plan. - - [ ] B: Revert to plan — switch to ISO string. (Trivial change; no real impact either way.) - -#### Deviation #4: Protocol module split into 3 files - -- **Plan:** single `protocol/protocol.ts`. -- **What I did:** `protocol/version.ts`, `protocol/messages.ts`, `protocol/framing.ts`. -- **Why:** Cleaner separation of concerns; tests only need to import what they use. -- **DECISION:** - - [ ] **A: Accept deviation** — update the plan to show three files. - - [ ] B: Revert — collapse into one file. (No real benefit; current shape is more readable.) - -#### Deviation #5: Adoption check skips protocol-version verification - -- **Plan:** "If PID alive **and socket connectable and protocol version compatible** → adopt." -- **What I did:** Adoption checks PID alive + socket connectable; **does not** connect-and-handshake to verify protocol compatibility before adopting. -- **Why:** v1 is the only protocol; pure overhead today. The check matters when Phase 2 introduces a v2 binary alongside v1 daemons. -- **DECISION:** - - [ ] **A: Accept until Phase 2 lands** — flag this in the plan as deferred. - - [ ] B: Implement now — costs ~30 LOC; trivial. Adds a connect/handshake/disconnect cycle to every adoption. - -#### Deviation #6: `subscribe` / `unsubscribe` as explicit protocol ops - -- **Plan:** ops list mentioned `subscribe-output` (one op). -- **What I did:** `subscribe` (with `replay: bool`) and `unsubscribe` as separate ops; daemon supports multi-subscriber fan-out per session. -- **DECISION:** - - [ ] **A: Accept deviation** — update the plan to show both ops. - - [ ] B: Revert — collapse to one op. 
(No real benefit; current shape is the minimum needed for renderer reattach + observer mode.) - -### ❗ Plan items NOT done (7) - -| # | Plan item | Status | Risk if shipped without | -|---|---|---|---| -| 1 | **Telemetry: 6 events** (`pty_daemon_spawn/adopt/session_open/session_exit/crash`, `host_service_restart_sessions_preserved`) | None wired | **Can't measure success or detect crashes.** The headline metric of the entire project is unobservable. | -| 2 | Daemon crash supervision: "3 crashes in 60s → stop respawning, surface to user" (Open Decision #3 in plan) | Not implemented; coordinator doesn't even auto-respawn after exit | Daemon crashes mid-session = silent terminal death until host-service restart | -| 3 | host-service crash integration test (real `kill -9` + verify renderer reattaches) | Adoption tested via `__resetSessionsForTesting`, not real `kill -9` | Real-world signal handling (no graceful close events) untested | -| 4 | Daemon crash integration test | Not explicitly tested | Same gap | -| 5 | Linux + macOS x86_64 Phase 0 / Phase 1 verification | Not done; macOS arm64 only | Architecture is portable but unverified — defer until shipping to those platforms | -| 6 | Daemon-disconnect → close terminal WS streams | `daemon-client-singleton.ts` clears its cache but doesn't close ws sockets to the renderer | Renderer thinks the terminal is alive; input silently fails | -| 7 | `/tmp/superset-ptyd-*.sock` sweep on coordinator init | Not done | Cosmetic; `/tmp` accumulates over time | - -**DECISION:** for each, mark **before ship** / **after ship** / **never** to set scope: - - [ ] #1 telemetry — recommended **before ship** - - [ ] #2 crash supervision — recommended **before ship** - - [ ] #3, #4 crash tests — recommended **before ship** - - [ ] #5 Linux verification — recommended **before shipping to Linux** - - [ ] #6 disconnect → close WS — recommended **before ship** - - [ ] #7 `/tmp` sweep — recommended **after ship** or **never** (cosmetic) - -### ❗ 
Decisions I made that weren't in the plan (5) - -| # | What I decided | Why | Plan should mention? | -|---|---|---|---| -| 1 | Socket path in `os.tmpdir()/superset-ptyd-<12hex>.sock`, not `$SUPERSET_HOME_DIR/host/<orgId>/pty-daemon.sock` | Darwin's 104-byte `sun_path` limit; original path was 159+ chars in dev | **Yes** — add to plan as the reason for this path | -| 2 | Adoption-on-EEXIST path in `createTerminalSessionInternal` | Race: host-service restart finds daemon already has the session id; bare `daemon.open` errors with EEXIST → tight loop until adopted | **Yes** — add as a critical post-restart code path | -| 3 | `__resetSessionsForTesting()` test escape hatch exported from production `terminal.ts` | Needed for in-process e2e testing of the adoption path | **Maybe** — note the test-only contract | -| 4 | Daemon's `handleOpen` recycles already-`exited` session entries (drops dead entry, spawns fresh); live entries still EEXIST | Without this, dispose-then-recreate-with-same-id loops forever — late-subscriber replay needs the entry to stick around after exit, but a fresh `open` should not see it as a collision | **Yes** — small protocol semantic to document | -| 5 | Initial-command suppression on adoption (`initialCommandQueued: isAdopted`) | Without this, setup.sh would re-run on every host-service restart for setup terminals | **Yes** — document | - -**DECISION:** for each `Yes`, I'll update the plan if you accept. - -### ❌ Wrong assertion I made earlier - -In the prior "shippability" assessment I said: - -> "App-quit lifecycle for the daemon ⚠️ — should fix before ship: daemon should be killed when user quits app." - -This is **wrong**. The plan and `HOST_SERVICE_LIFECYCLE.md` specify the daemon **outlives app quit** (manifest-based adoption picks it up next launch — same model as host-service). Only `enableDevReload` in `HostServiceCoordinator` tears down running services for hot-reload during dev. 
- -Action: do **not** add a `before-quit` hook; the current behavior is correct. - -## Five-question summary you can answer in one pass - -| # | Question | Recommendation | -|---|---|---| -| 1 | Accept all 6 plan deviations? (host-side buffer, per-org daemon, manifest format, protocol split, adoption proto-version skip, subscribe/unsubscribe ops) | **Yes**, update plan | -| 2 | Wire telemetry before ship? | **Yes** (~50 LOC, the project's headline metric is currently unobservable) | -| 3 | Wire daemon crash supervision before ship? | **Yes** (~80 LOC, agreed crash policy isn't actually implemented) | -| 4 | Wire daemon-disconnect → close WS streams before ship? | **Yes** (~30 LOC, otherwise silent terminal failure) | -| 5 | Add real `kill -9` integration tests? | **Yes** (~40 LOC of test code) | - -If you say yes to all five: ~200 LOC of additional production code + ~100 LOC of tests. Roughly half a day. Then this is genuinely shippable to users. - -## What's currently in PR #3896 - -7 commits on `pty-daemon-host-integration`: - -1. `9bdbf7b85` feat(host-service): DaemonClient — Unix-socket client for pty-daemon -2. `b1eb105f0` feat(desktop): pty-daemon coordinator + manifest + main entry -3. `401e203fe` feat(host-service): route terminal sessions through pty-daemon -4. `b387324e1` fix(desktop): make pty-daemon spawn failure non-fatal for host-service -5. `df81d8b15` fix(desktop): allow .env / shell to provide SUPERSET_PTY_DAEMON_SOCKET -6. `2e8d2167e` debug(desktop): surface daemon spawn failures with log tail + child exit code -7. `05ae50c20` fix(desktop): use short /tmp path for pty-daemon socket (Darwin sun_path) -8. `aae131eb3` fix(host-service): adopt existing daemon sessions on host-service restart -9. `2bbb0846c` test(pty-daemon): replay-on-exited-session edge case -10. `525d3ec94` test(host-service): full E2E adoption test under Electron-as-Node -11. 
`a6f09d36a` fix(pty-daemon) + test(host-service): three more edge cases - -## Status flag - -Once you've made the five decisions above, this report becomes -**signed-off** and I update the implementation plan to match the -final accepted state. diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index d2838cdb168..c78366ec4da 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -27,7 +27,6 @@ import { pollHealthCheck, } from "./host-service-utils"; import { localDb } from "./local-db"; -import { PtyDaemonCoordinator } from "./pty-daemon-coordinator"; import { HOOK_PROTOCOL_VERSION } from "./terminal/env"; /** @@ -83,13 +82,10 @@ export class HostServiceCoordinator extends EventEmitter { private scriptPath = path.join(__dirname, "host-service.js"); private machineId = getHostId(); private devReloadWatcher: fs.FSWatcher | null = null; - // Sibling coordinator for the long-lived pty-daemon. Owns PTYs so that - // host-service restarts don't kill user shells. Its scriptPath sits next - // to ours after the build (apps/desktop/src/main + dist/host-service.js + - // dist/pty-daemon.js — see runtime-dependencies.ts for packaging). - private ptyDaemon = new PtyDaemonCoordinator({ - scriptPath: path.join(__dirname, "pty-daemon.js"), - }); + // Note: pty-daemon supervision moved into host-service itself — + // see packages/host-service/src/daemon. Host-service spawns and adopts + // the daemon when it boots, so the desktop coordinator no longer needs + // to know about it. async start( organizationId: string, @@ -393,28 +389,10 @@ export class HostServiceCoordinator extends EventEmitter { this.instances.set(organizationId, instance); this.emitStatus(organizationId, "starting", null); - // Try to bring up the pty-daemon. If it fails (e.g. 
dev build doesn't - // have dist/main/pty-daemon.js yet), don't take host-service down with - // it — workspaces, git, chat, etc. should still work. Terminal ops - // will surface a clear error to the renderer instead. - let daemonSocketPath = ""; - try { - const daemonInstance = await this.ptyDaemon.ensure(organizationId); - daemonSocketPath = daemonInstance.socketPath; - } catch (error) { - console.error( - `[host-service:${organizationId}] pty-daemon failed to start; terminals will be unavailable until it recovers:`, - error, - ); - } - - const childEnv = await this.buildEnv( - organizationId, - port, - secret, - config, - daemonSocketPath, - ); + // pty-daemon is supervised by host-service itself; this coordinator + // only spawns host-service and steps out. See + // packages/host-service/src/daemon for the supervisor lifecycle. + const childEnv = await this.buildEnv(organizationId, port, secret, config); // Host-service owns v2 PTYs, so it must survive Electron restarts in // every environment. This mirrors the terminal-host daemon: detach the // child and back stdio with real files so parent teardown cannot close @@ -486,15 +464,11 @@ export class HostServiceCoordinator extends EventEmitter { port: number, secret: string, config: SpawnConfig, - ptyDaemonSocket: string, ): Promise> { const organizationDir = manifestDir(organizationId); const row = localDb.select().from(settings).get(); const exposeViaRelay = row?.exposeHostServiceViaRelay ?? false; - // Allow .env / shell SUPERSET_PTY_DAEMON_SOCKET to take effect when our - // own daemon spawn failed. Set it explicitly only when we have a real - // path; otherwise inherit whatever the parent has. 
const baseEnv: Record = { ...(process.env as Record), ELECTRON_RUN_AS_NODE: "1", @@ -515,9 +489,6 @@ export class HostServiceCoordinator extends EventEmitter { AUTH_TOKEN: config.authToken, CLOUD_API_URL: config.cloudApiUrl, }; - if (ptyDaemonSocket) { - baseEnv.SUPERSET_PTY_DAEMON_SOCKET = ptyDaemonSocket; - } const childEnv = await getProcessEnvWithShellPath(baseEnv); diff --git a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts b/apps/desktop/src/main/lib/pty-daemon-coordinator.ts deleted file mode 100644 index fb7251fb663..00000000000 --- a/apps/desktop/src/main/lib/pty-daemon-coordinator.ts +++ /dev/null @@ -1,403 +0,0 @@ -// PtyDaemonCoordinator — sibling of HostServiceCoordinator, owns the -// per-organization pty-daemon process. Spawns or adopts the daemon and -// returns its Unix-socket path. host-service is told the path via -// SUPERSET_PTY_DAEMON_SOCKET so its DaemonClient can connect. -// -// Why detached spawn (matches host-service's approach): the daemon must -// outlive the desktop main process and host-service restarts. PTY ownership -// lives here so the rest of the system can be killed/restarted freely -// without losing user shells. - -import * as childProcess from "node:child_process"; -import { createHash } from "node:crypto"; -import * as fs from "node:fs"; -import * as net from "node:net"; -import * as os from "node:os"; -import * as path from "node:path"; -import { track } from "./analytics"; -import { SUPERSET_HOME_DIR } from "./app-environment"; -import { isProcessAlive } from "./host-service-manifest"; -import { MAX_HOST_LOG_BYTES, openRotatingLogFd } from "./host-service-utils"; -import { - type PtyDaemonManifest, - ptyDaemonManifestDir, - readPtyDaemonManifest, - removePtyDaemonManifest, - writePtyDaemonManifest, -} from "./pty-daemon-manifest"; - -interface DaemonInstance { - pid: number; - socketPath: string; - startedAt: number; -} - -const SOCKET_READY_TIMEOUT_MS = 5_000; - -/** - * Crash supervision parameters. 
If the daemon for an organization crashes - * more than CRASH_BUDGET times within CRASH_WINDOW_MS, we stop respawning - * and surface a hard error — repeated crashes are a bug, not transient - * recovery. Per the implementation plan's Open Decision #3. - */ -const CRASH_BUDGET = 3; -const CRASH_WINDOW_MS = 60_000; - -/** - * Per-organization socket path. **Must stay short** — Darwin's `sun_path` - * is 104 bytes, and `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon.sock` blows - * past that in dev (worktree-relative SUPERSET_HOME_DIR + 36-char UUID). - * - * We put the socket in `os.tmpdir()` with a hash of the org id. Owner-only - * file mode (0600, set by the daemon's Server.listen) is the auth boundary; - * the directory permissions don't matter. - */ -function ptyDaemonSocketPath(organizationId: string): string { - const shortId = createHash("sha256") - .update(organizationId) - .digest("hex") - .slice(0, 12); - return path.join(os.tmpdir(), `superset-ptyd-${shortId}.sock`); -} - -export interface PtyDaemonCoordinatorOptions { - /** Path to the daemon entry script (e.g. dist/pty-daemon.js). */ - scriptPath: string; -} - -export class PtyDaemonCoordinator { - private readonly opts: PtyDaemonCoordinatorOptions; - private readonly instances = new Map(); - private readonly pendingStarts = new Map>(); - /** Recent crash timestamps per orgId, for the circuit breaker. */ - private readonly crashTimes = new Map(); - /** Orgs we've explicitly stopped — exit isn't a crash, don't respawn. */ - private readonly stopping = new Set(); - /** Orgs that tripped the circuit breaker — refuse respawn until cleared. */ - private readonly circuitOpen = new Set(); - - constructor(opts: PtyDaemonCoordinatorOptions) { - this.opts = opts; - } - - /** - * Has the org tripped the crash circuit breaker? Once tripped, ensure() - * fails fast with a clear error until clearCrashCircuit() is called. 
- */ - isCircuitOpen(organizationId: string): boolean { - return this.circuitOpen.has(organizationId); - } - - /** - * Reset the crash counter and close the circuit. Call this from a UI - * "retry" action after surfacing the error to the user. - */ - clearCrashCircuit(organizationId: string): void { - this.circuitOpen.delete(organizationId); - this.crashTimes.delete(organizationId); - } - - /** - * Spawn the daemon if not already running for this organization, or - * adopt the running one. Returns the socket path host-service should - * connect to. - */ - async ensure(organizationId: string): Promise { - if (this.circuitOpen.has(organizationId)) { - throw new Error( - `[pty-daemon:${organizationId}] crash circuit open: ${CRASH_BUDGET} crashes within ${CRASH_WINDOW_MS / 1000}s. Restart the desktop app to retry.`, - ); - } - const existing = this.instances.get(organizationId); - if (existing) return existing; - const pending = this.pendingStarts.get(organizationId); - if (pending) return pending; - - const startPromise = this.start(organizationId).finally(() => { - this.pendingStarts.delete(organizationId); - }); - this.pendingStarts.set(organizationId, startPromise); - return startPromise; - } - - getSocketPath(organizationId: string): string | null { - return this.instances.get(organizationId)?.socketPath ?? null; - } - - async stop(organizationId: string): Promise { - const instance = this.instances.get(organizationId); - this.instances.delete(organizationId); - if (!instance) return; - // Mark this exit as intentional so the on-exit handler doesn't count - // it toward the crash circuit breaker. - this.stopping.add(organizationId); - try { - process.kill(instance.pid, "SIGTERM"); - } catch { - // Already dead. - } - removePtyDaemonManifest(organizationId); - } - - private async start(organizationId: string): Promise { - // Try to adopt an existing daemon if its manifest is fresh and - // process is alive and the socket is connectable. 
- const adopted = await this.tryAdopt(organizationId); - if (adopted) { - this.instances.set(organizationId, adopted); - console.log( - `[pty-daemon:${organizationId}] adopted existing daemon pid=${adopted.pid}`, - ); - track("pty_daemon_adopt", { - organizationId, - pid: adopted.pid, - ageSeconds: Math.round((Date.now() - adopted.startedAt) / 1000), - }); - return adopted; - } - - // Otherwise spawn a fresh one. - const instance = await this.spawn(organizationId); - track("pty_daemon_spawn", { - organizationId, - pid: instance.pid, - socketPath: instance.socketPath, - }); - return instance; - } - - private async tryAdopt( - organizationId: string, - ): Promise { - const manifest = readPtyDaemonManifest(organizationId); - if (!manifest) return null; - if (!isProcessAlive(manifest.pid)) { - removePtyDaemonManifest(organizationId); - return null; - } - const reachable = await isSocketConnectable(manifest.socketPath, 1000); - if (!reachable) { - // PID alive but socket gone — daemon is wedged. Kill and respawn. - try { - process.kill(manifest.pid, "SIGTERM"); - } catch { - // Already dead. - } - removePtyDaemonManifest(organizationId); - return null; - } - return { - pid: manifest.pid, - socketPath: manifest.socketPath, - startedAt: manifest.startedAt, - }; - } - - private async spawn(organizationId: string): Promise { - const dir = ptyDaemonManifestDir(organizationId); - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); - } - const socketPath = ptyDaemonSocketPath(organizationId); - const logPath = path.join(dir, "pty-daemon.log"); - - // Sanity: refuse to spawn if the script doesn't exist (e.g. dev build - // hasn't produced dist/main/pty-daemon.js yet). Otherwise the spawn - // will silently exit and we wait the full timeout. 
- if (!fs.existsSync(this.opts.scriptPath)) { - throw new Error( - `[pty-daemon:${organizationId}] script not found at ${this.opts.scriptPath} — restart electron-vite dev to bundle the new entry`, - ); - } - - const logFd = openRotatingLogFd(logPath, MAX_HOST_LOG_BYTES); - const stdio: childProcess.StdioOptions = - logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; - - const childEnv = { - ...(process.env as Record), - ELECTRON_RUN_AS_NODE: "1", - ORGANIZATION_ID: organizationId, - SUPERSET_HOME_DIR, - }; - - console.log( - `[pty-daemon:${organizationId}] spawning ${this.opts.scriptPath} → ${socketPath} (log: ${logPath})`, - ); - - let child: ReturnType; - try { - child = childProcess.spawn( - process.execPath, - [this.opts.scriptPath, `--socket=${socketPath}`], - { - detached: true, - stdio, - env: childEnv, - windowsHide: true, - }, - ); - } finally { - if (logFd >= 0) { - try { - fs.closeSync(logFd); - } catch { - // best-effort - } - } - } - - const childPid = child.pid; - if (!childPid) { - throw new Error(`[pty-daemon:${organizationId}] failed to spawn`); - } - - // Capture an early exit so the timeout error reports the actual cause. - let earlyExitCode: number | null = null; - let earlyExitSignal: NodeJS.Signals | null = null; - child.once("exit", (code, signal) => { - earlyExitCode = code; - earlyExitSignal = signal; - }); - - // Wait for the socket file to appear AND become connectable. 
- const ready = await waitForSocket(socketPath, SOCKET_READY_TIMEOUT_MS); - if (!ready) { - try { - child.kill("SIGTERM"); - } catch { - // best-effort - } - let logTail = ""; - try { - const buf = fs.readFileSync(logPath, "utf-8"); - logTail = buf.slice(-2000); - } catch { - logTail = "(no log file written)"; - } - track("pty_daemon_spawn_failed", { - organizationId, - reason: "socket-not-ready", - timeoutMs: SOCKET_READY_TIMEOUT_MS, - earlyExitCode, - earlyExitSignal, - }); - throw new Error( - `[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms (childPid=${childPid}, earlyExit=${earlyExitCode ?? earlyExitSignal ?? "still alive"}). Log tail:\n${logTail}`, - ); - } - - child.unref(); - child.on("exit", (code) => { - console.log(`[pty-daemon:${organizationId}] exited with code ${code}`); - const current = this.instances.get(organizationId); - if (current?.pid !== childPid) return; - this.instances.delete(organizationId); - removePtyDaemonManifest(organizationId); - - // Was this exit intentional (we called stop)? If so, no crash - // accounting and no respawn. - if (this.stopping.has(organizationId)) { - this.stopping.delete(organizationId); - return; - } - - // Unexpected exit — record the crash and decide whether to - // auto-respawn or trip the circuit breaker. - const now = Date.now(); - const recent = (this.crashTimes.get(organizationId) ?? 
[]).filter( - (t) => now - t < CRASH_WINDOW_MS, - ); - recent.push(now); - this.crashTimes.set(organizationId, recent); - - track("pty_daemon_crash", { - organizationId, - exitCode: code, - crashesInWindow: recent.length, - windowSeconds: CRASH_WINDOW_MS / 1000, - ageSeconds: Math.round((now - current.startedAt) / 1000), - }); - - if (recent.length > CRASH_BUDGET) { - this.circuitOpen.add(organizationId); - console.error( - `[pty-daemon:${organizationId}] crash circuit OPEN — ${recent.length} crashes in ${CRASH_WINDOW_MS / 1000}s; refusing further respawns until clearCrashCircuit() is called`, - ); - track("pty_daemon_circuit_open", { - organizationId, - crashesInWindow: recent.length, - }); - return; - } - - console.warn( - `[pty-daemon:${organizationId}] auto-respawning after unexpected exit (${recent.length}/${CRASH_BUDGET} in window)`, - ); - void this.ensure(organizationId).catch((err) => { - console.error( - `[pty-daemon:${organizationId}] auto-respawn failed:`, - err, - ); - }); - }); - - const startedAt = Date.now(); - const manifest: PtyDaemonManifest = { - pid: childPid, - socketPath, - protocolVersions: [1], - daemonVersion: "unknown", // filled in by hello-ack on first connect - startedAt, - organizationId, - }; - writePtyDaemonManifest(manifest); - - const instance: DaemonInstance = { - pid: childPid, - socketPath, - startedAt, - }; - this.instances.set(organizationId, instance); - console.log( - `[pty-daemon:${organizationId}] spawned pid=${childPid} socket=${socketPath}`, - ); - return instance; - } -} - -async function waitForSocket( - socketPath: string, - timeoutMs: number, -): Promise { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (fs.existsSync(socketPath)) { - if (await isSocketConnectable(socketPath, 200)) return true; - } - await new Promise((r) => setTimeout(r, 50)); - } - return false; -} - -function isSocketConnectable( - socketPath: string, - timeoutMs: number, -): Promise { - return new 
Promise((resolve) => { - const sock = net.createConnection({ path: socketPath }); - const timer = setTimeout(() => { - sock.destroy(); - resolve(false); - }, timeoutMs); - sock.once("connect", () => { - clearTimeout(timer); - sock.end(); - resolve(true); - }); - sock.once("error", () => { - clearTimeout(timer); - resolve(false); - }); - }); -} diff --git a/apps/desktop/src/main/pty-daemon/index.ts b/apps/desktop/src/main/pty-daemon/index.ts deleted file mode 100644 index c97755a175c..00000000000 --- a/apps/desktop/src/main/pty-daemon/index.ts +++ /dev/null @@ -1,57 +0,0 @@ -/** - * pty-daemon — Desktop Entry Point - * - * Long-lived process that owns all PTY sessions. host-service is a client - * over a Unix socket. The PtyDaemonCoordinator (sibling of - * HostServiceCoordinator) spawns this and passes --socket=PATH. - * - * Mirrors the host-service entry shape: imports from the workspace package - * and provides the bare runtime glue (argv parsing, signal handling). - */ - -import { Server } from "@superset/pty-daemon"; - -interface CliArgs { - socket: string; -} - -function parseArgs(argv: string[]): CliArgs { - const args: Partial = {}; - for (const arg of argv) { - if (arg.startsWith("--socket=")) { - args.socket = arg.slice("--socket=".length); - } - } - if (!args.socket) { - throw new Error("--socket=PATH is required"); - } - return args as CliArgs; -} - -async function main(): Promise { - const args = parseArgs(process.argv.slice(2)); - const daemonVersion = process.env.SUPERSET_PTY_DAEMON_VERSION ?? 
"0.1.0"; - const server = new Server({ - socketPath: args.socket, - daemonVersion, - }); - await server.listen(); - process.stderr.write( - `[pty-daemon] listening on ${args.socket} (v${daemonVersion})\n`, - ); - - const shutdown = async (signal: NodeJS.Signals) => { - process.stderr.write(`[pty-daemon] received ${signal}, shutting down\n`); - await server.close(); - process.exit(0); - }; - process.on("SIGINT", () => void shutdown("SIGINT")); - process.on("SIGTERM", () => void shutdown("SIGTERM")); -} - -void main().catch((error) => { - process.stderr.write( - `[pty-daemon] failed to start: ${(error as Error).stack ?? error}\n`, - ); - process.exit(1); -}); diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx index 502ace48ab9..f2195247076 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/TerminalSettings.tsx @@ -9,6 +9,7 @@ import { LinkBehaviorSetting } from "./components/LinkBehaviorSetting"; import { PresetsSection } from "./components/PresetsSection"; import { SessionsSection } from "./components/SessionsSection"; import { V2PresetsSection } from "./components/V2PresetsSection"; +import { V2SessionsSection } from "./components/V2SessionsSection"; interface TerminalSettingsProps { visibleItems?: SettingItemId[] | null; @@ -97,7 +98,12 @@ export function TerminalSettings({ /> ))} {showLinkBehavior && } - {showSessions && } + {showSessions && + (isV2CloudEnabled ? 
( + + ) : ( + + ))} ); diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx new file mode 100644 index 00000000000..2b65e181aab --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx @@ -0,0 +1,251 @@ +// V2 Settings → Terminal → Manage daemon section. +// +// Talks to host-service's `terminal.daemon` namespace — the supervisor +// that owns pty-daemon's lifecycle lives there, not in desktop main. +// What's *not* duplicated from v1: kill-all-sessions, clear-history, +// per-row kill. Restart already achieves the kill-all effect for v2; +// scrollback is owned per-session by the daemon's ring buffer with no +// disk persistence; per-row kill belongs in the renderer's pane controls. 
+ +import { + AlertDialog, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from "@superset/ui/alert-dialog"; +import { Button } from "@superset/ui/button"; +import { Label } from "@superset/ui/label"; +import { toast } from "@superset/ui/sonner"; +import { workspaceTrpc } from "@superset/workspace-client"; +import { useState } from "react"; + +const REFETCH_WHILE_OPEN_MS = 5_000; + +export function V2SessionsSection() { + const [confirmRestartOpen, setConfirmRestartOpen] = useState(false); + const [showSessionList, setShowSessionList] = useState(false); + + const updateStatusQuery = + workspaceTrpc.terminal.daemon.getUpdateStatus.useQuery(undefined, { + refetchOnWindowFocus: true, + }); + const sessionsQuery = workspaceTrpc.terminal.daemon.listSessions.useQuery( + undefined, + { + // Poll while the user keeps the list expanded — sessions + // die/come up while they watch. Otherwise refetch on focus only. + refetchInterval: showSessionList ? REFETCH_WHILE_OPEN_MS : false, + refetchOnWindowFocus: true, + }, + ); + + const restartDaemon = workspaceTrpc.terminal.daemon.restart.useMutation({ + onSuccess: () => { + const versions = updateStatusQuery.data; + toast.success("Daemon restarted", { + description: + versions && versions.running !== versions.expected + ? `Now running ${versions.expected} (was ${versions.running}). All sessions were closed.` + : "All sessions were closed and a fresh daemon is running.", + }); + void updateStatusQuery.refetch(); + void sessionsQuery.refetch(); + }, + onError: (error) => { + toast.error("Failed to restart daemon", { description: error.message }); + }, + }); + + const sessions = sessionsQuery.data ?? null; + const aliveCount = + sessions === null ? 
null : sessions.filter((s) => s.alive).length; + const updatePending = updateStatusQuery.data?.pending === true; + const versions = updateStatusQuery.data; + + const sessionCountLabel = (() => { + if (sessions === null) return "Daemon unavailable"; + if (aliveCount === 0) return "No sessions running"; + return `${aliveCount} session${aliveCount === 1 ? "" : "s"} running`; + })(); + + const versionLabel = (() => { + if (!versions) return null; + if (versions.running === "unknown") { + return `bundled ${versions.expected}`; + } + if (updatePending) { + return `${versions.running} → ${versions.expected} pending`; + } + return versions.running; + })(); + + return ( + <> +
+
+
+ +

+ The terminal daemon owns all PTY sessions. It survives app + restarts so your shells, builds, and agents keep running. +

+
+ +
+ +
+ + 0 + ? "size-1.5 rounded-full bg-emerald-500" + : "size-1.5 rounded-full bg-muted-foreground/60" + } + /> + {sessionCountLabel} + + {versionLabel ? ( + + {versionLabel} + + ) : null} + {updatePending ? ( + + Update available + + ) : null} +
+ +
+ + +
+ + {showSessionList && sessions && sessions.length > 0 ? ( +
+
+ + + + + + + + + + + {sessions.map((s) => ( + + + + + + + ))} + +
SessionPIDSizeStatus
{s.id} + {s.pid || "—"} + + {s.cols}×{s.rows} + + + {s.alive ? "Alive" : "Exited"} + +
+
+
+ ) : null} +
+ + + + + + {updatePending + ? "Restart and apply update?" + : "Restart terminal daemon?"} + + +
+ + This closes every terminal session for your organization + {aliveCount && aliveCount > 0 + ? ` (${aliveCount} running)` + : ""}{" "} + and starts a fresh daemon. + + {updatePending && versions ? ( + + Restarting will load{" "} + {versions.expected}{" "} + (currently running{" "} + {versions.running}). + + ) : null} +
+
+
+ + + + +
+
+ + ); +} diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts new file mode 100644 index 00000000000..13f0812ce81 --- /dev/null +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/index.ts @@ -0,0 +1 @@ +export { V2SessionsSection } from "./V2SessionsSection"; diff --git a/bun.lock b/bun.lock index 9ab19d94e0b..c681b381495 100644 --- a/bun.lock +++ b/bun.lock @@ -783,6 +783,7 @@ "hono": "^4.8.5", "mastracode": "0.15.0-alpha.3", "node-pty": "1.1.0", + "semver": "^7.7.4", "simple-git": "^3.30.0", "superjson": "^2.2.5", "tree-kill": "^1.2.2", @@ -792,6 +793,7 @@ "@superset/typescript": "workspace:*", "@types/better-sqlite3": "^7.6.13", "@types/node": "^24.9.1", + "@types/semver": "^7.7.1", "bun-types": "^1.3.1", "drizzle-kit": "0.31.8", "typescript": "^5.9.3", diff --git a/packages/host-service/package.json b/packages/host-service/package.json index 12f9e43638c..104bd93d05d 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -62,6 +62,7 @@ "hono": "^4.8.5", "mastracode": "0.15.0-alpha.3", "node-pty": "1.1.0", + "semver": "^7.7.4", "simple-git": "^3.30.0", "superjson": "^2.2.5", "tree-kill": "^1.2.2", @@ -71,6 +72,7 @@ "@superset/typescript": "workspace:*", "@types/better-sqlite3": "^7.6.13", "@types/node": "^24.9.1", + "@types/semver": "^7.7.1", "bun-types": "^1.3.1", "drizzle-kit": "0.31.8", "typescript": "^5.9.3" diff --git a/packages/host-service/src/daemon/DaemonSupervisor.test.ts b/packages/host-service/src/daemon/DaemonSupervisor.test.ts new file mode 100644 index 00000000000..bae54c71eaa --- /dev/null +++ b/packages/host-service/src/daemon/DaemonSupervisor.test.ts @@ -0,0 +1,450 @@ +// Tests for the DaemonSupervisor: +// - 
probeDaemonVersion (one-shot hello/hello-ack against an in-process +// fake daemon — exercises the *real* probe code, not a parallel impl) +// - update-pending event debouncing on adoption +// - getUpdateStatus semantics +// - restart() race-await + circuit-clear semantics +// +// Telemetry events are emitted as structured `console.log` lines (per the +// host-service-migration plan, decision D2). We spy on console.log and +// filter for our component prefix. + +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { + type ClientMessage, + encodeFrame, + FrameDecoder, +} from "@superset/pty-daemon/protocol"; +import { DaemonSupervisor, probeDaemonVersion } from "./DaemonSupervisor.ts"; + +// Capture supervisor-emitted log events. We replace console.log for the +// duration of the test, then filter for our supervisor's component prefix. +const loggedEvents: { event: string; props: Record }[] = []; +const realConsoleLog = console.log; + +beforeEach(() => { + loggedEvents.length = 0; + console.log = (...args: unknown[]) => { + // Try to parse the first arg as JSON — supervisor logs in JSON; + // non-JSON lines (e.g. plain "[pty-daemon:...] adopted ...") fall + // through silently. 
+ const first = args[0]; + if (typeof first === "string") { + try { + const parsed = JSON.parse(first) as Record; + if (parsed.component === "pty-daemon-supervisor") { + const { event, ...props } = parsed; + loggedEvents.push({ event: String(event), props }); + return; + } + } catch { + // not JSON, fall through + } + } + // keep one breadcrumb for debugging on test failure + realConsoleLog(...args); + }; +}); + +afterEach(() => { + console.log = realConsoleLog; +}); + +interface FakeDaemonOptions { + respondWithVersion?: string; + respondRaw?: Buffer; + hangUpAfterHello?: boolean; + respondWithWrongMessageFirst?: boolean; + silent?: boolean; +} + +async function startFakeDaemon(opts: FakeDaemonOptions): Promise<{ + socketPath: string; + close: () => Promise; +}> { + const socketPath = path.join( + os.tmpdir(), + `fake-pty-daemon-${process.pid}-${Math.random().toString(36).slice(2, 8)}.sock`, + ); + const server = net.createServer((sock) => { + const decoder = new FrameDecoder(); + sock.on("data", (chunk: Buffer) => { + decoder.push(chunk); + for (const raw of decoder.drain()) { + const msg = raw as ClientMessage; + if (msg.type !== "hello") continue; + if (opts.silent) return; + if (opts.hangUpAfterHello) { + sock.end(); + return; + } + if (opts.respondRaw) { + sock.write(opts.respondRaw); + return; + } + if (opts.respondWithWrongMessageFirst) { + sock.write( + encodeFrame({ + type: "error", + code: "EBOGUS", + message: "test", + }), + ); + return; + } + if (opts.respondWithVersion) { + sock.write( + encodeFrame({ + type: "hello-ack", + protocol: 1, + daemonVersion: opts.respondWithVersion, + }), + ); + return; + } + } + }); + sock.on("error", () => {}); + }); + await new Promise((resolve) => server.listen(socketPath, resolve)); + return { + socketPath, + close: () => + new Promise((resolve) => { + server.close(() => resolve()); + }), + }; +} + +describe("probeDaemonVersion", () => { + test("returns daemonVersion on valid hello-ack", async () => { + const fake = 
await startFakeDaemon({ respondWithVersion: "0.1.0" }); + try { + expect(await probeDaemonVersion(fake.socketPath, 1500)).toBe("0.1.0"); + } finally { + await fake.close(); + } + }); + + test("returns null when there is no listener on the socket path", async () => { + const dead = path.join( + os.tmpdir(), + `nonexistent-${process.pid}-${Math.random().toString(36).slice(2, 8)}.sock`, + ); + expect(await probeDaemonVersion(dead, 500)).toBeNull(); + }); + + test("returns null on probe timeout (silent daemon)", async () => { + const fake = await startFakeDaemon({ silent: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 200)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null when daemon hangs up before hello-ack", async () => { + const fake = await startFakeDaemon({ hangUpAfterHello: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 1500)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null on malformed/garbage response", async () => { + const fake = await startFakeDaemon({ + respondRaw: Buffer.from([0x00, 0xff, 0xab, 0xcd]), + }); + try { + expect(await probeDaemonVersion(fake.socketPath, 800)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("returns null when daemon sends a non-hello-ack message first", async () => { + const fake = await startFakeDaemon({ respondWithWrongMessageFirst: true }); + try { + expect(await probeDaemonVersion(fake.socketPath, 800)).toBeNull(); + } finally { + await fake.close(); + } + }); + + test("does not leak sockets across many invocations", async () => { + const fake = await startFakeDaemon({ respondWithVersion: "0.1.0" }); + try { + for (let i = 0; i < 50; i++) { + expect(await probeDaemonVersion(fake.socketPath, 1000)).toBe("0.1.0"); + } + } finally { + await fake.close(); + } + }); +}); + +describe("DaemonSupervisor.getUpdateStatus", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new 
DaemonSupervisor({ scriptPath: "/nonexistent" }); + }); + + test("returns null when no instance is registered", () => { + expect(sup.getUpdateStatus("org-no-such")).toBeNull(); + }); + + test("reflects updatePending=false for fresh-spawned instances", () => { + seedInstance(sup, "org-fresh", { + runningVersion: "0.1.0", + expectedVersion: "0.1.0", + updatePending: false, + }); + expect(sup.getUpdateStatus("org-fresh")).toEqual({ + pending: false, + running: "0.1.0", + expected: "0.1.0", + }); + }); + + test("reflects updatePending=true for stale-adopted instances", () => { + seedInstance(sup, "org-stale", { + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + updatePending: true, + }); + expect(sup.getUpdateStatus("org-stale")).toEqual({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + }); + }); + + test("'unknown' running version surfaces but is never pending", () => { + seedInstance(sup, "org-probe-failed", { + runningVersion: "unknown", + expectedVersion: "0.1.0", + updatePending: false, + }); + const status = sup.getUpdateStatus("org-probe-failed"); + expect(status?.pending).toBe(false); + expect(status?.running).toBe("unknown"); + }); +}); + +describe("update-pending event debounce", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new DaemonSupervisor({ scriptPath: "/nonexistent" }); + }); + + test("logs once per (running,expected) pair", () => { + const adopted = staleInstance("0.0.9"); + invokeMaybeFire(sup, "org", adopted); + invokeMaybeFire(sup, "org", adopted); + invokeMaybeFire(sup, "org", adopted); + const updateLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_update_pending", + ); + expect(updateLogs).toHaveLength(1); + expect(updateLogs[0]?.props).toMatchObject({ + organizationId: "org", + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + }); + }); + + test("re-fires when the running version changes", () => { + invokeMaybeFire(sup, "org", staleInstance("0.0.8")); + invokeMaybeFire(sup, "org", 
staleInstance("0.0.9")); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); + + test("clears debounce when an instance becomes non-pending", () => { + invokeMaybeFire(sup, "org", staleInstance("0.0.9")); + invokeMaybeFire(sup, "org", freshInstance()); + invokeMaybeFire(sup, "org", staleInstance("0.0.9")); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); + + test("does not fire when updatePending is false", () => { + invokeMaybeFire(sup, "org", freshInstance()); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(0); + }); + + test("debounce is per-organization", () => { + const stale = staleInstance("0.0.9"); + invokeMaybeFire(sup, "org-a", stale); + invokeMaybeFire(sup, "org-b", stale); + expect( + loggedEvents.filter((e) => e.event === "pty_daemon_update_pending"), + ).toHaveLength(2); + }); +}); + +describe("DaemonSupervisor.restart", () => { + let sup: DaemonSupervisor; + + beforeEach(() => { + sup = new DaemonSupervisor({ scriptPath: "/nonexistent" }); + (sup as unknown as { stop: typeof sup.stop }).stop = mock( + async () => {}, + ) as typeof sup.stop; + (sup as unknown as { ensure: typeof sup.ensure }).ensure = mock(async () => + freshInstance(), + ) as typeof sup.ensure; + }); + + test("logs pty_daemon_user_restart with previous-version context", async () => { + seedInstance(sup, "org-restart", { + runningVersion: "0.0.9", + expectedVersion: "0.1.0", + updatePending: true, + }); + await sup.restart("org-restart"); + const restartLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_user_restart", + ); + expect(restartLogs).toHaveLength(1); + expect(restartLogs[0]?.props).toMatchObject({ + organizationId: "org-restart", + previousRunningVersion: "0.0.9", + previousExpectedVersion: "0.1.0", + previousUpdatePending: true, + hadCircuitOpen: false, + }); + }); + + test("clears the crash circuit 
so the user can recover from a tripped breaker", async () => { + (sup as unknown as { circuitOpen: Set }).circuitOpen.add( + "org-tripped", + ); + (sup as unknown as { crashTimes: Map }).crashTimes.set( + "org-tripped", + [1, 2, 3, 4], + ); + + await sup.restart("org-tripped"); + + expect(sup.isCircuitOpen("org-tripped")).toBe(false); + expect( + (sup as unknown as { crashTimes: Map }).crashTimes.get( + "org-tripped", + ), + ).toBeUndefined(); + + const restartLogs = loggedEvents.filter( + (e) => e.event === "pty_daemon_user_restart", + ); + expect(restartLogs[0]?.props).toMatchObject({ hadCircuitOpen: true }); + }); + + test("awaits an in-flight pendingStart before stopping", async () => { + let resolvePending: (value: unknown) => void = () => {}; + const pendingPromise = new Promise((resolve) => { + resolvePending = resolve; + }); + ( + sup as unknown as { pendingStarts: Map> } + ).pendingStarts.set("org-racey", pendingPromise); + + const stopMock = (sup as unknown as { stop: ReturnType }).stop; + const restartPromise = sup.restart("org-racey"); + + await new Promise((r) => setTimeout(r, 10)); + expect(stopMock).not.toHaveBeenCalled(); + + resolvePending({}); + await restartPromise; + expect(stopMock).toHaveBeenCalledTimes(1); + }); + + test("falls through cleanly if the pendingStart rejects", async () => { + const failingPending = Promise.reject(new Error("spawn failed")); + failingPending.catch(() => {}); + ( + sup as unknown as { pendingStarts: Map> } + ).pendingStarts.set("org-failed-spawn", failingPending); + + await expect(sup.restart("org-failed-spawn")).resolves.toEqual({ + success: true, + }); + }); + + test("returns success only after ensure resolves", async () => { + const ensureMock = mock(async () => freshInstance()); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + const result = await sup.restart("org-ok"); + expect(result).toEqual({ success: true }); + expect(ensureMock).toHaveBeenCalledTimes(1); + 
}); +}); + +// ---------------- helpers ---------------- + +interface SeededFields { + runningVersion: string; + expectedVersion: string; + updatePending: boolean; +} + +function seedInstance( + sup: DaemonSupervisor, + organizationId: string, + fields: SeededFields, +): void { + const instances = (sup as unknown as { instances: Map }) + .instances; + instances.set(organizationId, { + pid: 9999, + socketPath: "/tmp/seeded.sock", + startedAt: Date.now(), + ...fields, + }); +} + +function freshInstance() { + return { + pid: 1234, + socketPath: "/tmp/fresh.sock", + startedAt: Date.now(), + runningVersion: "0.1.0", + expectedVersion: "0.1.0", + updatePending: false, + }; +} + +function staleInstance(running: string) { + return { + pid: 1234, + socketPath: "/tmp/stale.sock", + startedAt: Date.now(), + runningVersion: running, + expectedVersion: "0.1.0", + updatePending: true, + }; +} + +function invokeMaybeFire( + sup: DaemonSupervisor, + organizationId: string, + instance: ReturnType, +): void { + ( + sup as unknown as { + maybeFireUpdatePending: (id: string, inst: typeof instance) => void; + } + ).maybeFireUpdatePending(organizationId, instance); +} diff --git a/packages/host-service/src/daemon/DaemonSupervisor.ts b/packages/host-service/src/daemon/DaemonSupervisor.ts new file mode 100644 index 00000000000..ca4cc39a65a --- /dev/null +++ b/packages/host-service/src/daemon/DaemonSupervisor.ts @@ -0,0 +1,686 @@ +// DaemonSupervisor — owns the per-organization pty-daemon process for +// host-service. Spawns or adopts the daemon and exposes its socket path +// via getSocketPath(orgId). PTY ownership lives here so host-service can +// crash/restart freely without losing user shells. +// +// History: this used to live in the desktop main process +// (`apps/desktop/src/main/lib/pty-daemon-coordinator.ts`). It moved here +// so host-service can be deployed independently of Electron — see +// `apps/desktop/plans/20260430-pty-daemon-host-service-migration.md`. 
+
+import * as childProcess from "node:child_process";
+import { createHash } from "node:crypto";
+import * as fs from "node:fs";
+import * as net from "node:net";
+import * as os from "node:os";
+import * as path from "node:path";
+import {
+	CURRENT_PROTOCOL_VERSION,
+	encodeFrame,
+	FrameDecoder,
+	type ServerMessage,
+	type SessionInfo,
+} from "@superset/pty-daemon/protocol";
+import semver from "semver";
+import { EXPECTED_DAEMON_VERSION } from "./expected-version.ts";
+import { MAX_DAEMON_LOG_BYTES, openRotatingLogFd } from "./log-fd.ts";
+import {
+	isProcessAlive,
+	type PtyDaemonManifest,
+	ptyDaemonManifestDir,
+	readPtyDaemonManifest,
+	removePtyDaemonManifest,
+	writePtyDaemonManifest,
+} from "./manifest.ts";
+
+interface DaemonInstance {
+	pid: number;
+	socketPath: string;
+	startedAt: number;
+	/** Version reported by the running daemon's hello-ack. "unknown" if probe failed. */
+	runningVersion: string;
+	/** Bundled-binary version we expect — i.e. EXPECTED_DAEMON_VERSION at spawn time. */
+	expectedVersion: string;
+	/** True when running < expected. Probe failure does NOT set this. */
+	updatePending: boolean;
+}
+
+const SOCKET_READY_TIMEOUT_MS = 5_000;
+const VERSION_PROBE_TIMEOUT_MS = 1_500;
+
+/**
+ * Crash supervision parameters. If the daemon for an organization crashes
+ * more than CRASH_BUDGET times within CRASH_WINDOW_MS, we stop respawning
+ * and surface a hard error — repeated crashes are a bug, not transient
+ * recovery.
+ */
+const CRASH_BUDGET = 3;
+const CRASH_WINDOW_MS = 60_000;
+
+/**
+ * Per-organization socket path. **Must stay short** — Darwin's `sun_path`
+ * is 104 bytes, and `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon.sock` blows
+ * past that in dev (worktree-relative SUPERSET_HOME_DIR + 36-char UUID).
+ *
+ * We put the socket in `os.tmpdir()` with a hash of the org id. Owner-only
+ * file mode (0600, set by the daemon's Server.listen) is the auth boundary;
+ * the directory permissions don't matter.
+ */
+function ptyDaemonSocketPath(organizationId: string): string {
+	const shortId = createHash("sha256")
+		.update(organizationId)
+		.digest("hex")
+		.slice(0, 12);
+	return path.join(os.tmpdir(), `superset-ptyd-${shortId}.sock`);
+}
+
+/**
+ * Structured log helper. Replaces the desktop's `track(...)` calls — we
+ * keep the same event names + props so any future telemetry slice can
+ * lift them straight back into PostHog.
+ */
+function logEvent(event: string, props: Record<string, unknown>): void {
+	console.log(
+		JSON.stringify({ component: "pty-daemon-supervisor", event, ...props }),
+	);
+}
+
+export interface DaemonSupervisorOptions {
+	/** Path to the daemon entry script (e.g. `dist/pty-daemon.js`). */
+	scriptPath: string;
+}
+
+export class DaemonSupervisor {
+	private readonly opts: DaemonSupervisorOptions;
+	private readonly instances = new Map<string, DaemonInstance>();
+	private readonly pendingStarts = new Map<string, Promise<DaemonInstance>>();
+	/** Recent crash timestamps per orgId, for the circuit breaker. */
+	private readonly crashTimes = new Map<string, number[]>();
+	/** Orgs we've explicitly stopped — exit isn't a crash, don't respawn. */
+	private readonly stopping = new Set<string>();
+	/** Orgs that tripped the circuit breaker — refuse respawn until cleared. */
+	private readonly circuitOpen = new Set<string>();
+	/**
+	 * Last (orgId → "running:expected") pair we logged update-pending for.
+	 * Debounce — re-fire only when either side changes.
+	 */
+	private readonly lastUpdatePendingPair = new Map<string, string>();
+
+	constructor(opts: DaemonSupervisorOptions) {
+		this.opts = opts;
+	}
+
+	/**
+	 * Has the org tripped the crash circuit breaker? Once tripped, ensure()
+	 * fails fast with a clear error until clearCrashCircuit() is called.
+	 */
+	isCircuitOpen(organizationId: string): boolean {
+		return this.circuitOpen.has(organizationId);
+	}
+
+	/**
+	 * Reset the crash counter and close the circuit. Called from a UI
+	 * "retry" action after surfacing the error to the user.
+	 */
+	clearCrashCircuit(organizationId: string): void {
+		this.circuitOpen.delete(organizationId);
+		this.crashTimes.delete(organizationId);
+	}
+
+	/**
+	 * Returns whether the running daemon is older than the bundled binary.
+	 * Null when we have no instance for this org. `running === "unknown"`
+	 * means the version probe failed during adoption — treat as not-pending
+	 * (probe failure ≠ stale).
+	 */
+	getUpdateStatus(
+		organizationId: string,
+	): { pending: boolean; running: string; expected: string } | null {
+		const instance = this.instances.get(organizationId);
+		if (!instance) return null;
+		return {
+			pending: instance.updatePending,
+			running: instance.runningVersion,
+			expected: instance.expectedVersion,
+		};
+	}
+
+	/**
+	 * Explicitly restart the daemon for an org — kills sessions, spawns
+	 * fresh. The user has opted in via UI confirmation. Distinct from
+	 * crash-respawn: clears the crash circuit (if open) and emits its own
+	 * event so logs can separate intent from recovery.
+	 *
+	 * Awaits any in-flight spawn before stopping so we never SIGTERM a
+	 * partially-initialized child.
+	 */
+	async restart(organizationId: string): Promise<{ success: true }> {
+		const prev = this.instances.get(organizationId);
+		const hadCircuitOpen = this.circuitOpen.has(organizationId);
+
+		const pending = this.pendingStarts.get(organizationId);
+		if (pending) {
+			try {
+				await pending;
+			} catch {
+				// Failed in-flight spawn — nothing to stop, ensure() will retry.
+			}
+		}
+
+		await this.stop(organizationId);
+		this.clearCrashCircuit(organizationId);
+
+		logEvent("pty_daemon_user_restart", {
+			organizationId,
+			hadCircuitOpen,
+			previousRunningVersion: prev?.runningVersion ?? null,
+			previousExpectedVersion: prev?.expectedVersion ?? null,
+			previousUpdatePending: prev?.updatePending ?? null,
+		});
+
+		await this.ensure(organizationId);
+		return { success: true };
+	}
+
+	/**
+	 * Spawn the daemon if not already running for this organization, or
+	 * adopt the running one. Returns the instance metadata.
+	 */
+	async ensure(organizationId: string): Promise<DaemonInstance> {
+		if (this.circuitOpen.has(organizationId)) {
+			throw new Error(
+				`[pty-daemon:${organizationId}] crash circuit open: ${CRASH_BUDGET} crashes within ${CRASH_WINDOW_MS / 1000}s. Restart the host-service to retry.`,
+			);
+		}
+		const existing = this.instances.get(organizationId);
+		if (existing) return existing;
+		// Share one in-flight start per org so concurrent callers never
+		// spawn two daemons.
+		const pending = this.pendingStarts.get(organizationId);
+		if (pending) return pending;
+
+		const startPromise = this.start(organizationId).finally(() => {
+			this.pendingStarts.delete(organizationId);
+		});
+		this.pendingStarts.set(organizationId, startPromise);
+		return startPromise;
+	}
+
+	/** Socket path of the registered daemon for this org, or null if none. */
+	getSocketPath(organizationId: string): string | null {
+		return this.instances.get(organizationId)?.socketPath ?? null;
+	}
+
+	/**
+	 * Live session list from the running daemon. Null when there is no
+	 * daemon for the org, the socket is unreachable, or the request times
+	 * out — the caller treats null as "unknown" (distinct from `[]` which
+	 * means "daemon up, no sessions").
+	 */
+	async listSessions(
+		organizationId: string,
+		timeoutMs = 1500,
+	): Promise<SessionInfo[] | null> {
+		const socketPath = this.getSocketPath(organizationId);
+		if (!socketPath) return null;
+		return listDaemonSessions(socketPath, timeoutMs);
+	}
+
+	/**
+	 * Forget the org's instance, SIGTERM its pid and remove the manifest.
+	 * No-op when no instance is registered. Does not wait for the process
+	 * to exit.
+	 */
+	async stop(organizationId: string): Promise<void> {
+		const instance = this.instances.get(organizationId);
+		this.instances.delete(organizationId);
+		if (!instance) return;
+		this.stopping.add(organizationId);
+		try {
+			process.kill(instance.pid, "SIGTERM");
+		} catch {
+			// Already dead.
+		}
+		removePtyDaemonManifest(organizationId);
+	}
+
+	/**
+	 * Bring a daemon up for the org: adopt the one recorded in the manifest
+	 * when possible, otherwise spawn a new one. Only called via ensure(),
+	 * which de-duplicates concurrent starts.
+	 */
+	private async start(organizationId: string): Promise<DaemonInstance> {
+		// stop() marks the org as `stopping`, but that mark is never consumed:
+		// the stopped child's exit handler returns on its pid guard (stop()
+		// has already dropped the instance), and adopted daemons have no exit
+		// handler at all. Drop the stale mark before bringing up a new daemon,
+		// otherwise that daemon's first crash would be mistaken for an
+		// intentional stop and never respawned.
+		this.stopping.delete(organizationId);
+
+		const adopted = await this.tryAdopt(organizationId);
+		if (adopted) {
+			this.instances.set(organizationId, adopted);
+			console.log(
+				`[pty-daemon:${organizationId}] adopted existing daemon pid=${adopted.pid} runningVersion=${adopted.runningVersion} updatePending=${adopted.updatePending}`,
+			);
+			logEvent("pty_daemon_adopt", {
+				organizationId,
+				pid: adopted.pid,
+				ageSeconds: Math.round((Date.now() - adopted.startedAt) / 1000),
+				runningVersion: adopted.runningVersion,
+				expectedVersion: adopted.expectedVersion,
+				updatePending: adopted.updatePending,
+			});
+			this.maybeFireUpdatePending(organizationId, adopted);
+			return adopted;
+		}
+
+		const instance = await this.spawn(organizationId);
+		logEvent("pty_daemon_spawn", {
+			organizationId,
+			pid: instance.pid,
+			socketPath: instance.socketPath,
+			daemonVersion: instance.runningVersion,
+		});
+		this.lastUpdatePendingPair.delete(organizationId);
+		return instance;
+	}
+
+	/**
+	 * Log `pty_daemon_update_pending` once per (running, expected) pair so
+	 * adopting the same stale daemon repeatedly doesn't spam logs.
+	 */
+	private maybeFireUpdatePending(
+		organizationId: string,
+		instance: DaemonInstance,
+	): void {
+		if (!instance.updatePending) {
+			this.lastUpdatePendingPair.delete(organizationId);
+			return;
+		}
+		const pair = `${instance.runningVersion}:${instance.expectedVersion}`;
+		if (this.lastUpdatePendingPair.get(organizationId) === pair) return;
+		this.lastUpdatePendingPair.set(organizationId, pair);
+		logEvent("pty_daemon_update_pending", {
+			organizationId,
+			runningVersion: instance.runningVersion,
+			expectedVersion: instance.expectedVersion,
+		});
+	}
+
+	/**
+	 * Adopt the daemon recorded in the org's manifest, if it is usable.
+	 * Returns null (after removing the manifest) when the pid is dead or
+	 * the socket does not accept a connection; in the latter case the
+	 * still-alive pid is SIGTERMed first. A failed version probe does not
+	 * block adoption — the instance is reported as version "unknown".
+	 */
+	private async tryAdopt(
+		organizationId: string,
+	): Promise<DaemonInstance | null> {
+		const manifest = readPtyDaemonManifest(organizationId);
+		if (!manifest) return null;
+		if (!isProcessAlive(manifest.pid)) {
+			removePtyDaemonManifest(organizationId);
+			return null;
+		}
+		const reachable = await isSocketConnectable(manifest.socketPath, 1000);
+		if (!reachable) {
+			// PID alive but socket gone — daemon is wedged. Kill and respawn.
+			try {
+				process.kill(manifest.pid, "SIGTERM");
+			} catch {
+				// Already dead.
+			}
+			removePtyDaemonManifest(organizationId);
+			return null;
+		}
+
+		const probed = await probeDaemonVersion(
+			manifest.socketPath,
+			VERSION_PROBE_TIMEOUT_MS,
+		);
+		const runningVersion = probed ?? "unknown";
+		const updatePending =
+			!!probed && !semver.satisfies(probed, `>=${EXPECTED_DAEMON_VERSION}`);
+
+		return {
+			pid: manifest.pid,
+			socketPath: manifest.socketPath,
+			startedAt: manifest.startedAt,
+			runningVersion,
+			expectedVersion: EXPECTED_DAEMON_VERSION,
+			updatePending,
+		};
+	}
+
+	/**
+	 * Launch a detached daemon child, wait for its socket to accept
+	 * connections, install crash supervision, write the manifest and
+	 * register the instance.
+	 *
+	 * Throws when the script is missing, the child has no pid, or the
+	 * socket is not connectable within SOCKET_READY_TIMEOUT_MS (the error
+	 * then carries the tail of the daemon log).
+	 */
+	private async spawn(organizationId: string): Promise<DaemonInstance> {
+		const dir = ptyDaemonManifestDir(organizationId);
+		if (!fs.existsSync(dir)) {
+			fs.mkdirSync(dir, { recursive: true, mode: 0o700 });
+		}
+		const socketPath = ptyDaemonSocketPath(organizationId);
+		const logPath = path.join(dir, "pty-daemon.log");
+
+		if (!fs.existsSync(this.opts.scriptPath)) {
+			throw new Error(
+				`[pty-daemon:${organizationId}] script not found at ${this.opts.scriptPath} — has the daemon binary been bundled?`,
+			);
+		}
+
+		const logFd = openRotatingLogFd(logPath, MAX_DAEMON_LOG_BYTES);
+		const stdio: childProcess.StdioOptions =
+			logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"];
+
+		const childEnv = {
+			...(process.env as Record<string, string>),
+			ORGANIZATION_ID: organizationId,
+			// Source of truth for daemon version. The daemon's main.ts reads
+			// this and surfaces it in the hello-ack so adoption probes can
+			// detect drift against EXPECTED_DAEMON_VERSION.
+			SUPERSET_PTY_DAEMON_VERSION: EXPECTED_DAEMON_VERSION,
+		};
+
+		console.log(
+			`[pty-daemon:${organizationId}] spawning ${this.opts.scriptPath} → ${socketPath} (log: ${logPath})`,
+		);
+
+		let child: ReturnType<typeof childProcess.spawn>;
+		try {
+			child = childProcess.spawn(
+				process.execPath,
+				[this.opts.scriptPath, `--socket=${socketPath}`],
+				{
+					detached: true,
+					stdio,
+					env: childEnv,
+					windowsHide: true,
+				},
+			);
+		} finally {
+			// The child has its own copy of the fd; ours is no longer needed.
+			if (logFd >= 0) {
+				try {
+					fs.closeSync(logFd);
+				} catch {
+					// best-effort
+				}
+			}
+		}
+
+		// A ChildProcess reports spawn and kill failures through its "error"
+		// event. With no listener that event is thrown as an uncaught
+		// exception and would take host-service down; the failure itself is
+		// still surfaced by the pid / socket-ready checks below.
+		child.on("error", (err) => {
+			console.error(`[pty-daemon:${organizationId}] child process error:`, err);
+		});
+
+		const childPid = child.pid;
+		if (!childPid) {
+			throw new Error(`[pty-daemon:${organizationId}] failed to spawn`);
+		}
+
+		// Captured only for the spawn-failure diagnostics below.
+		let earlyExitCode: number | null = null;
+		let earlyExitSignal: NodeJS.Signals | null = null;
+		child.once("exit", (code, signal) => {
+			earlyExitCode = code;
+			earlyExitSignal = signal;
+		});
+
+		const ready = await waitForSocket(socketPath, SOCKET_READY_TIMEOUT_MS);
+		if (!ready) {
+			try {
+				child.kill("SIGTERM");
+			} catch {
+				// best-effort
+			}
+			let logTail = "";
+			try {
+				const buf = fs.readFileSync(logPath, "utf-8");
+				logTail = buf.slice(-2000);
+			} catch {
+				logTail = "(no log file written)";
+			}
+			logEvent("pty_daemon_spawn_failed", {
+				organizationId,
+				reason: "socket-not-ready",
+				timeoutMs: SOCKET_READY_TIMEOUT_MS,
+				earlyExitCode,
+				earlyExitSignal,
+			});
+			throw new Error(
+				`[pty-daemon:${organizationId}] socket did not become ready within ${SOCKET_READY_TIMEOUT_MS}ms (childPid=${childPid}, earlyExit=${earlyExitCode ?? earlyExitSignal ?? "still alive"}). Log tail:\n${logTail}`,
+			);
+		}
+
+		child.unref();
+		child.on("exit", (code) => {
+			console.log(`[pty-daemon:${organizationId}] exited with code ${code}`);
+			// Ignore exits of a child that is no longer the registered
+			// instance (stopped, or already replaced).
+			const current = this.instances.get(organizationId);
+			if (current?.pid !== childPid) return;
+			this.instances.delete(organizationId);
+			removePtyDaemonManifest(organizationId);
+
+			if (this.stopping.has(organizationId)) {
+				this.stopping.delete(organizationId);
+				return;
+			}
+
+			const now = Date.now();
+			const recent = (this.crashTimes.get(organizationId) ?? []).filter(
+				(t) => now - t < CRASH_WINDOW_MS,
+			);
+			recent.push(now);
+			this.crashTimes.set(organizationId, recent);
+
+			logEvent("pty_daemon_crash", {
+				organizationId,
+				exitCode: code,
+				crashesInWindow: recent.length,
+				windowSeconds: CRASH_WINDOW_MS / 1000,
+				ageSeconds: Math.round((now - current.startedAt) / 1000),
+			});
+
+			if (recent.length > CRASH_BUDGET) {
+				this.circuitOpen.add(organizationId);
+				console.error(
+					`[pty-daemon:${organizationId}] crash circuit OPEN — ${recent.length} crashes in ${CRASH_WINDOW_MS / 1000}s; refusing further respawns until clearCrashCircuit() is called`,
+				);
+				logEvent("pty_daemon_circuit_open", {
+					organizationId,
+					crashesInWindow: recent.length,
+				});
+				return;
+			}
+
+			console.warn(
+				`[pty-daemon:${organizationId}] auto-respawning after unexpected exit (${recent.length}/${CRASH_BUDGET} in window)`,
+			);
+			void this.ensure(organizationId).catch((err) => {
+				console.error(
+					`[pty-daemon:${organizationId}] auto-respawn failed:`,
+					err,
+				);
+			});
+		});
+
+		const startedAt = Date.now();
+		const manifest: PtyDaemonManifest = {
+			pid: childPid,
+			socketPath,
+			protocolVersions: [1],
+			startedAt,
+			organizationId,
+		};
+		writePtyDaemonManifest(manifest);
+
+		const instance: DaemonInstance = {
+			pid: childPid,
+			socketPath,
+			startedAt,
+			runningVersion: EXPECTED_DAEMON_VERSION,
+			expectedVersion: EXPECTED_DAEMON_VERSION,
+			updatePending: false,
+		};
+		this.instances.set(organizationId, instance);
+		console.log(
+			`[pty-daemon:${organizationId}] spawned pid=${childPid} socket=${socketPath}`,
+		);
+		return instance;
+	}
+}
+
+/**
+ * Poll every 50ms until the socket file exists and accepts a connection.
+ * Resolves false once `timeoutMs` has elapsed without success.
+ */
+async function waitForSocket(
+	socketPath: string,
+	timeoutMs: number,
+): Promise<boolean> {
+	const deadline = Date.now() + timeoutMs;
+	while (Date.now() < deadline) {
+		if (fs.existsSync(socketPath)) {
+			if (await isSocketConnectable(socketPath, 200)) return true;
+		}
+		await new Promise((r) => setTimeout(r, 50));
+	}
+	return false;
+}
+
+/**
+ * One-shot session list: connect, do handshake, send `list`, return the
+ * sessions array. Returns null on any failure.
+ *
+ * Owns its socket lifecycle on every exit path.
+ */
+export async function listDaemonSessions(
+	socketPath: string,
+	timeoutMs: number,
+): Promise<SessionInfo[] | null> {
+	return new Promise((resolve) => {
+		const sock = net.createConnection({ path: socketPath });
+		const decoder = new FrameDecoder();
+		let helloAcked = false;
+		let settled = false;
+
+		// Single exit point: the first caller wins, later calls are no-ops.
+		const cleanup = (value: SessionInfo[] | null) => {
+			if (settled) return;
+			settled = true;
+			clearTimeout(timer);
+			sock.removeAllListeners();
+			try {
+				sock.end();
+			} catch {
+				// best-effort
+			}
+			try {
+				sock.destroy();
+			} catch {
+				// best-effort
+			}
+			resolve(value);
+		};
+
+		const timer = setTimeout(() => cleanup(null), timeoutMs);
+
+		sock.once("error", () => cleanup(null));
+		sock.once("close", () => cleanup(null));
+
+		sock.once("connect", () => {
+			try {
+				sock.write(
+					encodeFrame({
+						type: "hello",
+						protocols: [CURRENT_PROTOCOL_VERSION],
+						clientVersion: "supervisor-list",
+					}),
+				);
+			} catch {
+				cleanup(null);
+			}
+		});
+
+		sock.on("data", (chunk: Buffer) => {
+			try {
+				decoder.push(chunk);
+				for (const raw of decoder.drain()) {
+					const msg = raw as ServerMessage;
+					if (!helloAcked) {
+						// The first frame must be the handshake reply.
+						if (msg.type !== "hello-ack") {
+							cleanup(null);
+							return;
+						}
+						helloAcked = true;
+						sock.write(encodeFrame({ type: "list" }));
+						continue;
+					}
+					if (msg.type === "list-reply") {
+						cleanup(msg.sessions);
+						return;
+					}
+					if (msg.type === "error") {
+						cleanup(null);
+						return;
+					}
+				}
+			} catch {
+				cleanup(null);
+			}
+		});
+	});
+}
+
+/**
+ * One-shot version probe: connect, send `hello`, read framed `hello-ack`,
+ * close, return `daemonVersion`. Returns null on any failure.
+ *
+ * Owns its socket lifecycle on every exit path.
+ */
+export async function probeDaemonVersion(
+	socketPath: string,
+	timeoutMs: number,
+): Promise<string | null> {
+	return new Promise((resolve) => {
+		const sock = net.createConnection({ path: socketPath });
+		const decoder = new FrameDecoder();
+		let settled = false;
+
+		// Single exit point: the first caller wins, later calls are no-ops.
+		const cleanup = (value: string | null) => {
+			if (settled) return;
+			settled = true;
+			clearTimeout(timer);
+			sock.removeAllListeners();
+			try {
+				sock.end();
+			} catch {
+				// best-effort
+			}
+			try {
+				sock.destroy();
+			} catch {
+				// best-effort
+			}
+			resolve(value);
+		};
+
+		const timer = setTimeout(() => cleanup(null), timeoutMs);
+
+		sock.once("error", () => cleanup(null));
+		sock.once("close", () => cleanup(null));
+
+		sock.once("connect", () => {
+			try {
+				sock.write(
+					encodeFrame({
+						type: "hello",
+						protocols: [CURRENT_PROTOCOL_VERSION],
+						clientVersion: "supervisor-probe",
+					}),
+				);
+			} catch {
+				cleanup(null);
+			}
+		});
+
+		sock.on("data", (chunk: Buffer) => {
+			try {
+				decoder.push(chunk);
+				// Only the first decoded frame is inspected; anything other
+				// than a hello-ack counts as a failed probe.
+				for (const raw of decoder.drain()) {
+					const msg = raw as ServerMessage;
+					if (msg.type === "hello-ack") {
+						cleanup(msg.daemonVersion ?? null);
+						return;
+					}
+					cleanup(null);
+					return;
+				}
+			} catch {
+				cleanup(null);
+			}
+		});
+	});
+}
+
+/**
+ * Resolve true if a connection to `socketPath` is established within
+ * `timeoutMs`, false on a connection error or timeout.
+ */
+function isSocketConnectable(
+	socketPath: string,
+	timeoutMs: number,
+): Promise<boolean> {
+	return new Promise((resolve) => {
+		const sock = net.createConnection({ path: socketPath });
+		const timer = setTimeout(() => {
+			sock.destroy();
+			resolve(false);
+		}, timeoutMs);
+		sock.once("connect", () => {
+			clearTimeout(timer);
+			sock.end();
+			resolve(true);
+		});
+		sock.once("error", () => {
+			clearTimeout(timer);
+			resolve(false);
+		});
+	});
+}
diff --git a/packages/host-service/src/daemon/expected-version.ts b/packages/host-service/src/daemon/expected-version.ts
new file mode 100644
index 00000000000..993c055ce5d
--- /dev/null
+++ b/packages/host-service/src/daemon/expected-version.ts
@@ -0,0 +1,16 @@
+// Bundled daemon version. **Hand-edited to match
+// `packages/pty-daemon/package.json#version`** — keep them in lockstep.
+//
+// This drives the "update pending — restart terminals to apply" UX:
+// when host-service adopts a daemon whose version (read via hello-ack)
+// is older than this constant, the renderer surfaces a flag.
+//
+// We pass this to spawned daemons via `SUPERSET_PTY_DAEMON_VERSION` and
+// probe it back on adoption. We do **not** auto-kill on mismatch (sessions
+// live in the daemon); the user explicitly triggers restart.
+//
+// TODO: replace with a build-step that reads
+// `node_modules/@superset/pty-daemon/package.json` and writes a generated
+// constant, so the lockstep can't drift silently. For now: hand-edit and
+// rely on PR review.
+export const EXPECTED_DAEMON_VERSION = "0.1.0"; diff --git a/packages/host-service/src/daemon/index.ts b/packages/host-service/src/daemon/index.ts new file mode 100644 index 00000000000..23afe1f03bb --- /dev/null +++ b/packages/host-service/src/daemon/index.ts @@ -0,0 +1,13 @@ +export { + DaemonSupervisor, + type DaemonSupervisorOptions, + listDaemonSessions, + probeDaemonVersion, +} from "./DaemonSupervisor.ts"; +export { EXPECTED_DAEMON_VERSION } from "./expected-version.ts"; +export { + getSupervisor, + resolveSupervisorScriptPath, + startDaemonBootstrap, + waitForDaemonReady, +} from "./singleton.ts"; diff --git a/packages/host-service/src/daemon/log-fd.ts b/packages/host-service/src/daemon/log-fd.ts new file mode 100644 index 00000000000..7fa6e5dbadf --- /dev/null +++ b/packages/host-service/src/daemon/log-fd.ts @@ -0,0 +1,33 @@ +// Append-mode log fd for the daemon's stdio with size-based rotation. +// Mirrors the desktop's host-service log handling — when the bundle moves +// host-service into a headless deploy, daemon logs are still recoverable +// without an external log shipper. + +import * as fs from "node:fs"; +import * as path from "node:path"; + +export const MAX_DAEMON_LOG_BYTES = 5 * 1024 * 1024; + +/** + * Open an append-mode log fd, truncating first if it already exceeds + * `maxBytes`. Returns -1 on failure so callers can fall back to ignoring + * child stdio. 
+ */
+export function openRotatingLogFd(logPath: string, maxBytes: number): number {
+	const ownerOnly = 0o600;
+	try {
+		fs.mkdirSync(path.dirname(logPath), { recursive: true, mode: 0o700 });
+		try {
+			// Size check and truncation are best-effort: if either throws we
+			// still go on to open the file for append below.
+			const oversized =
+				fs.existsSync(logPath) && fs.statSync(logPath).size > maxBytes;
+			if (oversized) fs.writeFileSync(logPath, "", { mode: ownerOnly });
+		} catch {
+			// best-effort
+		}
+		return fs.openSync(logPath, "a", ownerOnly);
+	} catch {
+		return -1;
+	}
+}
diff --git a/apps/desktop/src/main/lib/pty-daemon-manifest.ts b/packages/host-service/src/daemon/manifest.ts
similarity index 75%
rename from apps/desktop/src/main/lib/pty-daemon-manifest.ts
rename to packages/host-service/src/daemon/manifest.ts
index 9888bb1b09c..6f7884f6ea0 100644
--- a/apps/desktop/src/main/lib/pty-daemon-manifest.ts
+++ b/packages/host-service/src/daemon/manifest.ts
@@ -1,3 +1,7 @@
+// Manifest for a running pty-daemon instance. Lives under
+// $SUPERSET_HOME_DIR/host/{organizationId}/. Different lifecycle from
+// host-service's own manifest — the daemon outlives host-service restarts.
+
 import {
 	existsSync,
 	mkdirSync,
@@ -6,26 +10,23 @@ import {
 	unlinkSync,
 	writeFileSync,
 } from "node:fs";
+import { homedir } from "node:os";
 import { join } from "node:path";
-import { SUPERSET_HOME_DIR } from "./app-environment";
 
-/**
- * Manifest for a running pty-daemon instance. Sibling of
- * HostServiceManifest; lives in the same per-organization directory under
- * $SUPERSET_HOME_DIR/host/{organizationId}/. Different lifecycles — the
- * daemon outlives host-service restarts.
- */
 export interface PtyDaemonManifest {
 	pid: number;
 	socketPath: string;
 	protocolVersions: number[];
-	daemonVersion: string;
 	startedAt: number;
 	organizationId: string;
 }
 
+function supersetHomeDir(): string {
+	return process.env.SUPERSET_HOME_DIR || join(homedir(), ".superset");
+}
+
 export function ptyDaemonManifestDir(organizationId: string): string {
-	return join(SUPERSET_HOME_DIR, "host", organizationId);
+	return join(supersetHomeDir(), "host", organizationId);
 }
 
 function ptyDaemonManifestPath(organizationId: string): string {
@@ -57,7 +58,6 @@ export function readPtyDaemonManifest(
 			typeof data.pid !== "number" ||
 			typeof data.socketPath !== "string" ||
 			!Array.isArray(data.protocolVersions) ||
-			typeof data.daemonVersion !== "string" ||
 			typeof data.startedAt !== "number" ||
 			typeof data.organizationId !== "string"
 		) {
@@ -70,7 +70,7 @@ export function readPtyDaemonManifest(
 }
 
 export function listPtyDaemonManifests(): PtyDaemonManifest[] {
-	const hostDir = join(SUPERSET_HOME_DIR, "host");
+	const hostDir = join(supersetHomeDir(), "host");
 	if (!existsSync(hostDir)) return [];
 	const manifests: PtyDaemonManifest[] = [];
 	try {
@@ -80,7 +80,7 @@ export function listPtyDaemonManifests(): PtyDaemonManifest[] {
 			if (manifest) manifests.push(manifest);
 		}
 	} catch {
-		// Best-effort scan.
+		// best-effort
 	}
 	return manifests;
 }
@@ -90,6 +90,23 @@ export function removePtyDaemonManifest(organizationId: string): void {
 	try {
 		if (existsSync(filePath)) unlinkSync(filePath);
 	} catch {
-		// Best-effort removal.
+		// best-effort
+	}
+}
+
+/**
+ * True when `pid` names a process that exists. Probes with signal 0, which
+ * tests for existence without delivering a signal; an EPERM failure still
+ * counts as alive (the process exists but we may not signal it).
+ */
+export function isProcessAlive(pid: number): boolean {
+	// kill(2) gives pid 0 and negative pids process-group meaning, so a
+	// corrupt manifest must never reach process.kill() with one of those.
+	if (!Number.isInteger(pid) || pid <= 0) return false;
+	try {
+		process.kill(pid, 0);
+		return true;
+	} catch (err) {
+		return (err as NodeJS.ErrnoException).code === "EPERM";
 	}
 }
diff --git a/packages/host-service/src/daemon/singleton.ts b/packages/host-service/src/daemon/singleton.ts
new file mode 100644
index 00000000000..d4fd60b8e1d
--- /dev/null
+++ b/packages/host-service/src/daemon/singleton.ts
@@ -0,0 +1,105 @@
+// Singleton DaemonSupervisor for the host-service process. One supervisor
+// per host-service instance; it manages exactly one daemon (per the org
+// host-service was started with). Lazy bootstrap so tests can construct
+// host-service without spawning a real daemon — the bootstrap is kicked
+// off explicitly from `serve.ts`.
+
+import { existsSync } from "node:fs";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+import { DaemonSupervisor } from "./DaemonSupervisor.ts";
+
+let supervisor: DaemonSupervisor | null = null;
+let bootstrapPromise: Promise<unknown> | null = null;
+
+/**
+ * Resolve the daemon entry script path. In production, host-service.js and
+ * pty-daemon.js are bundled side-by-side in the same dist directory. In
+ * dev (running from source under bun), we fall back to the workspace
+ * package's `dist/pty-daemon.js`. Either is fine — both are real Node
+ * scripts.
+ */
+export function resolveSupervisorScriptPath(): string {
+	const override = process.env.SUPERSET_PTY_DAEMON_SCRIPT_PATH;
+	if (override) return override;
+
+	const here = path.dirname(fileURLToPath(import.meta.url));
+	// Production: host-service.js and pty-daemon.js live side-by-side in
+	// the same bundled dist/. `here` is `/daemon/` after bundling
+	// (or close to it); two levels up + `pty-daemon.js` resolves there.
+	const sideBySide = path.resolve(here, "..", "..", "pty-daemon.js");
+	if (existsSync(sideBySide)) return sideBySide;
+
+	// Dev mode running from source: `here` is
+	// `packages/host-service/src/daemon/`; the daemon's bundled entry
+	// sits at `packages/pty-daemon/dist/pty-daemon.js` after `bun run
+	// build:daemon` in that package.
+	const workspaceDist = path.resolve(
+		here,
+		"..",
+		"..",
+		"..",
+		"pty-daemon",
+		"dist",
+		"pty-daemon.js",
+	);
+	return workspaceDist;
+}
+
+/**
+ * Process-wide supervisor, created on first call. `scriptPath` is only
+ * honoured by the call that creates the instance.
+ */
+export function getSupervisor(scriptPath?: string): DaemonSupervisor {
+	if (!supervisor) {
+		supervisor = new DaemonSupervisor({
+			scriptPath: scriptPath ?? resolveSupervisorScriptPath(),
+		});
+	}
+	return supervisor;
+}
+
+/**
+ * Kick off `ensure(orgId)` without awaiting (per the host-service
+ * migration plan, decision D3 — fire-and-track). Stash the promise so
+ * callers that need the daemon up can await it via `waitForDaemonReady`.
+ */
+export function startDaemonBootstrap(organizationId: string): void {
+	if (bootstrapPromise) return;
+	const sup = getSupervisor();
+	const tracked = sup.ensure(organizationId).catch((err) => {
+		console.error(
+			`[host-service] pty-daemon bootstrap failed for org=${organizationId}:`,
+			err,
+		);
+		// Reset so a future request can retry.
+		bootstrapPromise = null;
+		throw err;
+	});
+	// Fire-and-track callers (serve.ts) never await this promise. Park a
+	// no-op handler on it so a failed bootstrap is not reported as an
+	// unhandled rejection; waitForDaemonReady() awaits `bootstrapPromise`
+	// itself and still observes the failure.
+	tracked.catch(() => {});
+	bootstrapPromise = tracked;
+}
+
+/**
+ * Awaits the in-flight bootstrap. If bootstrap hasn't started, kicks one
+ * off first. Terminal request handlers call this before using the
+ * supervisor's socket path.
+ */
+export async function waitForDaemonReady(
+	organizationId: string,
+): Promise<void> {
+	if (!bootstrapPromise) startDaemonBootstrap(organizationId);
+	if (bootstrapPromise) {
+		await bootstrapPromise;
+	}
+}
+
+/** Test-only — reset the singleton between tests.
*/ +export function __resetSupervisorForTesting(): void { + supervisor = null; + bootstrapPromise = null; +} diff --git a/packages/host-service/src/serve.ts b/packages/host-service/src/serve.ts index 1090f397bc9..b22d05a26e7 100644 --- a/packages/host-service/src/serve.ts +++ b/packages/host-service/src/serve.ts @@ -1,5 +1,6 @@ import { serve } from "@hono/node-server"; import { createApp } from "./app"; +import { startDaemonBootstrap } from "./daemon"; import { env } from "./env"; import { JwtApiAuthProvider } from "./providers/auth"; import { LocalGitCredentialProvider } from "./providers/git"; @@ -13,6 +14,14 @@ async function main(): Promise { const terminalBaseEnv = await resolveTerminalBaseEnv(); initTerminalBaseEnv(terminalBaseEnv); + // Fire-and-track: kick off pty-daemon spawn-or-adopt without blocking + // host-service startup. Terminal request handlers `await + // waitForDaemonReady(orgId)` before using the supervisor's socket path, + // so an in-flight bootstrap doesn't race with the first terminal launch. + // Non-terminal requests (workspaces, git, chat) are unaffected if the + // daemon takes time to come up or fails entirely. + startDaemonBootstrap(env.ORGANIZATION_ID); + const authProvider = new JwtApiAuthProvider( env.AUTH_TOKEN, env.CLOUD_API_URL, diff --git a/packages/host-service/src/terminal/daemon-client-singleton.ts b/packages/host-service/src/terminal/daemon-client-singleton.ts index f60dc6f4d3b..d169d993619 100644 --- a/packages/host-service/src/terminal/daemon-client-singleton.ts +++ b/packages/host-service/src/terminal/daemon-client-singleton.ts @@ -1,6 +1,7 @@ -// Lazy singleton DaemonClient for host-service. The desktop coordinator -// passes the daemon socket path via SUPERSET_PTY_DAEMON_SOCKET. We connect -// once on first use and reuse the connection for all sessions. +// Lazy singleton DaemonClient for host-service. 
The DaemonSupervisor +// (host-service-internal) owns the daemon's process lifecycle; this +// singleton just connects to the supervisor's socket path on first use +// and reuses the connection for all sessions. // // On disconnect we surface via console.error, notify subscribers (terminal.ts // uses this to close WS sockets so the renderer reconnects against the @@ -8,6 +9,8 @@ // the client. There's no in-band reconnect here — see DaemonClient's "dumb" // failure model. +import { getSupervisor, waitForDaemonReady } from "../daemon/index.ts"; +import { env } from "../env.ts"; import { DaemonClient } from "./DaemonClient/index.ts"; let cached: DaemonClient | null = null; @@ -27,20 +30,30 @@ export function onDaemonDisconnect(cb: (err?: Error) => void): () => void { }; } -export function ptyDaemonSocketPath(): string { - const path = process.env.SUPERSET_PTY_DAEMON_SOCKET; - if (!path) { +async function ptyDaemonSocketPath(): Promise { + // Test escape hatch: when SUPERSET_PTY_DAEMON_SOCKET is set explicitly + // (e.g. by the adoption integration test), skip the supervisor and + // connect directly. Production paths leave this env var unset; the + // supervisor's own spawn does not set it. + const testOverride = process.env.SUPERSET_PTY_DAEMON_SOCKET; + if (testOverride) return testOverride; + + await waitForDaemonReady(env.ORGANIZATION_ID); + const sockPath = getSupervisor().getSocketPath(env.ORGANIZATION_ID); + if (!sockPath) { throw new Error( - "pty-daemon is not available: SUPERSET_PTY_DAEMON_SOCKET is not set. The desktop coordinator should set this before spawning host-service. Terminals will not work until the daemon comes up.", + "pty-daemon is not available: supervisor returned no socket path. 
" + + "The bootstrap must have failed — check host-service logs for spawn errors.", ); } - return path; + return sockPath; } export async function getDaemonClient(): Promise { if (cached?.isConnected) return cached; if (connecting) return connecting; - const client = new DaemonClient({ socketPath: ptyDaemonSocketPath() }); + const sockPath = await ptyDaemonSocketPath(); + const client = new DaemonClient({ socketPath: sockPath }); client.onDisconnect((err) => { console.error( "[host-service] pty-daemon disconnected:", diff --git a/packages/host-service/src/trpc/router/terminal/terminal.ts b/packages/host-service/src/trpc/router/terminal/terminal.ts index e289c50fa3e..411263775e0 100644 --- a/packages/host-service/src/trpc/router/terminal/terminal.ts +++ b/packages/host-service/src/trpc/router/terminal/terminal.ts @@ -1,7 +1,9 @@ import { TRPCError } from "@trpc/server"; import { eq } from "drizzle-orm"; import { z } from "zod"; +import { getSupervisor, waitForDaemonReady } from "../../../daemon"; import { terminalSessions, workspaces } from "../../../db/schema"; +import { env } from "../../../env"; import { createTerminalSessionInternal, disposeSession, @@ -10,6 +12,27 @@ import { } from "../../../terminal/terminal"; import { protectedProcedure, router } from "../../index"; +// Daemon control surface — sibling to the per-workspace terminal ops above. +// Org-scoped (one daemon per host-service); reads org id from env. +// Supervisor lives in this same process so calls go through the in-process +// singleton, not over the wire. +const daemonRouter = router({ + getUpdateStatus: protectedProcedure.query(() => + getSupervisor().getUpdateStatus(env.ORGANIZATION_ID), + ), + + listSessions: protectedProcedure.query(async () => { + // Wait for the bootstrap so the supervisor has a socket path. 
+ await waitForDaemonReady(env.ORGANIZATION_ID); + return getSupervisor().listSessions(env.ORGANIZATION_ID); + }), + + restart: protectedProcedure.mutation(async () => { + await waitForDaemonReady(env.ORGANIZATION_ID); + return getSupervisor().restart(env.ORGANIZATION_ID); + }), +}); + export const terminalRouter = router({ launchSession: protectedProcedure .input( @@ -94,4 +117,6 @@ export const terminalRouter = router({ disposeSession(input.terminalId, ctx.db); return { terminalId: input.terminalId, status: "disposed" as const }; }), + + daemon: daemonRouter, }); diff --git a/packages/pty-daemon/src/main.ts b/packages/pty-daemon/src/main.ts index d40bd03fe1e..a7264389d60 100755 --- a/packages/pty-daemon/src/main.ts +++ b/packages/pty-daemon/src/main.ts @@ -38,7 +38,11 @@ function parseArgs(argv: string[]): CliArgs { async function main(): Promise { const args = parseArgs(process.argv.slice(2)); - const daemonVersion = readPackageVersion(); + // Env takes precedence so the supervisor (or a test harness) can pin + // the version to a known value. Falls back to the package.json read + // when env is unset — that's the deployed-artifact source of truth. + const daemonVersion = + process.env.SUPERSET_PTY_DAEMON_VERSION ?? readPackageVersion(); const server = new Server({ socketPath: args.socket, daemonVersion, From 07ec14fb6fa1279e9d2a6c411b62462cfdbb3ba4 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 12:29:29 -0700 Subject: [PATCH 20/33] docs(host-service): daemon supervision reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architecture/reference doc colocated with the supervisor code. Replaces the migration plan that was the input to this work — describes the end state for future contributors. 
--- packages/host-service/DAEMON_SUPERVISION.md | 110 ++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 packages/host-service/DAEMON_SUPERVISION.md diff --git a/packages/host-service/DAEMON_SUPERVISION.md b/packages/host-service/DAEMON_SUPERVISION.md new file mode 100644 index 00000000000..83deb497261 --- /dev/null +++ b/packages/host-service/DAEMON_SUPERVISION.md @@ -0,0 +1,110 @@ +# Daemon Supervision + +Host-service owns the lifecycle of `@superset/pty-daemon` — the long-lived +PTY process. Supervision lives here (not in the desktop app) so +host-service can be deployed independently of Electron. The daemon +outlives host-service crashes via detached spawn + manifest adoption. + +## Where it lives + +- **Supervisor**: `src/daemon/DaemonSupervisor.ts` — spawn / adopt / + restart / crash-circuit. One supervisor per host-service process, + managing one daemon (the daemon for the org that host-service was started for). +- **Singleton + bootstrap**: `src/daemon/singleton.ts` — process-level + cache + `startDaemonBootstrap` / `waitForDaemonReady` for the boot + pattern below. +- **Manifest**: `src/daemon/manifest.ts` — `$SUPERSET_HOME_DIR/host/{orgId}/pty-daemon-manifest.json`. + Read by `tryAdopt` on startup to find a still-running daemon from a + previous host-service incarnation. +- **Expected version**: `src/daemon/expected-version.ts` — hand-edited + `EXPECTED_DAEMON_VERSION`, kept in lockstep with + `packages/pty-daemon/package.json#version`. Drives the + "update available, restart terminals" UX. +- **Renderer surface**: `terminal.daemon.{getUpdateStatus, listSessions, restart}` + on the host-service tRPC router. + +## Boot pattern (fire-and-track) + +`serve.ts` calls `startDaemonBootstrap(env.ORGANIZATION_ID)` during +startup but does **not** await it. tRPC accepts connections immediately; +non-terminal ops (workspaces, git, chat) work without waiting for the +daemon. 
Terminal request handlers `await waitForDaemonReady(orgId)` +before using the supervisor's socket path, so an in-flight bootstrap +doesn't race with the first terminal launch. + +## Detached spawn + adoption + +The daemon is spawned `detached: true` so it survives host-service +exit. On the next host-service start, `tryAdopt` reads the manifest, checks +that the PID is alive and the socket is reachable, and reuses the running +daemon. PTY sessions therefore survive host-service restarts. + +The socket path lives in `os.tmpdir()/superset-ptyd-{hash}.sock` +— short enough to fit Darwin's 104-byte `sun_path` limit. Owner-only +file mode (0600) is the auth boundary. + +## Version detection + +On adoption, `probeDaemonVersion` does a one-shot `hello`/`hello-ack` to +read the running daemon's `daemonVersion`, then compares it against +`EXPECTED_DAEMON_VERSION` via `semver.satisfies(>=)`. A running version that does not satisfy the expected range sets +`updatePending: true` on the instance — the renderer surfaces a +"restart to update" affordance. We do **not** auto-kill on mismatch +because PTY sessions live in the daemon; the user opts in via Restart. + +Probe failure ≠ stale: a transient socket issue produces +`runningVersion: "unknown", updatePending: false` rather than a +false-positive update flag. + +## Crash circuit breaker + +The supervisor auto-respawns the daemon after unexpected exits, but only up to `CRASH_BUDGET = 3` within +`CRASH_WINDOW_MS = 60_000`. Past that, the circuit opens and `ensure` +fails fast with a clear error until something calls +`clearCrashCircuit(orgId)` — which the user-triggered `restart()` +implicitly does, so the user can always recover. + +## User-triggered restart + +`restart(orgId)` awaits any in-flight pending spawn, calls `stop`, +clears the crash circuit, logs `pty_daemon_user_restart`, then `ensure`s +fresh. Sessions die in the gap — that's the cost the user accepted via +the confirmation dialog. + +## Telemetry + +The supervisor emits structured `console.log` lines with +`{ component: "pty-daemon-supervisor", event, ...props }`. 
Events: +`pty_daemon_spawn`, `pty_daemon_adopt`, `pty_daemon_user_restart`, +`pty_daemon_update_pending`, `pty_daemon_crash`, +`pty_daemon_circuit_open`, `pty_daemon_spawn_failed`. No PostHog +plumbing on host-service yet — promote to real telemetry when the path +is needed. + +## Tests + +- `src/daemon/DaemonSupervisor.test.ts` — probe edge cases, debounce + semantics, restart race-await + circuit clear. +- Daemon wire protocol coverage lives in `packages/pty-daemon/test/` + (handshake, adoption, SIGKILL recovery). + +## Test escape hatch + +Setting `SUPERSET_PTY_DAEMON_SOCKET` env var bypasses the supervisor in +`daemon-client-singleton.ts` and connects directly to the given socket. +Used by `terminal.adoption.node-test.ts` to test host-service against an +in-process Server instance. Production paths leave this env unset. + +## Extension points + +Adding a daemon op the renderer needs: + +1. Add a method on `DaemonSupervisor` (or use `getDaemonClient()` from + `terminal/daemon-client-singleton.ts` if it's a wire-protocol op). +2. Expose via `terminal.daemon` in `src/trpc/router/terminal/terminal.ts`. +3. Call from the renderer via `workspaceTrpc.terminal.daemon.*`. + +Bumping the daemon version: edit `EXPECTED_DAEMON_VERSION` in +`expected-version.ts` to match the new `packages/pty-daemon/package.json#version`. +The supervisor's adoption probe will surface the "update available" flag +on existing installs until they restart. From 5cb3ba7f22df292cd655fe024d9be5f6e06f92c6 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 12:39:26 -0700 Subject: [PATCH 21/33] fix(host-service): DaemonClient lifecycle hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR review on the daemon transport. Four related issues, all in the same family of "subtle bugs that bite under load": - **Request-level timeouts** for open/close/list (15s/5s/5s). Without these, a live-but-stuck daemon (e.g. 
blocked node-pty.spawn) hangs callers indefinitely — only a full disconnect would unblock them. - **list() filtered to non-session error frames.** Previously any error could settle a pending list, so a concurrent error from a session could resolve a list() call with the wrong reply. - **Handshake failure tears down the socket.** A rejected handshake left the open socket and its listeners alive — leaked resources across retries. connect() now destroys + nulls on throw. - **Decode failure hard-closes the transport.** A protocol decode error called onClose() but didn't destroy() the socket, so the connection could keep delivering frames after local teardown. 200 host-service tests pass (the 1 unrelated `pull-requests` failure is preexisting on the branch baseline). --- .../src/terminal/DaemonClient/DaemonClient.ts | 73 +++++++++++++++++-- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts index 1e03eb8be76..003e5d84109 100644 --- a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts @@ -52,6 +52,16 @@ export interface DaemonClientOptions { connectTimeoutMs?: number; } +/** + * Per-request timeouts. The daemon should respond within milliseconds for + * close/list, and within a few seconds for open (PTY spawn includes shell + * startup). Without these, a live-but-stuck daemon can hang callers + * indefinitely — a real risk if `node-pty.spawn` ever blocks. 
+ */ +const OPEN_TIMEOUT_MS = 15_000; +const CLOSE_TIMEOUT_MS = 5_000; +const LIST_TIMEOUT_MS = 5_000; + export class DaemonClient { private readonly opts: DaemonClientOptions; private socket: net.Socket | null = null; @@ -72,7 +82,17 @@ export class DaemonClient { socket.on("data", (chunk) => this.onData(chunk)); socket.on("close", () => this.onClose()); socket.on("error", (err) => this.onClose(err)); - await this.handshake(); + try { + await this.handshake(); + } catch (err) { + // Handshake rejected — destroy the socket and clear state so the + // caller's retry sees a clean slate. Without this, the socket and + // its listeners leak across failed connect attempts. + this.socket = null; + socket.removeAllListeners(); + socket.destroy(); + throw err; + } this.connected = true; } @@ -96,14 +116,22 @@ export class DaemonClient { } async open(id: string, meta: SessionMeta): Promise { - const reply = await this.requestSession(id, { type: "open", id, meta }); + const reply = await this.requestSession( + id, + { type: "open", id, meta }, + OPEN_TIMEOUT_MS, + ); if (reply.type === "open-ok") return { id, pid: reply.pid }; if (reply.type === "error") throw new Error(`open ${id}: ${reply.message}`); throw new Error(`open ${id}: unexpected reply ${reply.type}`); } async close(id: string, signal: Signal = "SIGTERM"): Promise { - const reply = await this.requestSession(id, { type: "close", id, signal }); + const reply = await this.requestSession( + id, + { type: "close", id, signal }, + CLOSE_TIMEOUT_MS, + ); if (reply.type === "closed") return; if (reply.type === "error") throw new Error(`close ${id}: ${reply.message}`); @@ -111,7 +139,11 @@ export class DaemonClient { } async list(): Promise { - const reply = await this.requestNonSession({ type: "list" }, "list-reply"); + const reply = await this.requestNonSession( + { type: "list" }, + "list-reply", + LIST_TIMEOUT_MS, + ); if (reply.type === "list-reply") return reply.sessions; throw new Error(`list: unexpected reply 
${reply.type}`); } @@ -206,6 +238,7 @@ export class DaemonClient { req: | { type: "open"; id: string; meta: SessionMeta } | { type: "close"; id: string; signal: Signal }, + timeoutMs: number, ): Promise { return new Promise((resolve, reject) => { let resolved = false; @@ -231,9 +264,19 @@ export class DaemonClient { const offDisc = this.onDisconnect((err) => fail(err ?? new Error("daemon disconnected")), ); + const timer = setTimeout( + () => + fail( + new Error( + `daemon ${req.type} ${id}: timed out after ${timeoutMs}ms`, + ), + ), + timeoutMs, + ); const cleanup = () => { off(); offDisc(); + clearTimeout(timer); }; this.send(req); }); @@ -242,6 +285,7 @@ export class DaemonClient { private requestNonSession( req: { type: "list" }, expectType: "list-reply", + timeoutMs: number, ): Promise { return new Promise((resolve, reject) => { let resolved = false; @@ -258,14 +302,28 @@ export class DaemonClient { reject(err); }; const off = this.on((m) => { - if (m.type === expectType || m.type === "error") settle(m); + if (m.type === expectType) { + settle(m); + return; + } + // Non-session error frames (no `id`) belong to the + // most-recent non-session request — settle on those. Errors + // keyed to a session id come from concurrent ops on that + // session; ignore them here. + if (m.type === "error" && m.id === undefined) settle(m); }); const offDisc = this.onDisconnect((err) => fail(err ?? new Error("daemon disconnected")), ); + const timer = setTimeout( + () => + fail(new Error(`daemon ${req.type}: timed out after ${timeoutMs}ms`)), + timeoutMs, + ); const cleanup = () => { off(); offDisc(); + clearTimeout(timer); }; this.send(req); }); @@ -314,6 +372,11 @@ export class DaemonClient { try { frames = this.decoder.drain(); } catch (err) { + // Protocol decode failure — the wire stream is corrupt. Hard-close + // the transport so we don't keep accepting data on a broken + // connection. 
Without destroy() the socket can keep delivering + // frames after onClose() has fired. + this.socket?.destroy(); this.onClose(err as Error); return; } From 2b8ca2c65de3584b40ff49ee1835fd90912d73db Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 13:24:13 -0700 Subject: [PATCH 22/33] fix(host-service): subscribe replay assertion + daemon CLI polish - DaemonClient.subscribe now throws if a second subscriber requests replay:true. The daemon's ring buffer is delivered once, on the first subscribe; later subscribers can't get historical data this way and used to silently miss it. Loud-fail the surprising case so callers pick a server-side replay path instead. Updated the existing fan-out test to use replay:false on the second subscriber (the right value for that use case anyway). - pty-daemon main.ts: validate --buffer-bytes is a positive integer; wrap the shutdown handler in try/finally with a re-entry guard so a second SIGINT/SIGTERM during graceful close doesn't double-call server.close() and the process always exits deterministically. --- .../DaemonClient/DaemonClient.node-test.ts | 6 +++- .../src/terminal/DaemonClient/DaemonClient.ts | 10 +++++++ packages/pty-daemon/src/main.ts | 29 +++++++++++++++---- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts index d37e0403fce..7fe6d8767be 100644 --- a/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.node-test.ts @@ -135,9 +135,13 @@ test("multiple local subscribers get fanned out from one wire subscription", asy onExit: () => {}, }, ); + // Second subscriber must use replay:false — the daemon's buffer was + // already delivered to the first subscribe; requesting replay again + // is now an explicit error (see DaemonClient.subscribe). 
The + // fan-out applies to live output only. const unsubB = c.subscribe( id, - { replay: true }, + { replay: false }, { onOutput: (buf) => b.push(buf), onExit: () => {}, diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts index 003e5d84109..7afa7823245 100644 --- a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts @@ -183,6 +183,16 @@ export class DaemonClient { entry.exit.add(cb.onExit); // Only the first subscribe per session id sends the wire `subscribe`. // Subsequent local callbacks just register into the existing entry. + // The daemon's ring buffer is delivered once, on the first subscribe + // — so `replay: true` only makes sense on a fresh subscription. + // Loud-fail the surprising case where a later subscriber asks for + // replay; the caller needs to replay from a server-side cache + // instead (see terminal.ts replayBuffer). 
+ if (!wasFirst && opts.replay) { + throw new Error( + `subscribe(${id}): replay is not available on a second subscribe; the daemon's buffer was already consumed.`, + ); + } if (wasFirst) { this.send({ type: "subscribe", id, replay: opts.replay }); } diff --git a/packages/pty-daemon/src/main.ts b/packages/pty-daemon/src/main.ts index a7264389d60..61dc22a86b6 100755 --- a/packages/pty-daemon/src/main.ts +++ b/packages/pty-daemon/src/main.ts @@ -24,10 +24,14 @@ function parseArgs(argv: string[]): CliArgs { if (arg.startsWith("--socket=")) args.socket = arg.slice("--socket=".length); else if (arg.startsWith("--buffer-bytes=")) { - args.bufferBytes = Number.parseInt( - arg.slice("--buffer-bytes=".length), - 10, - ); + const raw = arg.slice("--buffer-bytes=".length); + const parsed = Number.parseInt(raw, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error( + `--buffer-bytes must be a positive integer, got: ${raw}`, + ); + } + args.bufferBytes = parsed; } } if (!args.socket) { @@ -53,10 +57,23 @@ async function main(): Promise { `[pty-daemon] listening on ${args.socket} (v${daemonVersion}, host=${os.hostname()})\n`, ); + let shuttingDown = false; const shutdown = async (signal: NodeJS.Signals) => { + // Re-entry guard: a second SIGINT/SIGTERM during graceful close + // should not double-call server.close() or change the exit code. + if (shuttingDown) return; + shuttingDown = true; process.stderr.write(`[pty-daemon] received ${signal}, shutting down\n`); - await server.close(); - process.exit(0); + try { + await server.close(); + } catch (err) { + process.stderr.write( + `[pty-daemon] shutdown error: ${(err as Error).stack ?? err}\n`, + ); + } finally { + // Always exit deterministically, even if server.close() threw. 
+ process.exit(0); + } }; process.on("SIGINT", () => void shutdown("SIGINT")); process.on("SIGTERM", () => void shutdown("SIGTERM")); From 2b24a383a30aba2457071c5693d1d098da3a3157 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 13:24:28 -0700 Subject: [PATCH 23/33] test(host-service): supervisor + tRPC + Electron-coupling coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 16 new tests across four files to close the test gaps for the pty-daemon migration. Most are bun unit tests; the supervisor integration test runs under node:test because the supervisor uses process.execPath to spawn the daemon (must be node, not bun). - DaemonSupervisor.node-test.ts (5 real-spawn scenarios): fresh spawn, cross-instance adoption, version drift detection on adoption, user-restart kills + respawns, auto-respawn after SIGKILL. - singleton.test.ts (6 cases): getSupervisor identity, fire-and-track bootstrap doesn't await, idempotent startDaemonBootstrap, waitForDaemonReady kicks off lazy bootstrap, failed bootstrap is retryable. - terminal.daemon.test.ts (4 cases): tRPC procedure wiring against a stub supervisor — UNAUTHORIZED gating, getUpdateStatus delegation, listSessions awaits bootstrap before delegating, restart wiring. - no-electron-coupling.test.ts (1 case): asserts host-service source has zero Electron imports/globals/APIs. Substitutes for a true headless smoke test until native-addon distribution is solved (better-sqlite3, node-pty, @parcel/watcher are bundle-external and currently expect Electron's resolution path). Also exports __resetSupervisorForTesting from src/daemon/index.ts so tests can reset the singleton between runs, and registers the new node-test in the test:integration script. Total host-service test suite is now 211 pass / 1 fail (the failing one is a preexisting pull-requests test unrelated to the migration). 
--- packages/host-service/package.json | 2 +- .../src/daemon/DaemonSupervisor.node-test.ts | 266 ++++++++++++++++++ packages/host-service/src/daemon/index.ts | 1 + .../host-service/src/daemon/singleton.test.ts | 114 ++++++++ .../src/no-electron-coupling.test.ts | 80 ++++++ .../router/terminal/terminal.daemon.test.ts | 122 ++++++++ 6 files changed, 584 insertions(+), 1 deletion(-) create mode 100644 packages/host-service/src/daemon/DaemonSupervisor.node-test.ts create mode 100644 packages/host-service/src/daemon/singleton.test.ts create mode 100644 packages/host-service/src/no-electron-coupling.test.ts create mode 100644 packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts diff --git a/packages/host-service/package.json b/packages/host-service/package.json index 104bd93d05d..91435defe67 100644 --- a/packages/host-service/package.json +++ b/packages/host-service/package.json @@ -39,7 +39,7 @@ "build:host": "bun run build.ts", "generate": "drizzle-kit generate", "typecheck": "tsc --noEmit --emitDeclarationOnly false", - "test:integration": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts", + "test:integration": "node --experimental-strip-types --test src/terminal/DaemonClient/DaemonClient.node-test.ts src/daemon/DaemonSupervisor.node-test.ts", "test:e2e": "bun run scripts/test-e2e.ts" }, "dependencies": { diff --git a/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts new file mode 100644 index 00000000000..cbc10a9cf18 --- /dev/null +++ b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts @@ -0,0 +1,266 @@ +// Real-spawn integration tests for DaemonSupervisor. +// Runs under Node (`node --experimental-strip-types --test`) because the +// supervisor uses `process.execPath` to spawn the daemon, and the daemon +// imports node-pty (a native addon that needs Node ABI). 
+// +// Unit-level coverage for the same surface lives in DaemonSupervisor.test.ts +// (under bun test). These integration tests catch process-lifecycle bugs +// that mocks don't (PID liveness, manifest IO across supervisor instances, +// real socket connectivity). + +import { strict as assert } from "node:assert"; +import * as childProcess from "node:child_process"; +import * as crypto from "node:crypto"; +import * as fs from "node:fs"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; +import { afterEach, beforeEach, describe, test } from "node:test"; +import { fileURLToPath } from "node:url"; +import { DaemonSupervisor } from "./DaemonSupervisor.ts"; +import { + type PtyDaemonManifest, + ptyDaemonManifestDir, + writePtyDaemonManifest, +} from "./manifest.ts"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +// packages/host-service/src/daemon → packages/pty-daemon/dist/pty-daemon.js +const DAEMON_BUNDLE = path.resolve( + __dirname, + "../../../pty-daemon/dist/pty-daemon.js", +); + +if (!fs.existsSync(DAEMON_BUNDLE)) { + throw new Error( + `Daemon bundle missing at ${DAEMON_BUNDLE}. Run \`bun run build:daemon\` in packages/pty-daemon first.`, + ); +} + +let tmpHome: string; +let originalHome: string | undefined; +const supervisorsToCleanup: { sup: DaemonSupervisor; orgId: string }[] = []; + +beforeEach(() => { + tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), "pty-daemon-it-")); + originalHome = process.env.SUPERSET_HOME_DIR; + process.env.SUPERSET_HOME_DIR = tmpHome; +}); + +afterEach(async () => { + // Detached daemons survive the test process by design — kill any we + // spawned so they don't leak across test runs. 
+ for (const { sup, orgId } of supervisorsToCleanup.splice(0)) { + try { + await sup.stop(orgId); + } catch { + // best-effort + } + } + if (originalHome !== undefined) { + process.env.SUPERSET_HOME_DIR = originalHome; + } else { + delete process.env.SUPERSET_HOME_DIR; + } + try { + fs.rmSync(tmpHome, { recursive: true, force: true }); + } catch { + // best-effort + } +}); + +describe("DaemonSupervisor.ensure (real spawn)", () => { + test("spawns a fresh daemon and reports running == expected", async () => { + const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup, orgId: "org-spawn" }); + const inst = await sup.ensure("org-spawn"); + assert.ok(inst.pid > 0, "expected a positive pid"); + assert.equal(inst.runningVersion, inst.expectedVersion); + assert.equal(inst.updatePending, false); + assert.equal(await isReachable(inst.socketPath), true); + }); + + test("adopts a running daemon across supervisor instances", async () => { + const supA = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + const a = await supA.ensure("org-adopt"); + assert.ok(a.pid > 0); + + // Track the daemon for cleanup; we'll stop via supervisor B since + // that's the live owner by the end of the test. + try { + // Supervisor B simulates a host-service restart — fresh state, + // but the manifest + running daemon are still on disk/live. + const supB = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup: supB, orgId: "org-adopt" }); + const b = await supB.ensure("org-adopt"); + assert.equal(b.pid, a.pid, "B should adopt A's daemon"); + assert.equal(b.socketPath, a.socketPath); + assert.equal(b.runningVersion, a.expectedVersion); + assert.equal(b.updatePending, false); + } catch (err) { + // On failure, make sure A still cleans up. 
+ await supA.stop("org-adopt").catch(() => {}); + throw err; + } + }); + + test("flags updatePending when running daemon is older than expected", async () => { + // We spawn the daemon DIRECTLY (not via supervisor.ensure), pinning + // its version to "0.0.1" via env. Then we write the manifest and + // hand the supervisor a fresh instance that adopts via tryAdopt. + // Going through supervisor.ensure for the spawn would inject + // EXPECTED_DAEMON_VERSION (0.1.0) into childEnv, defeating the + // older-version setup. + const orgId = "org-stale"; + const socketPath = path.join( + os.tmpdir(), + `superset-ptyd-${crypto + .createHash("sha256") + .update(orgId) + .digest("hex") + .slice(0, 12)}.sock`, + ); + // Clean up any leftover socket from prior runs. + try { + fs.unlinkSync(socketPath); + } catch {} + + const child = childProcess.spawn( + process.execPath, + [DAEMON_BUNDLE, `--socket=${socketPath}`], + { + detached: true, + stdio: "ignore", + env: { ...process.env, SUPERSET_PTY_DAEMON_VERSION: "0.0.1" }, + }, + ); + child.unref(); + // Wait for the socket to come up. + const ready = await waitForSocket(socketPath, 5000); + assert.equal(ready, true, "daemon socket did not become ready"); + + try { + // Write the manifest the supervisor needs to find the daemon. + fs.mkdirSync(ptyDaemonManifestDir(orgId), { + recursive: true, + mode: 0o700, + }); + const manifest: PtyDaemonManifest = { + pid: child.pid as number, + socketPath, + protocolVersions: [1], + startedAt: Date.now(), + organizationId: orgId, + }; + writePtyDaemonManifest(manifest); + + // Fresh supervisor adopts and probes. + const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup, orgId }); + const adopted = await sup.ensure(orgId); + assert.equal(adopted.runningVersion, "0.0.1"); + assert.equal(adopted.expectedVersion, "0.1.0"); + assert.equal(adopted.updatePending, true); + } catch (err) { + // On failure, kill the orphaned daemon ourselves. 
+ try { + if (child.pid) process.kill(child.pid, "SIGTERM"); + } catch {} + throw err; + } + }); + + test("restart() kills the old daemon and spawns a new one", async () => { + const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup, orgId: "org-restart" }); + const a = await sup.ensure("org-restart"); + const aPid = a.pid; + + await sup.restart("org-restart"); + const after = ( + sup as unknown as { instances: Map } + ).instances.get("org-restart"); + assert.ok(after, "expected an instance after restart"); + assert.notEqual(after.pid, aPid, "expected a new pid after restart"); + // Old PID is dead within a beat. + await new Promise((r) => setTimeout(r, 200)); + assert.equal(isAlive(aPid), false); + }); + + test("auto-respawns after the running daemon dies unexpectedly", async () => { + // SIGKILL the running daemon, wait for the supervisor's on-exit + // handler to fire, and verify a new daemon comes up. Crash-budget + // behavior past this point is covered by the unit tests in + // DaemonSupervisor.test.ts (mocked stop/ensure for determinism — + // killing 4 daemons in a row from this test would race with the + // auto-respawn loop). + const sup = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup, orgId: "org-respawn" }); + const a = await sup.ensure("org-respawn"); + const aPid = a.pid; + + process.kill(aPid, "SIGKILL"); + + // Wait for the on-exit handler to register the death and respawn. + // The supervisor's auto-respawn fires inside `child.on("exit")`. 
+ const deadline = Date.now() + 8000; + let _next = sup.getSocketPath("org-respawn"); + while (Date.now() < deadline) { + const inst = ( + sup as unknown as { instances: Map } + ).instances.get("org-respawn"); + if (inst && inst.pid !== aPid) { + _next = inst as unknown as string; + break; + } + await new Promise((r) => setTimeout(r, 100)); + } + const after = ( + sup as unknown as { instances: Map } + ).instances.get("org-respawn"); + assert.ok(after, "expected a respawned instance"); + assert.notEqual(after.pid, aPid); + }); +}); + +function isAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (err) { + return (err as NodeJS.ErrnoException).code === "EPERM"; + } +} + +function isReachable(socketPath: string): Promise { + return new Promise((resolve) => { + const sock = net.createConnection({ path: socketPath }); + const timer = setTimeout(() => { + sock.destroy(); + resolve(false); + }, 500); + sock.once("connect", () => { + clearTimeout(timer); + sock.end(); + resolve(true); + }); + sock.once("error", () => { + clearTimeout(timer); + resolve(false); + }); + }); +} + +async function waitForSocket( + socketPath: string, + timeoutMs: number, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (fs.existsSync(socketPath)) { + if (await isReachable(socketPath)) return true; + } + await new Promise((r) => setTimeout(r, 50)); + } + return false; +} diff --git a/packages/host-service/src/daemon/index.ts b/packages/host-service/src/daemon/index.ts index 23afe1f03bb..81b6504ee06 100644 --- a/packages/host-service/src/daemon/index.ts +++ b/packages/host-service/src/daemon/index.ts @@ -6,6 +6,7 @@ export { } from "./DaemonSupervisor.ts"; export { EXPECTED_DAEMON_VERSION } from "./expected-version.ts"; export { + __resetSupervisorForTesting, getSupervisor, resolveSupervisorScriptPath, startDaemonBootstrap, diff --git a/packages/host-service/src/daemon/singleton.test.ts 
b/packages/host-service/src/daemon/singleton.test.ts new file mode 100644 index 00000000000..33fa33d4344 --- /dev/null +++ b/packages/host-service/src/daemon/singleton.test.ts @@ -0,0 +1,114 @@ +// Tests for the daemon supervisor singleton + bootstrap helpers. +// We don't spawn a real daemon here — the singleton is just plumbing +// (DI for the supervisor, fire-and-track promise stash). Real-spawn +// coverage lives in DaemonSupervisor.node-test.ts. + +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import { DaemonSupervisor } from "./DaemonSupervisor.ts"; +import { + __resetSupervisorForTesting, + getSupervisor, + startDaemonBootstrap, + waitForDaemonReady, +} from "./singleton.ts"; + +beforeEach(() => { + __resetSupervisorForTesting(); +}); + +afterEach(() => { + __resetSupervisorForTesting(); +}); + +describe("getSupervisor", () => { + test("returns the same instance across calls", () => { + const a = getSupervisor("/nonexistent"); + const b = getSupervisor("/different"); + // Singleton — second arg is ignored after first construction. + expect(b).toBe(a); + }); + + test("constructs with the provided scriptPath on first call", () => { + const sup = getSupervisor("/some/path/pty-daemon.js"); + // We can't read scriptPath via public API, but we can confirm the + // supervisor was constructed (not null) and uses the path when it + // tries to spawn — `existsSync` check throws "script not found". + expect(sup).toBeInstanceOf(DaemonSupervisor); + }); +}); + +describe("fire-and-track bootstrap", () => { + test("startDaemonBootstrap kicks off ensure without awaiting", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + // Long-running ensure that we control via a manual settle. 
+ await new Promise((r) => setTimeout(r, 50)); + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + const t0 = Date.now(); + startDaemonBootstrap("org-fnt"); + const elapsed = Date.now() - t0; + // Should return immediately, not after the ensure delay. + expect(elapsed).toBeLessThan(20); + expect(ensureMock).toHaveBeenCalledTimes(1); + expect(ensureMock).toHaveBeenCalledWith("org-fnt"); + + // Now await readiness — should complete after ensure resolves. + await waitForDaemonReady("org-fnt"); + // Sanity: ensure was invoked exactly once across both calls. + expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("startDaemonBootstrap is idempotent", () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + await new Promise((r) => setTimeout(r, 100)); + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + startDaemonBootstrap("org-idempotent"); + startDaemonBootstrap("org-idempotent"); + startDaemonBootstrap("org-idempotent"); + expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("waitForDaemonReady kicks off bootstrap if none in flight", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock(async () => { + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + await waitForDaemonReady("org-lazy"); + expect(ensureMock).toHaveBeenCalledTimes(1); + }); + + test("a failed bootstrap is retryable", async () => { + const sup = getSupervisor("/nonexistent"); + let failNext = true; + const ensureMock = mock(async () => { + if (failNext) { + failNext = false; + throw new Error("simulated spawn failure"); + } + return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + // First wait surfaces the failure. 
+ await expect(waitForDaemonReady("org-retry")).rejects.toThrow( + "simulated spawn failure", + ); + // Second wait kicks off a new bootstrap (the failed promise was + // cleared) and succeeds. + await waitForDaemonReady("org-retry"); + expect(ensureMock).toHaveBeenCalledTimes(2); + }); +}); diff --git a/packages/host-service/src/no-electron-coupling.test.ts b/packages/host-service/src/no-electron-coupling.test.ts new file mode 100644 index 00000000000..ad71437bb79 --- /dev/null +++ b/packages/host-service/src/no-electron-coupling.test.ts @@ -0,0 +1,80 @@ +// Asserts host-service source has no Electron coupling. The migration's +// thesis is that host-service is independently deployable; this test +// keeps that promise honest by failing if someone accidentally imports +// electron, uses an Electron global, or shells out to an Electron API. +// +// Why a grep test rather than a real `node dist/host-service.js` smoke +// test: native addons (better-sqlite3, node-pty, @parcel/watcher) are +// marked external in the bundle and currently expect Electron's +// resolution path. Solving the native-addon distribution for headless +// deploy is its own slice. In the meantime this test catches the +// regression class the smoke test was designed to catch: "did we +// re-couple to Electron at the source level?" 
+ +import { describe, expect, test } from "bun:test"; +import * as fs from "node:fs"; +import * as path from "node:path"; + +const SRC_DIR = path.resolve(import.meta.dirname); + +const ELECTRON_PATTERNS = [ + // Imports + /from\s+["']electron["']/, + /from\s+["']@electron[/-]/, + /from\s+["']electron\/(main|renderer)["']/, + /require\(["']electron["']\)/, + // Runtime detection / globals + /process\.versions\.electron/, + /\bapp\.(getPath|getName|getVersion|isPackaged)\b/, + /\bdialog\.(showMessageBox|showSaveDialog|showOpenDialog)\b/, + /\bBrowserWindow\b/, + /\bipcMain\b/, + /\bipcRenderer\b/, +]; + +function* walk(dir: string): Generator { + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + if (entry.name === "node_modules" || entry.name === "dist") continue; + yield* walk(full); + continue; + } + if (!entry.isFile()) continue; + if (!entry.name.endsWith(".ts") && !entry.name.endsWith(".tsx")) continue; + // Skip self. + if (full === import.meta.url.replace(/^file:\/\//, "")) continue; + yield full; + } +} + +describe("host-service has no Electron coupling", () => { + test("no Electron imports or globals in src/", () => { + const offenders: { file: string; line: number; match: string }[] = []; + for (const file of walk(SRC_DIR)) { + const contents = fs.readFileSync(file, "utf-8"); + const lines = contents.split("\n"); + lines.forEach((line, idx) => { + // Skip our own assertions. 
+ if (line.includes("ELECTRON_PATTERNS")) return; + for (const pat of ELECTRON_PATTERNS) { + if (pat.test(line)) { + offenders.push({ + file: path.relative(SRC_DIR, file), + line: idx + 1, + match: line.trim(), + }); + } + } + }); + } + if (offenders.length > 0) { + throw new Error( + `Found Electron coupling in host-service source:\n${offenders + .map((o) => ` ${o.file}:${o.line} ${o.match}`) + .join("\n")}`, + ); + } + expect(offenders.length).toBe(0); + }); +}); diff --git a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts new file mode 100644 index 00000000000..5ce7cc267d7 --- /dev/null +++ b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts @@ -0,0 +1,122 @@ +// Tests for the `terminal.daemon` tRPC procedures. +// +// We exercise the wiring (procedure → supervisor delegation, env.ORGANIZATION_ID +// resolution) against a stubbed singleton supervisor, not a real spawn. +// Real spawn coverage is in src/daemon/DaemonSupervisor.node-test.ts. + +import { beforeEach, describe, expect, mock, test } from "bun:test"; +// We need to control what `getSupervisor()` returns AND what +// `waitForDaemonReady` does. The cleanest way is to install a stub +// supervisor into the singleton via `getSupervisor("...")` (which +// constructs lazily on first call) then monkey-patch the methods we +// care about. +import { __resetSupervisorForTesting, getSupervisor } from "../../../daemon"; + +// Make env.ORGANIZATION_ID resolvable. The env module reads from +// process.env at module load via @t3-oss/env-core, so we must set +// the var BEFORE importing. 
+process.env.ORGANIZATION_ID = "00000000-0000-4000-8000-000000000000"; +process.env.HOST_SERVICE_SECRET = "test-secret"; +process.env.HOST_DB_PATH = "/tmp/test-host.db"; +process.env.HOST_MIGRATIONS_FOLDER = "/tmp/test-migrations"; +process.env.AUTH_TOKEN = "test-auth-token"; +process.env.CLOUD_API_URL = "https://cloud.example.com"; + +const { appRouter } = await import("../router.ts"); + +interface MinimalCtx { + isAuthenticated: boolean; +} + +function makeCaller(authenticated = true) { + // Cast to whatever; we only invoke procedures that don't touch db/git/etc. + return appRouter.createCaller({ + isAuthenticated: authenticated, + } as unknown as Parameters[0]); +} + +beforeEach(() => { + __resetSupervisorForTesting(); +}); + +describe("terminal.daemon tRPC procedures", () => { + test("rejects with UNAUTHORIZED when ctx is unauthenticated", async () => { + const caller = makeCaller(false); + await expect(caller.terminal.daemon.getUpdateStatus()).rejects.toThrow( + /Invalid or missing/, + ); + }); + + test("getUpdateStatus delegates to supervisor", async () => { + const sup = getSupervisor("/nonexistent"); + const getUpdateStatusMock = mock(() => ({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + })); + ( + sup as unknown as { getUpdateStatus: typeof sup.getUpdateStatus } + ).getUpdateStatus = getUpdateStatusMock as typeof sup.getUpdateStatus; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.getUpdateStatus(); + + expect(getUpdateStatusMock).toHaveBeenCalledTimes(1); + expect(getUpdateStatusMock).toHaveBeenCalledWith( + "00000000-0000-4000-8000-000000000000", + ); + expect(result).toEqual({ + pending: true, + running: "0.0.9", + expected: "0.1.0", + }); + }); + + test("listSessions awaits bootstrap before delegating", async () => { + const sup = getSupervisor("/nonexistent"); + const order: string[] = []; + + const ensureMock = mock(async () => { + order.push("ensure"); + await new Promise((r) => setTimeout(r, 30)); + 
return {} as Awaited>; + }); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + + const listMock = mock(async () => { + order.push("list"); + return []; + }); + (sup as unknown as { listSessions: typeof sup.listSessions }).listSessions = + listMock as typeof sup.listSessions; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.listSessions(); + + expect(result).toEqual([]); + // Bootstrap must have started before list resolved. + expect(order[0]).toBe("ensure"); + expect(order).toContain("list"); + }); + + test("restart awaits bootstrap then delegates to supervisor.restart", async () => { + const sup = getSupervisor("/nonexistent"); + const ensureMock = mock( + async () => ({}) as Awaited>, + ); + const restartMock = mock(async () => ({ success: true as const })); + (sup as unknown as { ensure: typeof sup.ensure }).ensure = + ensureMock as typeof sup.ensure; + (sup as unknown as { restart: typeof sup.restart }).restart = + restartMock as typeof sup.restart; + + const caller = makeCaller(); + const result = await caller.terminal.daemon.restart(); + + expect(result).toEqual({ success: true }); + expect(restartMock).toHaveBeenCalledWith( + "00000000-0000-4000-8000-000000000000", + ); + }); +}); From 743d9f392812482b751ac2bf27aaff7701da781e Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 14:12:23 -0700 Subject: [PATCH 24/33] feat(host-service): kill pty-daemon on dev-mode shutdown Per migration plan D5: in dev mode (NODE_ENV !== production), the host-service shutdown handler now stops the supervised daemon before exit. Production still keeps the daemon detached so PTYs survive host-service restarts (the original v2 thesis). Lets dev iteration on daemon code reset cleanly without manually killing the daemon between cycles. 
--- packages/host-service/src/serve.ts | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/packages/host-service/src/serve.ts b/packages/host-service/src/serve.ts index b22d05a26e7..ac39d011cb8 100644 --- a/packages/host-service/src/serve.ts +++ b/packages/host-service/src/serve.ts @@ -1,6 +1,6 @@ import { serve } from "@hono/node-server"; import { createApp } from "./app"; -import { startDaemonBootstrap } from "./daemon"; +import { getSupervisor, startDaemonBootstrap } from "./daemon"; import { env } from "./env"; import { JwtApiAuthProvider } from "./providers/auth"; import { LocalGitCredentialProvider } from "./providers/git"; @@ -43,6 +43,34 @@ async function main(): Promise { }, }); + // Dev-mode shutdown: kill the daemon on host-service exit so dev + // iteration on daemon code resets cleanly. Production keeps the + // daemon detached so PTYs survive host-service restarts. + // Per the migration plan's D5 decision. + const isDev = process.env.NODE_ENV !== "production"; + if (isDev) { + let shuttingDown = false; + const devShutdown = async (signal: NodeJS.Signals) => { + if (shuttingDown) return; + shuttingDown = true; + console.log( + `[host-service] dev-mode ${signal} — stopping pty-daemon for clean iteration`, + ); + try { + await getSupervisor().stop(env.ORGANIZATION_ID); + } catch (err) { + console.error( + "[host-service] dev shutdown: supervisor.stop failed:", + err, + ); + } finally { + process.exit(0); + } + }; + process.on("SIGINT", () => void devShutdown("SIGINT")); + process.on("SIGTERM", () => void devShutdown("SIGTERM")); + } + const server = serve({ fetch: app.fetch, port: env.PORT }, (info) => { // Install only after the server is listening so startup throws still // reach `main().catch(...)` and exit with a non-zero code. 
From ab5413f25a090b48d931e96f773c3716dd37e015 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 14:58:30 -0700 Subject: [PATCH 25/33] fix(desktop): restore pty-daemon bundle target for Electron The supervisor's `sideBySide` path resolution expects pty-daemon.js next to host-service.js in the same dist directory. The Electron deploy bundles host-service into apps/desktop/dist/main/ via electron-vite, and that pipeline still needs an entry to bundle the daemon alongside. Restoring `apps/desktop/src/main/pty-daemon/index.ts` as a thin shim: imports Server from @superset/pty-daemon (workspace dep), parses argv, handles signals, that's it. The daemon implementation still lives entirely in the package. Headless deploys can spawn the package's own main.ts directly via the supervisor's workspace-dist fallback path. --- apps/desktop/src/main/pty-daemon/index.ts | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 apps/desktop/src/main/pty-daemon/index.ts diff --git a/apps/desktop/src/main/pty-daemon/index.ts b/apps/desktop/src/main/pty-daemon/index.ts new file mode 100644 index 00000000000..330755139af --- /dev/null +++ b/apps/desktop/src/main/pty-daemon/index.ts @@ -0,0 +1,79 @@ +/** + * pty-daemon — Desktop bundle target + * + * The supervisor (in @superset/host-service) spawns this script as the + * daemon process. We need a desktop-side entry so electron-vite emits + * `apps/desktop/dist/main/pty-daemon.js` alongside `host-service.js` — + * the supervisor's `sideBySide` script-path resolution looks for the + * daemon binary right next to its own bundle. + * + * The actual daemon implementation lives in `@superset/pty-daemon`. + * This file is a thin runtime shim: argv parsing, signal handling, + * and starting the Server. Mirrors the layout host-service uses + * (apps/desktop/src/main/host-service/index.ts). 
+ * + * Headless deploy path: in a non-Electron build, this file is unused — + * the supervisor instead spawns the @superset/pty-daemon package's + * built-in main.ts directly. + */ + +import { Server } from "@superset/pty-daemon"; + +interface CliArgs { + socket: string; +} + +function parseArgs(argv: string[]): CliArgs { + const args: Partial = {}; + for (const arg of argv) { + if (arg.startsWith("--socket=")) { + args.socket = arg.slice("--socket=".length); + } + } + if (!args.socket) { + throw new Error("--socket=PATH is required"); + } + return args as CliArgs; +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + // Source of truth for daemon version — the supervisor sets this env + // var on spawn (matching its EXPECTED_DAEMON_VERSION). Falls back to + // a hardcoded default if launched without env, so the daemon still + // reports something sane on direct invocation. + const daemonVersion = process.env.SUPERSET_PTY_DAEMON_VERSION ?? "0.1.0"; + const server = new Server({ + socketPath: args.socket, + daemonVersion, + }); + await server.listen(); + process.stderr.write( + `[pty-daemon] listening on ${args.socket} (v${daemonVersion})\n`, + ); + + let shuttingDown = false; + const shutdown = async (signal: NodeJS.Signals) => { + if (shuttingDown) return; + shuttingDown = true; + process.stderr.write(`[pty-daemon] received ${signal}, shutting down\n`); + try { + await server.close(); + } catch (err) { + process.stderr.write( + `[pty-daemon] shutdown error: ${(err as Error).stack ?? err}\n`, + ); + } finally { + process.exit(0); + } + }; + process.on("SIGINT", () => void shutdown("SIGINT")); + process.on("SIGTERM", () => void shutdown("SIGTERM")); +} + +void main().catch((error) => { + process.stderr.write( + `[pty-daemon] failed to start: ${(error as Error).stack ?? 
error}\n`, + ); + process.exit(1); +}); From 7d34cad082ad0b85df406e2915609533acdf071c Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 15:14:12 -0700 Subject: [PATCH 26/33] fix(desktop): wrap V2SessionsSection in WorkspaceClientProvider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Settings → Terminal route lives outside any WorkspaceClientProvider (those are per-workspace). Without one, workspaceTrpc hooks fall through to electron-trpc, which has no `terminal.daemon` namespace (we removed the desktop-side proxy in the migration). The renderer silently failed with "no procedure on path terminal.daemon.*". V2SessionsSection now mounts its own WorkspaceClientProvider keyed to the active org's host URL from LocalHostServiceProvider. Hooks now reach host-service over HTTP correctly. Also adds light startup logging on host-service: - `[host-service] starting (org=..., port=..., NODE_ENV=...)` - `[supervisor] kicking off bootstrap for org=...` - `[supervisor] bootstrap OK for org=... pid=... version=... [update pending]` - `[supervisor] bootstrap failed for org=...` These survived the migration debugging session and are useful as production startup-trace lines. Per-call procedure logs were stripped to keep noise low. 
--- .../V2SessionsSection/V2SessionsSection.tsx | 56 ++++++++++++++++++- packages/host-service/src/daemon/singleton.ts | 27 ++++++--- packages/host-service/src/serve.ts | 4 ++ 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx index 2b65e181aab..f46c6eed9bc 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/terminal/components/TerminalSettings/components/V2SessionsSection/V2SessionsSection.tsx @@ -6,6 +6,13 @@ // per-row kill. Restart already achieves the kill-all effect for v2; // scrollback is owned per-session by the daemon's ring buffer with no // disk persistence; per-row kill belongs in the renderer's pane controls. +// +// Provider plumbing: workspaceTrpc needs a WorkspaceClientProvider with a +// real host URL. Settings routes are *outside* any per-workspace provider +// (they're org-level), so we mount our own here using the active org's +// host URL from LocalHostServiceProvider. Without this wrapping, hooks +// fall through to electron-trpc and fail with "no procedure on path +// terminal.daemon.*" — there's no such namespace on electron-trpc. 
import { AlertDialog, @@ -18,12 +25,45 @@ import { import { Button } from "@superset/ui/button"; import { Label } from "@superset/ui/label"; import { toast } from "@superset/ui/sonner"; -import { workspaceTrpc } from "@superset/workspace-client"; +import { + WorkspaceClientProvider, + workspaceTrpc, +} from "@superset/workspace-client"; import { useState } from "react"; +import { + getHostServiceHeaders, + getHostServiceWsToken, +} from "renderer/lib/host-service-auth"; +import { useLocalHostService } from "renderer/routes/_authenticated/providers/LocalHostServiceProvider"; const REFETCH_WHILE_OPEN_MS = 5_000; export function V2SessionsSection() { + const { activeHostUrl } = useLocalHostService(); + if (!activeHostUrl) { + return ( +
+ +

+ Host service is starting… +

+
+ ); + } + return ( + getHostServiceHeaders(activeHostUrl)} + wsToken={() => getHostServiceWsToken(activeHostUrl)} + > + + + ); +} + +function V2SessionsSectionInner() { const [confirmRestartOpen, setConfirmRestartOpen] = useState(false); const [showSessionList, setShowSessionList] = useState(false); @@ -40,6 +80,20 @@ export function V2SessionsSection() { refetchOnWindowFocus: true, }, ); + // Surface query errors so they're visible in renderer logs even when + // the section's UI gracefully degrades to "Daemon unavailable". + if (updateStatusQuery.error) { + console.error( + "[V2SessionsSection] getUpdateStatus error:", + updateStatusQuery.error, + ); + } + if (sessionsQuery.error) { + console.error( + "[V2SessionsSection] listSessions error:", + sessionsQuery.error, + ); + } const restartDaemon = workspaceTrpc.terminal.daemon.restart.useMutation({ onSuccess: () => { diff --git a/packages/host-service/src/daemon/singleton.ts b/packages/host-service/src/daemon/singleton.ts index d4fd60b8e1d..2477dbeb6df 100644 --- a/packages/host-service/src/daemon/singleton.ts +++ b/packages/host-service/src/daemon/singleton.ts @@ -63,15 +63,24 @@ export function getSupervisor(scriptPath?: string): DaemonSupervisor { export function startDaemonBootstrap(organizationId: string): void { if (bootstrapPromise) return; const sup = getSupervisor(); - bootstrapPromise = sup.ensure(organizationId).catch((err) => { - console.error( - `[host-service] pty-daemon bootstrap failed for org=${organizationId}:`, - err, - ); - // Reset so a future request can retry. - bootstrapPromise = null; - throw err; - }); + console.log(`[supervisor] kicking off bootstrap for org=${organizationId}`); + bootstrapPromise = sup + .ensure(organizationId) + .then((inst) => { + console.log( + `[supervisor] bootstrap OK for org=${organizationId} pid=${inst.pid} version=${inst.runningVersion}${inst.updatePending ? 
" (update pending)" : ""}`, + ); + return inst; + }) + .catch((err) => { + console.error( + `[supervisor] bootstrap failed for org=${organizationId}:`, + err, + ); + // Reset so a future request can retry. + bootstrapPromise = null; + throw err; + }); } /** diff --git a/packages/host-service/src/serve.ts b/packages/host-service/src/serve.ts index ac39d011cb8..3c787498fd1 100644 --- a/packages/host-service/src/serve.ts +++ b/packages/host-service/src/serve.ts @@ -11,6 +11,10 @@ import { initTerminalBaseEnv, resolveTerminalBaseEnv } from "./terminal/env"; import { connectRelay } from "./tunnel"; async function main(): Promise { + console.log( + `[host-service] starting (org=${env.ORGANIZATION_ID}, port=${env.PORT}, NODE_ENV=${process.env.NODE_ENV ?? "unset"})`, + ); + const terminalBaseEnv = await resolveTerminalBaseEnv(); initTerminalBaseEnv(terminalBaseEnv); From 285179ab8206416f3b71008fbbd4d16917aaca1f Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 15:51:21 -0700 Subject: [PATCH 27/33] fix(host-service): correct sideBySide daemon-script path resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolveSupervisorScriptPath was walking two extra levels (`..`, `..`) when looking for pty-daemon.js next to host-service.js, which in the electron-vite bundle resolved to apps/desktop/pty-daemon.js (doesn't exist). The bundle emits both files in the same dist/main/ directory, so the path is just `path.resolve(here, "pty-daemon.js")`. Manifested as "[pty-daemon] script not found at apps/pty-daemon/dist/ pty-daemon.js" when triggering Restart daemon from Settings — the sideBySide check failed and the supervisor fell through to the workspace-source fallback path, which doesn't apply in a bundled deploy. 
--- packages/host-service/src/daemon/singleton.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/host-service/src/daemon/singleton.ts b/packages/host-service/src/daemon/singleton.ts index 2477dbeb6df..08e35949ee6 100644 --- a/packages/host-service/src/daemon/singleton.ts +++ b/packages/host-service/src/daemon/singleton.ts @@ -24,16 +24,16 @@ export function resolveSupervisorScriptPath(): string { if (override) return override; const here = path.dirname(fileURLToPath(import.meta.url)); - // Production: host-service.js and pty-daemon.js live side-by-side in - // the same bundled dist/. `here` is `/daemon/` after bundling - // (or close to it); two levels up + `pty-daemon.js` resolves there. - const sideBySide = path.resolve(here, "..", "..", "pty-daemon.js"); + // Production / dev (electron-vite bundle): host-service.js and + // pty-daemon.js are emitted side-by-side in the same dist directory, + // so `here` and the daemon entry share a parent. + const sideBySide = path.resolve(here, "pty-daemon.js"); if (existsSync(sideBySide)) return sideBySide; - // Dev mode running from source: `here` is - // `packages/host-service/src/daemon/`; the daemon's bundled entry - // sits at `packages/pty-daemon/dist/pty-daemon.js` after `bun run - // build:daemon` in that package. + // Source-running fallback (`bun run` from packages/host-service): + // `here` is `packages/host-service/src/daemon/`; the daemon's bundled + // entry sits at `packages/pty-daemon/dist/pty-daemon.js` after + // `bun run build:daemon` in that package. 
const workspaceDist = path.resolve( here, "..", From 9697ce8f15f7e41d2bdc99823dab834f21e016da Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 16:20:28 -0700 Subject: [PATCH 28/33] fix(pty-daemon): delete sessions on PTY exit (no accumulation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server.onExit marked sessions as exited and fanned out the exit event but never deleted them from the SessionStore. Comment claimed "we delete on next list/close" but neither path did. Result: every closed terminal pane left a permanent row in the daemon's map — list-reply inflated, memory grew unbounded over time. Now: delete the session row immediately after fanning out the exit event. Also clear matching subscriptions on the live conns so they don't carry a stale id forward. Tradeoff: a late subscriber that connects after exit (e.g. host-service restarting *during* the exit window) gets ENOENT instead of buffered output + exit event. The renderer's xterm.js already has whatever was rendered before disconnect — what's lost is just the "Process exited with code N" footer for that narrow window. Accepted per project preference for simplest invariants. Updated tests: dropped subscribe-with-replay-on-exited (its premise no longer holds) and replaced with a non-accumulation assertion + ENOENT expectation on post-exit input. EEXITED is no longer a returned code (still defined in protocol for forward-compat). 
--- packages/pty-daemon/src/Server/Server.ts | 18 +++++-- .../pty-daemon/test/control-plane.test.ts | 52 +++++++++---------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/packages/pty-daemon/src/Server/Server.ts b/packages/pty-daemon/src/Server/Server.ts index 3395c54502e..5dbc619e17a 100644 --- a/packages/pty-daemon/src/Server/Server.ts +++ b/packages/pty-daemon/src/Server/Server.ts @@ -234,10 +234,22 @@ export class Server { signal: info.signal, }; for (const c of this.conns) { - if (c.subscriptions.has(session.id)) c.send(ev); + if (c.subscriptions.has(session.id)) { + c.send(ev); + c.subscriptions.delete(session.id); + } } - // Keep the session row around briefly so a late subscriber can still - // fetch the buffer; we delete on next list/close. + // Delete the session immediately. Without this, every closed + // terminal pane left a row in the store forever — list-reply + // inflated, memory grew unbounded. + // + // Tradeoff: a late subscriber that connects after this point + // (e.g. host-service restarting *during* the shell exit window) + // gets ENOENT instead of the buffered output + exit event. The + // renderer's xterm.js already has whatever was rendered before + // disconnect — it just loses the "Process exited with code N" + // footer for that narrow window. 
+ this.store.delete(session.id); }); } } diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts index 6999e34c944..2fe60fbf0bb 100644 --- a/packages/pty-daemon/test/control-plane.test.ts +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -646,11 +646,13 @@ describe("cross-client continuity (host-service restart simulation)", () => { await b.close(); }); - test("subscribe-with-replay on an already-exited session yields buffered output + immediate exit event", async () => { - // After a host-service restart, a shell that exited during the gap - // should still surface its final output AND its exit event when the - // new host-service subscribes. Otherwise the renderer hangs waiting - // for output that will never come. + test("exited sessions are deleted immediately (no accumulation)", async () => { + // Sessions are removed from the store the moment their PTY exits. + // Late subscribers (e.g. host-service restarting in the exit gap) + // get ENOENT — the renderer falls back to a generic "session + // unavailable" footer. Tradeoff: niche UX regression in the + // restart-during-exit window vs. unbounded session accumulation + // (every closed terminal pane otherwise left a row forever). const a = await connectAndHello(sockPath); const id = uniqueId("postexit"); a.send({ @@ -661,34 +663,23 @@ describe("cross-client continuity (host-service restart simulation)", () => { await a.waitFor((m) => m.type === "open-ok" && m.id === id); a.send({ type: "subscribe", id, replay: true }); await a.waitFor((m) => m.type === "exit" && m.id === id, 3000); - // Connection A drops without explicit close — session enters - // alive:false state but is still in the daemon's map. a.socket.destroy(); + // Give the on-exit handler a beat to run its store.delete. await new Promise((r) => setTimeout(r, 100)); + // New connection: late subscribe gets nothing useful for a + // vanished id. 
We assert that the session is gone from list and + // that an op on the id returns ENOENT. const b = await connectAndHello(sockPath); - b.send({ type: "subscribe", id, replay: true }); - await b.waitFor( - (m) => - m.type === "output" && - m.id === id && - Buffer.from(m.data, "base64").toString().includes("final-words"), - 2000, - ); - // Note: an exit event for an already-exited session is best-effort — - // the daemon's `wireSession` only fires onExit once when the shell - // actually dies. A late subscriber sees the buffered output and can - // observe `alive:false` via `list`. This test asserts the buffer - // behavior; the host-service supplements with a `list` check before - // declaring the session dead. b.send({ type: "list" }); - const reply = await b.waitFor((m) => m.type === "list-reply"); + const reply = await b.waitFor((m) => m.type === "list-reply", 1000); if (reply.type === "list-reply") { - const me = reply.sessions.find((s) => s.id === id); - assert.ok(me, "exited session should still be in list"); - assert.equal(me?.alive, false); + const found = reply.sessions.find((s) => s.id === id); + assert.equal(found, undefined, "exited session should not be in list"); } - b.send({ type: "close", id, signal: "SIGTERM" }); + b.send({ type: "close", id }); + const err = await b.waitFor((m) => m.type === "error", 1000); + if (err.type === "error") assert.equal(err.code, "ENOENT"); await b.close(); }); @@ -773,7 +764,11 @@ describe("hostile input", () => { await c.close(); }); - test("input on already-exited session returns EEXITED", async () => { + test("input on a session that just exited returns ENOENT", async () => { + // Exit deletes the session row, so post-exit input lands on + // "unknown session" — same code path as input on a never-existed + // id. EEXITED is no longer returned because there's no exited + // session to be "exited"; it's just gone. 
const c = await connectAndHello(sockPath); const id = uniqueId("dead"); c.send({ @@ -784,6 +779,7 @@ describe("hostile input", () => { await c.waitFor((m) => m.type === "open-ok" && m.id === id); c.send({ type: "subscribe", id, replay: true }); await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await new Promise((r) => setTimeout(r, 50)); c.send({ type: "input", @@ -791,7 +787,7 @@ describe("hostile input", () => { data: Buffer.from("ignored").toString("base64"), }); const err = await c.waitFor((m) => m.type === "error", 1000); - if (err.type === "error") assert.equal(err.code, "EEXITED"); + if (err.type === "error") assert.equal(err.code, "ENOENT"); await c.close(); }); }); From d85d3fc9491fe7bfbf83da0d76db146adf402865 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 21:55:50 -0700 Subject: [PATCH 29/33] feat(host-service): adopted-daemon liveness check + dev-mode log piping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related dev-quality fixes uncovered during manual QA: 1. Adopted daemons aren't tracked by `child.on("exit")` — the supervisor only attaches that handler to daemons it spawned. When an adopted daemon dies externally (kill -9, OOM, etc.) the supervisor's `instances` map carries a stale entry forever: `getSocketPath` returns a socket nobody's listening on, terminal ops fail with ECONNREFUSED until something forces a restart. Fix: poll `process.kill(pid, 0)` every 2s for adopted PIDs. On detected death, clear the instance + manifest so the next `ensure()` respawns. Added integration test: "detects when an adopted daemon dies externally". 2. host-service and pty-daemon stdout went to per-org rotating log files in BOTH dev and prod, so dev iteration had no live log visibility — every diagnostic required tailing files. 
Now in dev (NODE_ENV !== production) child stdout/stderr pipes through to the parent (host-service → desktop main → bun dev), each line tagged with `[hs:]` or `[ptyd:]`. Production stdio still backs to the rotating log file so detached children can outlive the parent without losing logs. Helper `pipeWithPrefix` splits chunks on \n so multi-line bursts keep the prefix on every line (was: only the first line). --- .../src/main/lib/host-service-coordinator.ts | 48 ++++++++- .../src/daemon/DaemonSupervisor.node-test.ts | 50 ++++++++++ .../src/daemon/DaemonSupervisor.ts | 99 ++++++++++++++++++- 3 files changed, 192 insertions(+), 5 deletions(-) diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index c78366ec4da..c82d2dd6cb2 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -401,8 +401,16 @@ export class HostServiceCoordinator extends EventEmitter { path.join(manifestDir(organizationId), "host-service.log"), MAX_HOST_LOG_BYTES, ); - const stdio: childProcess.StdioOptions = - logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; + // Dev: pipe child stdout/stderr through this process so log lines + // land in the developer's `bun dev` terminal. Production: hard-back + // stdio with the rotating log file so the detached child survives + // parent teardown without losing logs. + const isDev = !app.isPackaged; + const stdio: childProcess.StdioOptions = isDev + ? ["ignore", "pipe", "pipe"] + : logFd >= 0 + ? ["ignore", logFd, logFd] + : ["ignore", "ignore", "ignore"]; let child: ReturnType; try { @@ -423,6 +431,15 @@ export class HostServiceCoordinator extends EventEmitter { } } + // In dev, fan child output through to parent stdout/stderr with a + // prefix so it's identifiable in `bun dev`. The detached child has + // its own session, so closing pipes won't kill it on parent exit. 
+ if (isDev && child.stdout && child.stderr) { + const tag = `[hs:${organizationId.slice(0, 8)}]`; + pipeWithPrefix(child.stdout, process.stdout, tag); + pipeWithPrefix(child.stderr, process.stderr, tag); + } + const childPid = child.pid; if (!childPid) { this.instances.delete(organizationId); @@ -549,6 +566,33 @@ export class HostServiceCoordinator extends EventEmitter { } } +/** + * Forward child stdout/stderr to a parent stream with a per-line prefix. + * Plain `chunk => parent.write(`${tag} ${chunk}`)` only prefixes the first + * line in a chunk and breaks visual scanning when child output bursts. + */ +function pipeWithPrefix( + source: NodeJS.ReadableStream, + target: NodeJS.WritableStream, + tag: string, +): void { + let pending = ""; + source.on("data", (chunk: Buffer) => { + const text = pending + chunk.toString("utf8"); + const lines = text.split("\n"); + // Last element is a partial line if input doesn't end with \n; + // stash it for the next chunk. + pending = lines.pop() ?? ""; + for (const line of lines) { + target.write(`${tag} ${line}\n`); + } + }); + source.on("end", () => { + if (pending) target.write(`${tag} ${pending}\n`); + pending = ""; + }); +} + let coordinator: HostServiceCoordinator | null = null; export function getHostServiceCoordinator(): HostServiceCoordinator { diff --git a/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts index cbc10a9cf18..9e954e541a3 100644 --- a/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts +++ b/packages/host-service/src/daemon/DaemonSupervisor.node-test.ts @@ -221,6 +221,56 @@ describe("DaemonSupervisor.ensure (real spawn)", () => { assert.ok(after, "expected a respawned instance"); assert.notEqual(after.pid, aPid); }); + + test("detects when an adopted daemon dies externally", async () => { + // Adopted daemons (PIDs from a manifest, not spawned children) + // don't fire `child.on("exit")` when killed externally. 
The + // supervisor must poll PID liveness to notice and clear the + // stale instance so the next ensure() respawns. Without this, + // host-service would keep handing out a dead socket path until + // something else forced a restart. + const orgId = "org-adopted-died"; + + // Supervisor A spawns the daemon. We'll then construct a + // supervisor B that adopts via manifest, verify the adopted + // PID, kill it externally, and assert B clears its instance. + const supA = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + const a = await supA.ensure(orgId); + const adoptedPid = a.pid; + + const supB = new DaemonSupervisor({ scriptPath: DAEMON_BUNDLE }); + supervisorsToCleanup.push({ sup: supB, orgId }); + const b = await supB.ensure(orgId); + assert.equal(b.pid, adoptedPid, "B should adopt A's daemon"); + + // Externally kill the adopted daemon. supB only adopted it (no + // child handle), so only its poller can catch this. NOTE(review): + // supA spawned it and may also react via its exit handler — confirm. + process.kill(adoptedPid, "SIGKILL"); + + // Wait up to 6s for the liveness poller (2s interval) to fire. + const deadline = Date.now() + 6000; + while (Date.now() < deadline) { + const inst = ( + supB as unknown as { instances: Map } + ).instances.get(orgId); + if (!inst) break; + await new Promise((r) => setTimeout(r, 200)); + } + const after = ( + supB as unknown as { instances: Map } + ).instances.get(orgId); + assert.equal( + after, + undefined, + "supervisor should have cleared the dead adopted instance", + ); + + // Next ensure() should respawn fresh.
+ const fresh = await supB.ensure(orgId); + assert.notEqual(fresh.pid, adoptedPid); + assert.equal(isAlive(fresh.pid), true); + }); }); function isAlive(pid: number): boolean { diff --git a/packages/host-service/src/daemon/DaemonSupervisor.ts b/packages/host-service/src/daemon/DaemonSupervisor.ts index ca4cc39a65a..0035326f107 100644 --- a/packages/host-service/src/daemon/DaemonSupervisor.ts +++ b/packages/host-service/src/daemon/DaemonSupervisor.ts @@ -56,6 +56,8 @@ const VERSION_PROBE_TIMEOUT_MS = 1_500; */ const CRASH_BUDGET = 3; const CRASH_WINDOW_MS = 60_000; +/** How often to poll an adopted daemon's PID for liveness. */ +const ADOPTED_LIVENESS_INTERVAL_MS = 2_000; /** * Per-organization socket path. **Must stay short** — Darwin's `sun_path` @@ -105,6 +107,16 @@ export class DaemonSupervisor { * Debounce — re-fire only when either side changes. */ private readonly lastUpdatePendingPair = new Map(); + /** + * Liveness pollers per org. We only attach a `child.on("exit")` handler + * to daemons we *spawned* — adopted daemons (PIDs from a manifest) have + * no child handle, so we'd never notice if they died externally. This + * timer polls `process.kill(pid, 0)` to bridge that gap. + */ + private readonly adoptedLivenessTimers = new Map< + string, + ReturnType + >(); constructor(opts: DaemonSupervisorOptions) { this.opts = opts; @@ -226,6 +238,7 @@ export class DaemonSupervisor { async stop(organizationId: string): Promise { const instance = this.instances.get(organizationId); this.instances.delete(organizationId); + this.stopAdoptedLivenessCheck(organizationId); if (!instance) return; this.stopping.add(organizationId); try { @@ -236,6 +249,42 @@ export class DaemonSupervisor { removePtyDaemonManifest(organizationId); } + /** + * Poll an adopted daemon's liveness. Adopted daemons are PIDs we + * inherited via the manifest — we never spawned them as a child, so + * `child.on("exit")` doesn't fire when they die. 
Without this poller + * the supervisor's `instances` map carries a stale entry forever: + * `getSocketPath` returns a socket nobody's listening on, terminal + * ops fail with "ECONNREFUSED" until something forces a restart. + * + * On detected death: clear the instance + manifest so the next + * `ensure()` call respawns. + */ + private startAdoptedLivenessCheck(organizationId: string, pid: number): void { + this.stopAdoptedLivenessCheck(organizationId); + const timer = setInterval(() => { + if (isProcessAlive(pid)) return; + console.log( + `[pty-daemon:${organizationId}] adopted process ${pid} died — clearing instance for next-ensure respawn`, + ); + this.stopAdoptedLivenessCheck(organizationId); + const current = this.instances.get(organizationId); + if (current?.pid === pid) { + this.instances.delete(organizationId); + removePtyDaemonManifest(organizationId); + } + }, ADOPTED_LIVENESS_INTERVAL_MS); + this.adoptedLivenessTimers.set(organizationId, timer); + } + + private stopAdoptedLivenessCheck(organizationId: string): void { + const timer = this.adoptedLivenessTimers.get(organizationId); + if (timer) { + clearInterval(timer); + this.adoptedLivenessTimers.delete(organizationId); + } + } + private async start(organizationId: string): Promise { const adopted = await this.tryAdopt(organizationId); if (adopted) { @@ -252,6 +301,7 @@ export class DaemonSupervisor { updatePending: adopted.updatePending, }); this.maybeFireUpdatePending(organizationId, adopted); + this.startAdoptedLivenessCheck(organizationId, adopted.pid); return adopted; } @@ -341,9 +391,17 @@ export class DaemonSupervisor { ); } - const logFd = openRotatingLogFd(logPath, MAX_DAEMON_LOG_BYTES); - const stdio: childProcess.StdioOptions = - logFd >= 0 ? ["ignore", logFd, logFd] : ["ignore", "ignore", "ignore"]; + // Dev: pipe daemon stdout/stderr through host-service so log lines + // flow up to the developer's `bun dev` terminal. 
Production: + // hard-back stdio with the rotating log file so the detached + // daemon survives host-service teardown without losing logs. + const isDev = process.env.NODE_ENV !== "production"; + const logFd = isDev ? -1 : openRotatingLogFd(logPath, MAX_DAEMON_LOG_BYTES); + const stdio: childProcess.StdioOptions = isDev + ? ["ignore", "pipe", "pipe"] + : logFd >= 0 + ? ["ignore", logFd, logFd] + : ["ignore", "ignore", "ignore"]; const childEnv = { ...(process.env as Record), @@ -385,6 +443,15 @@ export class DaemonSupervisor { throw new Error(`[pty-daemon:${organizationId}] failed to spawn`); } + // Dev: fan daemon stdout/stderr up to host-service stdout (which + // itself flows up to `bun dev`). Production stdio is backed by the + // rotating log file already (logFd above), so no fan-out needed. + if (isDev && child.stdout && child.stderr) { + const tag = `[ptyd:${organizationId.slice(0, 8)}]`; + pipeWithPrefix(child.stdout, process.stdout, tag); + pipeWithPrefix(child.stderr, process.stderr, tag); + } + let earlyExitCode: number | null = null; let earlyExitSignal: NodeJS.Signals | null = null; child.once("exit", (code, signal) => { @@ -495,6 +562,32 @@ export class DaemonSupervisor { } } +/** + * Forward child stdout/stderr to a parent stream with a per-line prefix. + * Plain `chunk => parent.write(`${tag} ${chunk}`)` only prefixes the first + * line in a chunk; bursts of multi-line output lose the prefix on + * subsequent lines. + */ +function pipeWithPrefix( + source: NodeJS.ReadableStream, + target: NodeJS.WritableStream, + tag: string, +): void { + let pending = ""; + source.on("data", (chunk: Buffer) => { + const text = pending + chunk.toString("utf8"); + const lines = text.split("\n"); + pending = lines.pop() ?? 
""; + for (const line of lines) { + target.write(`${tag} ${line}\n`); + } + }); + source.on("end", () => { + if (pending) target.write(`${tag} ${pending}\n`); + pending = ""; + }); +} + async function waitForSocket( socketPath: string, timeoutMs: number, From 0a8f06cbc604fcafc5b63fd066be3a4c522e3b69 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 22:22:19 -0700 Subject: [PATCH 30/33] =?UTF-8?q?fix(pty-daemon):=20default=20close=20to?= =?UTF-8?q?=20SIGHUP=20=E2=80=94=20interactive=20shells=20leak=20on=20SIGT?= =?UTF-8?q?ERM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chain (DaemonClient.close, daemon handleClose, DaemonPty.kill) defaulted to SIGTERM. Interactive shells (especially `zsh -l`, the default macOS login shell) trap SIGTERM and stay alive — so every closed v2 terminal pane leaked a PTY process and a daemon session until something else SIGKILL'd it. Verified: PID 46234 (`zsh -l`, status `Ss+`) survived the v2 pane-close path (which sends SIGTERM by default). Manual `kill -HUP 46234` killed it cleanly. SIGHUP is the right semantic — it's what the kernel sends when a TTY actually closes. Default changed to SIGHUP at all three layers; explicit signals still pass through for callers that need stronger termination (e.g. SIGKILL for hung shells in test). Regression test added: "default close (SIGHUP) terminates an interactive login shell". Without the fix, this would time out waiting for the exit event. The earlier integration tests didn't catch this because they used non-interactive scripts (`-c "true"`) that exit naturally — no signal handling involved. 
--- .../src/terminal/DaemonClient/DaemonClient.ts | 2 +- .../host-service/src/terminal/terminal.ts | 2 +- packages/pty-daemon/src/handlers/handlers.ts | 7 ++++- .../pty-daemon/test/control-plane.test.ts | 28 +++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts index 7afa7823245..07cc79541f4 100644 --- a/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts +++ b/packages/host-service/src/terminal/DaemonClient/DaemonClient.ts @@ -126,7 +126,7 @@ export class DaemonClient { throw new Error(`open ${id}: unexpected reply ${reply.type}`); } - async close(id: string, signal: Signal = "SIGTERM"): Promise { + async close(id: string, signal: Signal = "SIGHUP"): Promise { const reply = await this.requestSession( id, { type: "close", id, signal }, diff --git a/packages/host-service/src/terminal/terminal.ts b/packages/host-service/src/terminal/terminal.ts index 79695539750..44a638af6c8 100644 --- a/packages/host-service/src/terminal/terminal.ts +++ b/packages/host-service/src/terminal/terminal.ts @@ -75,7 +75,7 @@ function makeDaemonPty( daemon .close( sessionId, - (signal as "SIGTERM" | "SIGKILL" | "SIGINT" | "SIGHUP") ?? "SIGTERM", + (signal as "SIGTERM" | "SIGKILL" | "SIGINT" | "SIGHUP") ?? "SIGHUP", ) .catch(() => { // Already gone or daemon disconnected — no-op. diff --git a/packages/pty-daemon/src/handlers/handlers.ts b/packages/pty-daemon/src/handlers/handlers.ts index 145f9b08360..24b249fc7bf 100644 --- a/packages/pty-daemon/src/handlers/handlers.ts +++ b/packages/pty-daemon/src/handlers/handlers.ts @@ -103,7 +103,12 @@ export function handleClose(ctx: HandlerCtx, msg: CloseMessage): ServerMessage { const session = ctx.store.get(msg.id); if (!session) return errorFor(msg.id, `unknown session: ${msg.id}`, "ENOENT"); try { - session.pty.kill(msg.signal ?? 
"SIGTERM"); + // SIGHUP is the right signal for "your terminal is going away" — + // what the kernel sends when a TTY actually closes. Interactive + // shells (especially `zsh -l`) trap SIGTERM and stay alive, so + // using SIGTERM as the default leaks PTY processes on every + // pane close. Callers can still pass an explicit signal. + session.pty.kill(msg.signal ?? "SIGHUP"); } catch (err) { return errorFor(msg.id, (err as Error).message, "EKILL"); } diff --git a/packages/pty-daemon/test/control-plane.test.ts b/packages/pty-daemon/test/control-plane.test.ts index 2fe60fbf0bb..3a990a69c43 100644 --- a/packages/pty-daemon/test/control-plane.test.ts +++ b/packages/pty-daemon/test/control-plane.test.ts @@ -164,6 +164,34 @@ describe("session lifecycle", () => { await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); await c.close(); }); + + test("default close (SIGHUP) terminates an interactive login shell", async () => { + // Regression test for a real-world bug: SIGTERM is the wrong default + // for "user closed the terminal pane" because interactive shells + // (especially `zsh -l`) trap SIGTERM and stay alive. The kernel + // sends SIGHUP when a TTY closes, and shells DO honor it. Without + // this, every closed v2 terminal pane leaked a zsh process. + const c = await connectAndHello(sockPath); + const id = uniqueId("interactive"); + // `-i` forces interactive mode even though stdin is a PTY pipe; + // matches the real terminal-launch shape closely enough for this + // regression to fire if someone reverts to SIGTERM. + c.send({ + type: "open", + id, + meta: { ...baseMeta, argv: ["-i"] }, + }); + await c.waitFor((m) => m.type === "open-ok" && m.id === id); + c.send({ type: "subscribe", id, replay: false }); + + // Default close — no explicit signal. Server defaults to SIGHUP. + c.send({ type: "close", id }); + await c.waitFor((m) => m.type === "closed" && m.id === id); + // Critical: the shell must actually exit. 
If the default + // reverted to SIGTERM (the bug), this waitFor would time out. + await c.waitFor((m) => m.type === "exit" && m.id === id, 3000); + await c.close(); + }); }); // ---------------- I/O patterns ---------------- From 69d3d6122b5d8ade4927e9fb8512ba98436fd3ba Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 23:35:51 -0700 Subject: [PATCH 31/33] chore(host-service): bump version to 0.5.0 to force fresh respawn on upgrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pty-daemon supervision migration adds a new `terminal.daemon` tRPC namespace and changes host-service-internal lifecycle (supervisor owns the daemon now, dev-mode stdio piping, etc.). Existing 0.4.x host-services running on user machines don't have any of this. Without this version bump, the desktop coordinator's `tryAdopt` would adopt the old host-service in place — Settings → Manage daemon would 404 on the new procedures, and the v2 PTY-survival promise (the whole point of this PR) would silently not engage until something else forced a restart. Bumping HOST_SERVICE_VERSION + MIN_HOST_SERVICE_VERSION to 0.5.0 forces the coordinator to SIGTERM old host-services on first launch of the new desktop build and respawn from the new bundle. One-time terminal-session loss for users on upgrade — covered in release notes.
--- .../desktop/src/main/lib/host-service-coordinator.ts | 12 +++++++++++- packages/host-service/src/trpc/router/host/host.ts | 8 +++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/apps/desktop/src/main/lib/host-service-coordinator.ts b/apps/desktop/src/main/lib/host-service-coordinator.ts index c82d2dd6cb2..decb39d9d7b 100644 --- a/apps/desktop/src/main/lib/host-service-coordinator.ts +++ b/apps/desktop/src/main/lib/host-service-coordinator.ts @@ -41,8 +41,18 @@ import { HOOK_PROTOCOL_VERSION } from "./terminal/env"; * `device.ensureV2Host`); v2_hosts/v2_users_hosts/v2_workspaces use * machineId text instead of uuid surrogates. * 0.2.0: `workspaceCreation.adopt` gained optional `worktreePath`. + * + * 0.5.0 — pty-daemon supervision migrated into host-service. New + * `terminal.daemon` tRPC namespace; older 0.4.x host-services don't + * expose it. Adopting one in place would leave the new desktop + * talking to old code: Settings → Manage daemon would silently + * fail, and the v2 PTY survival promise would be broken. Bumping the + * floor forces the coordinator's `tryAdopt` (host-service-coordinator + * line ~308) to SIGTERM old host-services on first launch and + * respawn with the new bundle. One-time terminal-session loss for + * users on upgrade — accepted per release-notes guidance. */ -const MIN_HOST_SERVICE_VERSION = "0.4.0"; +const MIN_HOST_SERVICE_VERSION = "0.5.0"; export type HostServiceStatus = "starting" | "running" | "stopped"; diff --git a/packages/host-service/src/trpc/router/host/host.ts b/packages/host-service/src/trpc/router/host/host.ts index d587b0e2501..313a3cf76f1 100644 --- a/packages/host-service/src/trpc/router/host/host.ts +++ b/packages/host-service/src/trpc/router/host/host.ts @@ -12,7 +12,13 @@ import { protectedProcedure, router } from "../../index"; // not uuid. Older host-service binaries call the now-removed `device.*` // procedures and fail at registration.
// 0.2.0: `workspaceCreation.adopt` accepts optional `worktreePath`. -const HOST_SERVICE_VERSION = "0.4.0"; +// 0.5.0: pty-daemon supervision moved into host-service. New +// `terminal.daemon` tRPC namespace; existing 0.4.x host-services +// don't expose it, so the desktop coordinator must refuse to adopt +// them on upgrade and respawn with the new bundle. Adopting in +// place would leave the new desktop talking to old code with no +// `terminal.daemon.*` routes, breaking Settings → Manage daemon. +const HOST_SERVICE_VERSION = "0.5.0"; const ORGANIZATION_CACHE_TTL_MS = 60 * 60 * 1000; let cachedOrganization: { From 668afeb75c62b84a9eebbe98ec2f249ec8ae3833 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Thu, 30 Apr 2026 23:38:54 -0700 Subject: [PATCH 32/33] docs(host-service): update daemon-supervision reference for shipped behavior Documents the additions from the manual-QA debugging session: - Adopted-daemon liveness polling (separate code path from spawned child's on-exit handler) - SIGHUP default for close (interactive shells trap SIGTERM) - Session deletion on PTY exit + the niche regression accepted - Dev-mode stdio piping with per-line prefix - Version-bump procedure (HOST_SERVICE_VERSION + MIN_HOST_SERVICE_VERSION in lockstep when adoption-floor matters) - Phase 2 (daemon-upgrade fd-handoff) explicitly noted as deferred, with the design hooks already in place that future work will use Existing sections (boot pattern, version detection, crash circuit, tests) updated to point at the new test files added in this PR. --- packages/host-service/DAEMON_SUPERVISION.md | 108 +++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/packages/host-service/DAEMON_SUPERVISION.md b/packages/host-service/DAEMON_SUPERVISION.md index 83deb497261..f23164bde6a 100644 --- a/packages/host-service/DAEMON_SUPERVISION.md +++ b/packages/host-service/DAEMON_SUPERVISION.md @@ -43,6 +43,20 @@ The socket path lives in `os.tmpdir()/superset-ptyd-. 
— short enough to fit Darwin's 104-byte `sun_path` limit. Owner-only file mode (0600) is the auth boundary. +### Adopted-daemon liveness check + +`child.on("exit")` only fires for daemons we *spawned* — adopted +daemons (PIDs from a manifest) have no child handle. Without a +liveness check, the supervisor's `instances` map carries a stale +entry forever when an adopted daemon dies externally (kill -9, OOM): +`getSocketPath` returns a socket nobody's listening on, terminal ops +fail with ECONNREFUSED until something forces a restart. + +We poll `process.kill(pid, 0)` every 2s for adopted PIDs +(`ADOPTED_LIVENESS_INTERVAL_MS`). On detected death we clear the +instance + manifest so the next `ensure()` respawns. Spawned daemons +keep using the cheaper `child.on("exit")` path. + ## Version detection On adoption, `probeDaemonVersion` does a one-shot `hello`/`hello-ack` to @@ -71,6 +85,49 @@ clears the crash circuit, logs `pty_daemon_user_restart`, then `ensure`s fresh. Sessions die in the gap — that's the cost the user accepted via the confirmation dialog. +### Default close signal: SIGHUP, not SIGTERM + +The kill chain (`DaemonClient.close`, daemon `handleClose`, +`DaemonPty.kill`) defaults to **SIGHUP**, not SIGTERM. Interactive +shells — especially `zsh -l`, the default macOS login shell — trap +SIGTERM and stay alive. SIGTERM defaults silently leaked PTY processes +on every closed pane until the daemon was respawned. SIGHUP is what +the kernel sends when a real TTY closes, and shells honor it. + +Explicit `SIGKILL` still passes through for hung shells (e.g. the +"force kill" path). + +## Session deletion on PTY exit + +The daemon's `Server.onExit` handler deletes the session row from +the store immediately after fanning out the exit event. **Late +subscribers that connect after exit get ENOENT**, not the buffered +output and exit event. 
+ +Tradeoff: a host-service that restarts during the small window when +a shell is exiting will not be able to fetch the final output via +`subscribe(replay: true)` — the renderer falls back to a generic +"session unavailable" footer instead of "Process exited with code N". +Without this delete, every closed terminal pane left a row in the +store forever (every "Show sessions" entry would have been an Exited +zombie). + +## Dev-mode log piping + +In dev, both host-service (gated on `!app.isPackaged`) and +pty-daemon (gated on `NODE_ENV !== "production"`) stdio are **piped through to the parent process** with +per-line prefixes: + +- `[hs:<8-char-orgId>] ...` — host-service stdout in `bun dev` +- `[ptyd:<8-char-orgId>] ...` — daemon stdout, fanned through host-service + +Production stdio is backed by per-org rotating log files +(`$SUPERSET_HOME_DIR/host/{orgId}/{host-service,pty-daemon}.log`) +because the detached children must outlive parent teardown. + +The `pipeWithPrefix` helper splits incoming chunks on `\n` so +multi-line bursts keep the prefix on every line. + ## Telemetry The supervisor emits structured `console.log` lines with @@ -85,8 +142,21 @@ is needed. - `src/daemon/DaemonSupervisor.test.ts` — probe edge cases, debounce semantics, restart race-await + circuit clear. +- `src/daemon/DaemonSupervisor.node-test.ts` — real-spawn integration: + fresh spawn, cross-instance adoption, version drift via env override, + user-restart kills + respawns, auto-respawn after SIGKILL, **adopted + daemon dies externally → supervisor detects and respawns**. +- `src/daemon/singleton.test.ts` — fire-and-track bootstrap, idempotent + startDaemonBootstrap, retryable failure path. +- `src/trpc/router/terminal/terminal.daemon.test.ts` — tRPC procedure + wiring (UNAUTHORIZED gating, getUpdateStatus delegation, listSessions + awaits bootstrap, restart wiring).
+- `src/no-electron-coupling.test.ts` — asserts host-service source has + zero Electron imports/globals/APIs (substitute for a real headless + smoke test until native-addon distribution is solved). - Daemon wire protocol coverage lives in `packages/pty-daemon/test/` - (handshake, adoption, SIGKILL recovery). + (handshake, adoption, SIGKILL recovery, **default-close terminates + an interactive login shell** — SIGHUP regression test). ## Test escape hatch @@ -108,3 +178,39 @@ Bumping the daemon version: edit `EXPECTED_DAEMON_VERSION` in `expected-version.ts` to match the new `packages/pty-daemon/package.json#version`. The supervisor's adoption probe will surface the "update available" flag on existing installs until they restart. + +When a host-service change requires the desktop coordinator to +refuse to adopt older binaries, bump `HOST_SERVICE_VERSION` +in `src/trpc/router/host/host.ts` and `MIN_HOST_SERVICE_VERSION` in +`apps/desktop/src/main/lib/host-service-coordinator.ts` together. +The coordinator's `tryAdopt` does a `semver.satisfies(>=)` check and +SIGTERMs+respawns anything older. + +## Phase 2 deferred — daemon upgrades currently kill sessions + +The original Architecture E plan called for **daemon-upgrade fd-handoff** +so even daemon-binary changes preserve PTYs. Phase 0 (the Go and +node-pty harnesses in the design-doc branch) proved the primitive +works. **Phase 2 is not built in this codebase yet.** + +Today: clicking "Restart and update" in Settings → Manage daemon +SIGTERMs the running daemon and spawns the new bundle. All sessions +die in the gap. The confirmation dialog tells the user this. + +When Phase 2 lands: the supervisor will spawn the new daemon with +existing PTY master FDs in its `stdio` array (kernel-level dup, +refcount preserved across the swap). New daemon adopts the FDs, +takes over the socket, old daemon exits without closing them. +Sessions survive the upgrade.
+ +Hooks already in place that Phase 2 will use: +- Adopted-liveness check (it'll detect the old daemon's exit at + the supervisor level if anything goes wrong mid-handoff). +- Manifest-based daemon discovery (the supervisor's current + `tryAdopt` is what Phase 2's "fall back if handoff fails" path + reuses). +- Existing wire protocol (we'd add an `upgrade` message; the + protocol is versioned). + +See `apps/desktop/plans/20260430-pty-daemon-host-service-migration.md` +in the design-doc branch for the migration journey and Phase 2 sketch. From f9a7c81245d113f7bf7ec2b5a4d2adf08ef8d529 Mon Sep 17 00:00:00 2001 From: Kiet Ho Date: Fri, 1 May 2026 12:37:47 -0700 Subject: [PATCH 33/33] chore(host-service): post-merge CI cleanup - bunfig + test/setup-env.ts: populate env vars before @t3-oss/env-core validates at module load, so integration tests that boot via createApp (without serve.ts) don't crash importing the validated env module. - Align apps/desktop semver caret to ^7.7.4 (Sherif: multiple-dependency- versions across the workspace). - Drop pre-existing unused MinimalCtx interface and replace candidates[0]! non-null assertion with explicit guard (Biome lint). - pty-daemon Server.pickProtocol: remove dead `?? (... ? null : null)` branch and the now-orphan CURRENT_PROTOCOL_VERSION import. 
--- apps/desktop/package.json | 2 +- bun.lock | 2 +- packages/host-service/bunfig.toml | 2 ++ packages/host-service/scripts/test-e2e.ts | 5 +++-- .../src/trpc/router/terminal/terminal.daemon.test.ts | 4 ---- packages/host-service/test/setup-env.ts | 11 +++++++++++ packages/pty-daemon/src/Server/Server.ts | 3 +-- 7 files changed, 19 insertions(+), 10 deletions(-) create mode 100644 packages/host-service/bunfig.toml create mode 100644 packages/host-service/test/setup-env.ts diff --git a/apps/desktop/package.json b/apps/desktop/package.json index ff28fd1c95a..ee1052a1530 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -214,7 +214,7 @@ "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "remark-gfm": "^4.0.1", - "semver": "^7.7.3", + "semver": "^7.7.4", "shell-env": "^4.0.3", "shell-quote": "^1.8.3", "shiki": "^3.21.0", diff --git a/bun.lock b/bun.lock index a96af834942..a1791bda156 100644 --- a/bun.lock +++ b/bun.lock @@ -292,7 +292,7 @@ "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "remark-gfm": "^4.0.1", - "semver": "^7.7.3", + "semver": "^7.7.4", "shell-env": "^4.0.3", "shell-quote": "^1.8.3", "shiki": "^3.21.0", diff --git a/packages/host-service/bunfig.toml b/packages/host-service/bunfig.toml new file mode 100644 index 00000000000..a22eda2e883 --- /dev/null +++ b/packages/host-service/bunfig.toml @@ -0,0 +1,2 @@ +[test] +preload = ["./test/setup-env.ts"] diff --git a/packages/host-service/scripts/test-e2e.ts b/packages/host-service/scripts/test-e2e.ts index 441af59da13..7275d0367ce 100644 --- a/packages/host-service/scripts/test-e2e.ts +++ b/packages/host-service/scripts/test-e2e.ts @@ -26,12 +26,13 @@ function findElectronBinary(): string { }) .split("\n") .filter(Boolean); - if (candidates.length === 0) { + const first = candidates[0]; + if (!first) { throw new Error( "Electron binary not found. 
Run `bun install` from the repo root first.", ); } - return path.join(repoRoot, candidates[0]!); + return path.join(repoRoot, first); } const electronBin = findElectronBinary(); diff --git a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts index 04e53c4436d..cd7ed80a95c 100644 --- a/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts +++ b/packages/host-service/src/trpc/router/terminal/terminal.daemon.test.ts @@ -24,10 +24,6 @@ process.env.SUPERSET_API_URL = "https://cloud.example.com"; const { appRouter } = await import("../router.ts"); -interface MinimalCtx { - isAuthenticated: boolean; -} - function makeCaller(authenticated = true) { // Cast to whatever; we only invoke procedures that don't touch db/git/etc. return appRouter.createCaller({ diff --git a/packages/host-service/test/setup-env.ts b/packages/host-service/test/setup-env.ts new file mode 100644 index 00000000000..2f0a33dea56 --- /dev/null +++ b/packages/host-service/test/setup-env.ts @@ -0,0 +1,11 @@ +// Populate the env vars `src/env.ts` validates at module load so test runtimes +// that boot host-service via `createApp` (instead of `serve.ts`) can import +// modules that transitively load the validated env. Real values come from +// each test's `createTestHost` config; these defaults exist purely to satisfy +// schema validation at import time. 
+ +process.env.ORGANIZATION_ID ??= "00000000-0000-4000-8000-000000000000"; +process.env.HOST_DB_PATH ??= "/tmp/host-service-test.db"; +process.env.HOST_MIGRATIONS_FOLDER ??= "/tmp/host-service-test-migrations"; +process.env.AUTH_TOKEN ??= "test-auth-token"; +process.env.SUPERSET_API_URL ??= "http://localhost:0"; diff --git a/packages/pty-daemon/src/Server/Server.ts b/packages/pty-daemon/src/Server/Server.ts index 5dbc619e17a..3332cbebdb9 100644 --- a/packages/pty-daemon/src/Server/Server.ts +++ b/packages/pty-daemon/src/Server/Server.ts @@ -13,7 +13,6 @@ import { } from "../handlers/index.ts"; import { type ClientMessage, - CURRENT_PROTOCOL_VERSION, encodeFrame, FrameDecoder, type HelloMessage, @@ -260,7 +259,7 @@ function pickProtocol(hello: HelloMessage): number | null { for (const v of hello.protocols) { if (supported.has(v) && (best === null || v > best)) best = v; } - return best ?? (supported.has(CURRENT_PROTOCOL_VERSION) ? null : null); + return best; } function writeMessage(socket: net.Socket, msg: ServerMessage): void {