From 960f39753a57ccf5d5c9c9d4fb4e9f000edb5cc0 Mon Sep 17 00:00:00 2001 From: siddseethepalli Date: Thu, 16 Apr 2026 05:06:07 +0000 Subject: [PATCH] fix(meet): pre-register bot token, capture logs + screenshots on join failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related changes to make meet-join failures diagnosable: 1. Pre-register the per-meeting bot API token in `pendingBotTokens` before the container spawns. Previously the token only became resolvable after `audioIngestPromise` resolved, so the bot's earliest `lifecycle:joining` POST got 401, tripped its terminal-error handler, and shut the bot down before it could click "Ask to join" — leaving the user with a confusing "bot did not connect to audio.sock" timeout and no admit prompt. 2. Add `DockerRunner.logs()` (with multiplex demux) and capture the bot container's stdout/stderr to `/bot.log` before every rollback path removes the container. Previously a failed join left no trace once `runner.remove()` fired — debugging required catching logs mid-flight. 3. On `joinMeet` selector timeouts (prejoin name input, in-meeting UI), screenshot the page to `/out/{prejoin,admission}-failure.png` and include the final URL in the thrown error. Critical when Google Meet serves an unexpected surface (sign-in redirect, cookie banner, headless-detection wall, updated selectors). Regression tests: token resolver answers during the pre-session window; rollback clears `pendingBotTokens` on spawn failure; `demultiplexDockerLogs` handles mixed streams, empty buffers, and truncated tails. All 261 meet-join tests pass. --- skills/meet-join/bot/src/browser/join-flow.ts | 53 ++++++++++- .../daemon/__tests__/chat-send-e2e.test.ts | 1 + .../daemon/__tests__/docker-runner.test.ts | 38 ++++++++ .../daemon/__tests__/e2e-smoke.test.ts | 1 + .../__tests__/proactive-chat-e2e.test.ts | 1 + .../daemon/__tests__/session-manager.test.ts | 92 +++++++++++++++++++ skills/meet-join/daemon/docker-runner.ts | 89 ++++++++++++++++++ skills/meet-join/daemon/session-manager.ts | 69 +++++++++++++- 8 files changed, 338 insertions(+), 6 deletions(-) diff --git a/skills/meet-join/bot/src/browser/join-flow.ts b/skills/meet-join/bot/src/browser/join-flow.ts index 034fd8b18a2..53462d27919 100644 --- a/skills/meet-join/bot/src/browser/join-flow.ts +++ b/skills/meet-join/bot/src/browser/join-flow.ts @@ -48,6 +48,47 @@ export interface JoinMeetOptions { consentMessage: string; } +/** + * Directory the bot can write diagnostic artifacts to. Matches the + * session-manager's `/out` mount, which is bound back to the host at + * `/meets//out/` — so anything we drop here is + * visible to the operator even after the container is torn down. + */ +const DIAGNOSTICS_DIR = "/out"; + +/** + * Best-effort: snapshot the current page to `/out/.png` so an + * operator can see exactly what Google Meet was showing when a selector + * timed out. Never re-throws — diagnostics must not mask the real join + * failure that triggered the capture. + */ +async function captureFailureSnapshot( + page: Page, + name: string, +): Promise { + const snapPath = `${DIAGNOSTICS_DIR}/${name}.png`; + try { + await page.screenshot({ path: snapPath, fullPage: true }); + return snapPath; + } catch { + return null; + } +} + +/** + * Best-effort: capture the current page URL so a 301/redirect to a + * sign-in wall (or a completely different Meet surface) is obvious from + * the error message. Returns `null` silently if the page has already + * been closed. + */ +async function safePageUrl(page: Page): Promise { + try { + return page.url(); + } catch { + return null; + } +} + /** * Drive the Google Meet prejoin surface to completion and deliver the consent * notice. @@ -70,8 +111,12 @@ export async function joinMeet( }); } catch (err) { const msg = err instanceof Error ? err.message : String(err); + const url = await safePageUrl(page); + const snap = await captureFailureSnapshot(page, "prejoin-failure"); throw new Error( - `meet-bot: prejoin name input did not appear within ${PREJOIN_TIMEOUT_MS}ms: ${msg}`, + `meet-bot: prejoin name input did not appear within ${PREJOIN_TIMEOUT_MS}ms: ${msg}` + + (url ? ` (final URL: ${url})` : "") + + (snap ? ` (screenshot: ${snap})` : ""), ); } @@ -99,8 +144,12 @@ export async function joinMeet( }); } catch (err) { const msg = err instanceof Error ? err.message : String(err); + const url = await safePageUrl(page); + const snap = await captureFailureSnapshot(page, "admission-failure"); throw new Error( - `meet-bot: in-meeting UI did not appear within ${MEETING_ROOM_TIMEOUT_MS}ms (host may not have admitted the bot): ${msg}`, + `meet-bot: in-meeting UI did not appear within ${MEETING_ROOM_TIMEOUT_MS}ms (host may not have admitted the bot): ${msg}` + + (url ? ` (final URL: ${url})` : "") + + (snap ? ` (screenshot: ${snap})` : ""), ); } diff --git a/skills/meet-join/daemon/__tests__/chat-send-e2e.test.ts b/skills/meet-join/daemon/__tests__/chat-send-e2e.test.ts index f1f5ac89434..43d3034d2d1 100644 --- a/skills/meet-join/daemon/__tests__/chat-send-e2e.test.ts +++ b/skills/meet-join/daemon/__tests__/chat-send-e2e.test.ts @@ -195,6 +195,7 @@ function makeMockRunnerPointingAt(fakeBot: FakeBotServer) { stop: mock(async () => {}), remove: mock(async () => {}), inspect: mock(async () => ({ Id: runResult.containerId })), + logs: mock(async () => ""), }; } diff --git a/skills/meet-join/daemon/__tests__/docker-runner.test.ts b/skills/meet-join/daemon/__tests__/docker-runner.test.ts index 331e2c6c2f9..355680c4fdb 100644 --- a/skills/meet-join/daemon/__tests__/docker-runner.test.ts +++ b/skills/meet-join/daemon/__tests__/docker-runner.test.ts @@ -14,6 +14,7 @@ import { import { buildCreateBody, + demultiplexDockerLogs, DockerApiError, DockerRunner, dockerSocketUnreachableMessage, @@ -450,6 +451,43 @@ describe("extractBoundPorts", () => { // Mode-aware workspace mounts + host-gateway flag (Phase 1.10 — DinD) // --------------------------------------------------------------------------- +describe("demultiplexDockerLogs", () => { + // Build a framed chunk matching Docker's multiplexed logs framing: + // [streamType(1)][0,0,0][size(uint32 BE, 4)][payload] + // streamType: 1 = stdout, 2 = stderr. + function frame(stream: 1 | 2, payload: string): Buffer { + const data = Buffer.from(payload, "utf8"); + const header = Buffer.alloc(8); + header.writeUInt8(stream, 0); + header.writeUInt32BE(data.length, 4); + return Buffer.concat([header, data]); + } + + test("concatenates stdout and stderr frames in order", () => { + const buf = Buffer.concat([ + frame(1, "step 1\n"), + frame(2, "warn from stderr\n"), + frame(1, "step 2\n"), + ]); + expect(demultiplexDockerLogs(buf)).toBe( + "step 1\nwarn from stderr\nstep 2\n", + ); + }); + + test("returns empty string for an empty buffer", () => { + expect(demultiplexDockerLogs(Buffer.alloc(0))).toBe(""); + }); + + test("drops a truncated trailing frame instead of throwing", () => { + const complete = frame(1, "ok\n"); + // Truncate the second frame mid-payload. + const truncated = frame(1, "will not appear").subarray(0, 10); + expect(demultiplexDockerLogs(Buffer.concat([complete, truncated]))).toBe( + "ok\n", + ); + }); +}); + describe("DockerRunner workspace-mount mode branching", () => { let mock: MockDocker; diff --git a/skills/meet-join/daemon/__tests__/e2e-smoke.test.ts b/skills/meet-join/daemon/__tests__/e2e-smoke.test.ts index 934a8600645..f8cb692e1a9 100644 --- a/skills/meet-join/daemon/__tests__/e2e-smoke.test.ts +++ b/skills/meet-join/daemon/__tests__/e2e-smoke.test.ts @@ -164,6 +164,7 @@ function makeMockRunner() { stop: mock(async () => {}), remove: mock(async () => {}), inspect: mock(async () => ({ Id: "container-e2e-1" })), + logs: mock(async () => ""), }; } diff --git a/skills/meet-join/daemon/__tests__/proactive-chat-e2e.test.ts b/skills/meet-join/daemon/__tests__/proactive-chat-e2e.test.ts index d34aa0fc751..0ffeb8ba3a5 100644 --- a/skills/meet-join/daemon/__tests__/proactive-chat-e2e.test.ts +++ b/skills/meet-join/daemon/__tests__/proactive-chat-e2e.test.ts @@ -178,6 +178,7 @@ function makeMockRunnerPointingAt(fakeBot: FakeBotServer) { stop: mock(async () => {}), remove: mock(async () => {}), inspect: mock(async () => ({ Id: runResult.containerId })), + logs: mock(async () => ""), }; } diff --git a/skills/meet-join/daemon/__tests__/session-manager.test.ts b/skills/meet-join/daemon/__tests__/session-manager.test.ts index 88b526e7be6..535d6c67709 100644 --- a/skills/meet-join/daemon/__tests__/session-manager.test.ts +++ b/skills/meet-join/daemon/__tests__/session-manager.test.ts @@ -46,6 +46,7 @@ interface MockRunner { stop: ReturnType; remove: ReturnType; inspect: ReturnType; + logs: ReturnType; } function makeMockRunner( @@ -82,6 +83,7 @@ function makeMockRunner( stop: mock(async () => {}), remove: mock(async () => {}), inspect: mock(async () => ({ Id: runResult.containerId })), + logs: mock(async () => ""), }; } @@ -280,6 +282,96 @@ describe("MeetSessionManager.join", () => { await manager.leave("m2", "cleanup"); }); + test("token resolver is populated during the container-spawn / audio-ingest window (regression: #26005-ish)", async () => { + // Before the fix, the bot API token only became resolvable once the + // `ActiveSession` record landed in `this.sessions`, which happens + // AFTER `audioIngestPromise` resolves. The bot's `DaemonClient` + // starts POSTing `lifecycle:joining` events well before that, so + // every early event got a 401, tripped the bot's terminal-error + // handler, and the bot shut itself down before it ever reached the + // audio-socket connect or the "Ask to join" click. This test pins + // the resolver in place from the moment the container starts. + // + // We stall the audio ingest's `start()` so the resolver is checked + // during the exact window the bot's HTTP traffic hits. + let resolveIngestStart: () => void = () => {}; + const ingestStartPromise = new Promise((r) => { + resolveIngestStart = r; + }); + const factory = (): MeetAudioIngestLike => ({ + start: mock(async () => { + await ingestStartPromise; + }), + stop: mock(async () => {}), + subscribePcm: mock(() => () => {}), + }); + + const manager = _createMeetSessionManagerForTests({ + dockerRunnerFactory: () => makeMockRunner(), + getProviderKey: async () => "k", + getWorkspaceDir: () => workspaceDir, + botLeaveFetch: async () => {}, + audioIngestFactory: factory, + }); + + const joinPromise = manager.join({ + url: "u", + meetingId: "m-pending", + conversationId: "c", + }); + + // Yield so `join()` gets past token generation + runner.run(). + // At this point `this.sessions` does NOT yet contain the session + // (audio-ingest is stalled), but the resolver must still return the + // token the bot is presenting on `Authorization: Bearer …`. + await new Promise((resolve) => setTimeout(resolve, 0)); + const pendingToken = getMeetSessionEventRouter().resolveBotApiToken( + "m-pending", + ); + expect(pendingToken).toMatch(/^[0-9a-f]{64}$/); + + // Let the ingest finish; the session now lands in `this.sessions` + // and the resolver keeps returning the same token. + resolveIngestStart(); + const session = await joinPromise; + // Cast away the `| null` from the resolver's return type — we + // already asserted non-null above, but `toMatch` doesn't narrow. + expect(session.botApiToken).toBe(pendingToken as string); + expect(getMeetSessionEventRouter().resolveBotApiToken("m-pending")).toBe( + pendingToken, + ); + + await manager.leave("m-pending", "cleanup"); + expect( + getMeetSessionEventRouter().resolveBotApiToken("m-pending"), + ).toBeNull(); + }); + + test("token resolver is cleared when container spawn fails (no pending-token leak)", async () => { + // If `runner.run()` throws, the rollback path must drop the + // pre-registered pending token so a later retry with a fresh token + // doesn't see a stale match. + const runner = makeMockRunner({ runError: new Error("spawn boom") }); + const manager = _createMeetSessionManagerForTests({ + dockerRunnerFactory: () => runner, + getProviderKey: async () => "k", + getWorkspaceDir: () => workspaceDir, + audioIngestFactory: makeFakeAudioIngestFactory().factory, + }); + + await expect( + manager.join({ + url: "u", + meetingId: "m-spawn-fail", + conversationId: "c", + }), + ).rejects.toThrow(/spawn boom/); + + expect( + getMeetSessionEventRouter().resolveBotApiToken("m-spawn-fail"), + ).toBeNull(); + }); + test("rejects a second join for the same meeting id", async () => { const audioIngestFactory = makeFakeAudioIngestFactory(); const manager = _createMeetSessionManagerForTests({ diff --git a/skills/meet-join/daemon/docker-runner.ts b/skills/meet-join/daemon/docker-runner.ts index e644b785dab..bc37de0b5c2 100644 --- a/skills/meet-join/daemon/docker-runner.ts +++ b/skills/meet-join/daemon/docker-runner.ts @@ -321,6 +321,74 @@ function requestRaw( }); } +/** + * Like {@link requestRaw} but returns the raw response bytes instead of a + * UTF-8 string. Used by the container-logs fetcher, which has to look at + * byte-level framing (Docker's multiplexed `{type, size, payload}` wrap). + */ +function requestRawBuffer( + socketPath: string, + method: string, + path: string, +): Promise { + return new Promise((resolve, reject) => { + const req = httpRequest( + { + socketPath, + method, + path, + headers: { Host: UNIX_SOCKET_HOST, Accept: "*/*" }, + }, + (res) => { + const chunks: Buffer[] = []; + res.on("data", (chunk: Buffer) => chunks.push(chunk)); + res.on("end", () => { + const buf = Buffer.concat(chunks); + const status = res.statusCode ?? 0; + if (status < 200 || status >= 300) { + reject( + new DockerApiError(method, path, status, buf.toString("utf8")), + ); + return; + } + resolve(buf); + }); + }, + ); + req.on("error", (err) => reject(err)); + req.end(); + }); +} + +/** + * Strip Docker's 8-byte multiplexed framing from a logs response body so + * the result reads like the container's combined stdout/stderr would on + * the terminal. Frame format: + * + * ``` + * [0] stream type (0=stdin, 1=stdout, 2=stderr) + * [1..3] zero padding + * [4..7] payload size (big-endian uint32) + * [8..] payload bytes + * ``` + * + * Any malformed tail (truncated mid-frame) is silently dropped — this is + * a diagnostic helper, not a reliable log pipeline. + */ +export function demultiplexDockerLogs(buf: Buffer): string { + const parts: string[] = []; + let offset = 0; + while (offset + 8 <= buf.length) { + const size = buf.readUInt32BE(offset + 4); + const start = offset + 8; + const end = start + size; + if (end > buf.length) break; + parts.push(buf.subarray(start, end).toString("utf8")); + offset = end; + } + return parts.join(""); +} + export class DockerRunner { readonly socketPath: string; private readonly resolveMode: () => DaemonRuntimeMode; @@ -431,6 +499,27 @@ export class DockerRunner { ); } + /** + * Fetch the container's accumulated stdout/stderr as a single string. + * + * Wraps `GET /containers//logs?stdout=1&stderr=1`. The API emits a + * multiplexed framing (8-byte header, then payload) when the container + * was not started with a TTY — we always spawn without TTY, so the + * stream needs demultiplexing before it's human-readable. This is a + * best-effort diagnostic hook called from the rollback path; any + * Docker-side error is wrapped as {@link DockerApiError} so callers can + * swallow it without losing the original join failure. + */ + async logs( + containerId: string, + opts: { tailLines?: number } = {}, + ): Promise { + const tail = opts.tailLines ?? "all"; + const path = `/${DOCKER_API_VERSION}/containers/${containerId}/logs?stdout=1&stderr=1&tail=${tail}`; + const raw = await requestRawBuffer(this.socketPath, "GET", path); + return demultiplexDockerLogs(raw); + } + // ------------------------------------------------------------------------- // Internals // ------------------------------------------------------------------------- diff --git a/skills/meet-join/daemon/session-manager.ts b/skills/meet-join/daemon/session-manager.ts index 44bd2cd8b9a..09635d057e1 100644 --- a/skills/meet-join/daemon/session-manager.ts +++ b/skills/meet-join/daemon/session-manager.ts @@ -53,7 +53,7 @@ */ import { randomBytes } from "node:crypto"; -import { mkdirSync } from "node:fs"; +import { mkdirSync, writeFileSync } from "node:fs"; import { join } from "node:path"; import { getConfig } from "../../../assistant/src/config/loader.js"; @@ -438,7 +438,7 @@ export interface MeetSessionManagerDeps { /** Factory for the Docker runner — swapped in tests. */ dockerRunnerFactory?: () => Pick< DockerRunner, - "run" | "stop" | "remove" | "inspect" + "run" | "stop" | "remove" | "inspect" | "logs" >; /** Override the function that fetches credentials. */ getProviderKey?: (provider: string) => Promise; @@ -549,6 +549,20 @@ export interface MeetSessionManagerDeps { class MeetSessionManagerImpl { private sessions = new Map(); + /** + * Bot API tokens for sessions whose container has been spawned but whose + * full {@link ActiveSession} record has not yet been inserted into + * {@link sessions} (that insertion only happens after the audio-ingest + * handshake completes). The meet-internal events route needs the token + * resolver to answer the moment the bot's {@link DaemonClient} starts + * POSTing `lifecycle:joining` — which happens long before the session + * lands in `sessions`, so we register the token here as soon as we mint + * it and delete once the session is in `sessions` (or the join rolls + * back). Without this, early bot events get 401s, the bot's terminal- + * error handler trips, and the bot shuts down before it ever reaches + * the audio-socket connect or the meet "Ask to join" click. + */ + private pendingBotTokens = new Map(); private deps: Required; constructor(deps: MeetSessionManagerDeps = {}) { @@ -593,10 +607,13 @@ class MeetSessionManagerImpl { // The ingress route (PR 9) looks up per-meeting tokens through this // resolver. Install it once at construction time — it reads live state - // from `this.sessions`, so it stays correct as sessions come and go. + // from `this.sessions` (and {@link pendingBotTokens} during the + // container-spawn / audio-ingest window, before the session lands in + // `sessions`), so it stays correct as sessions come and go. getMeetSessionEventRouter().setBotApiTokenResolver((meetingId) => { const session = this.sessions.get(meetingId); - return session ? session.botApiToken : null; + if (session) return session.botApiToken; + return this.pendingBotTokens.get(meetingId) ?? null; }); } @@ -680,6 +697,13 @@ class MeetSessionManagerImpl { mkdirSync(outDir, { recursive: true }); const botApiToken = generateBotApiToken(); + // Pre-register the token so `/v1/internal/meet/:id/events` can + // authenticate the bot's earliest `lifecycle:joining` POST — which + // fires before the `ActiveSession` record lands in `this.sessions` + // (that happens only after the audio-ingest handshake completes). + // Cleared on every join-rollback path below and replaced by the + // authoritative `this.sessions` lookup once the session is in the map. + this.pendingBotTokens.set(meetingId, botApiToken); // Placeholder — Phase 3 (PR 23+) will resolve the real TTS credential. const ttsKey = (await this.deps.getProviderKey("tts")) ?? ""; @@ -795,6 +819,7 @@ class MeetSessionManagerImpl { audioIngestPromise.catch(() => {}); await audioIngest.stop().catch(() => {}); unregisterMeetingDispatcher(meetingId); + this.pendingBotTokens.delete(meetingId); void publishMeetEvent( DAEMON_INTERNAL_ASSISTANT_ID, meetingId, @@ -810,10 +835,12 @@ class MeetSessionManagerImpl { if (!boundPort) { // Roll back the container so we don't leak a started-but-unreachable // bot. Best-effort — surface the original error either way. + await captureBotLogs(runner, runResult.containerId, meetingDir); await runner.remove(runResult.containerId).catch(() => {}); audioIngestPromise.catch(() => {}); await audioIngest.stop().catch(() => {}); unregisterMeetingDispatcher(meetingId); + this.pendingBotTokens.delete(meetingId); const detail = `meet-bot container ${runResult.containerId} did not publish a host port for ${MEET_BOT_INTERNAL_PORT}/tcp`; void publishMeetEvent( DAEMON_INTERNAL_ASSISTANT_ID, @@ -841,9 +868,11 @@ class MeetSessionManagerImpl { "Meet audio ingest failed to start — rolling back container", ); await runner.stop(runResult.containerId).catch(() => {}); + await captureBotLogs(runner, runResult.containerId, meetingDir); await runner.remove(runResult.containerId).catch(() => {}); await audioIngest.stop().catch(() => {}); unregisterMeetingDispatcher(meetingId); + this.pendingBotTokens.delete(meetingId); void publishMeetEvent( DAEMON_INTERNAL_ASSISTANT_ID, meetingId, @@ -960,6 +989,9 @@ class MeetSessionManagerImpl { bargeInWatcher, }; this.sessions.set(meetingId, session); + // `this.sessions` is now the authoritative source for the resolver; + // the pre-registered pending entry is no longer needed. + this.pendingBotTokens.delete(meetingId); // Fan `participant.change` / `speaker.change` / final transcript chunks // out as `meet.*` events on the assistant event hub. @@ -1782,6 +1814,35 @@ export function substituteAssistantName( } /** Strip internal fields (`timeoutHandle`) from a session before exposing it. */ +/** + * Best-effort: pull the bot container's accumulated stdout/stderr and + * persist it to `/bot.log` before the container is removed. + * Called from every join-rollback path that has a containerId so a + * post-mortem exists even after `runner.remove()` deletes the container. + * Any Docker-side failure (container already gone, socket timeout, etc.) + * is swallowed — log capture must never mask the original join error. + */ +async function captureBotLogs( + runner: { logs: (id: string) => Promise }, + containerId: string, + meetingDir: string, +): Promise { + try { + const body = await runner.logs(containerId); + const dest = join(meetingDir, "bot.log"); + writeFileSync(dest, body); + log.info( + { containerId, dest, bytes: body.length }, + "Captured bot container logs before rollback", + ); + } catch (err) { + log.warn( + { err, containerId, meetingDir }, + "Failed to capture bot container logs (continuing rollback)", + ); + } +} + function sessionView(session: ActiveSession): MeetSession { return { meetingId: session.meetingId,