-
Notifications
You must be signed in to change notification settings - Fork 87
fix: add stop-acknowledgement timeout to prevent stale recording state #9024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,13 @@ import { log, findSocketForSession, defineHandlers, type HandlerContext } from ' | |
| import * as conversationStore from '../../memory/conversation-store.js'; | ||
| import { uploadFileBackedAttachment, linkAttachmentToMessage } from '../../memory/attachments-store.js'; | ||
|
|
||
| // ─── Constants ─────────────────────────────────────────────────────────────── | ||
|
|
||
| /** How long to wait (ms) for a client to acknowledge a recording_stop before | ||
| * automatically cleaning up stale map entries. Prevents a missing client ack | ||
| * from permanently blocking all future recordings. */ | ||
| const STOP_ACK_TIMEOUT_MS = 30_000; | ||
|
|
||
| // ─── Deterministic maps ────────────────────────────────────────────────────── | ||
| // These ensure stop resolves the exact active recording for a conversation, | ||
| // prevent ambiguous cross-thread stop behavior, and maintain conversation | ||
|
|
@@ -18,6 +25,9 @@ const standaloneRecordingConversationId = new Map<string, string>(); | |
| /** Maps conversationId -> recordingId (active recording). */ | ||
| const recordingOwnerByConversation = new Map<string, string>(); | ||
|
|
||
| /** Pending stop-acknowledgement timeouts keyed by recordingId. */ | ||
| const pendingStopTimeouts = new Map<string, NodeJS.Timeout>(); | ||
|
|
||
| // ─── Start ─────────────────────────────────────────────────────────────────── | ||
|
|
||
| /** | ||
|
|
@@ -110,12 +120,31 @@ export function handleRecordingStop( | |
| recordingId, | ||
| }); | ||
|
|
||
| // Start a timeout so that if the client never acknowledges the stop (e.g. | ||
| // client bug, app freeze), we automatically clean up the maps and unblock | ||
| // future recordings. | ||
| const timeoutHandle = setTimeout(() => { | ||
| pendingStopTimeouts.delete(recordingId); | ||
| log.warn({ recordingId, conversationId: ownerConversationId, timeoutMs: STOP_ACK_TIMEOUT_MS }, 'Stop-acknowledgement timeout fired — cleaning up stale recording state'); | ||
| cleanupMaps(recordingId, ownerConversationId); | ||
| }, STOP_ACK_TIMEOUT_MS); | ||
| pendingStopTimeouts.set(recordingId, timeoutHandle); | ||
|
|
||
| log.info({ recordingId, conversationId }, 'Standalone recording stop sent'); | ||
| return recordingId; | ||
| } | ||
|
|
||
| // ─── Internal helpers ───────────────────────────────────────────────────────── | ||
|
|
||
| /** Cancel a pending stop-acknowledgement timeout for a recording, if any. */ | ||
| function cancelStopTimeout(recordingId: string): void { | ||
| const handle = pendingStopTimeouts.get(recordingId); | ||
| if (handle) { | ||
| clearTimeout(handle); | ||
| pendingStopTimeouts.delete(recordingId); | ||
| } | ||
| } | ||
|
|
||
| /** Remove a recording from both deterministic maps. */ | ||
| function cleanupMaps(recordingId: string, conversationId: string | undefined): void { | ||
| standaloneRecordingConversationId.delete(recordingId); | ||
|
|
@@ -151,6 +180,9 @@ function handleRecordingStatus( | |
| return; | ||
| } | ||
|
|
||
| // The client acknowledged this recording — cancel any pending stop timeout. | ||
| cancelStopTimeout(recordingId); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
|
|
||
| // Use the reporting socket (which delivered this message) as the primary | ||
| // recipient. Fall back to session-based lookup if the user switched sessions. | ||
| const notifySocket = reportingSocket ?? findSocketForSession(conversationId, ctx); | ||
|
|
@@ -319,6 +351,7 @@ export function cleanupRecordingsOnDisconnect( | |
| // or if the owner conversation has no socket bound at all. | ||
| if (!ownerSocket || ownerSocket === disconnectedSocket) { | ||
| log.warn({ conversationId: convId, recordingId: recId }, 'Cleaning up recording state for disconnected socket'); | ||
| cancelStopTimeout(recId); | ||
| standaloneRecordingConversationId.delete(recId); | ||
| recordingOwnerByConversation.delete(convId); | ||
| } | ||
|
|
@@ -329,6 +362,10 @@ export function cleanupRecordingsOnDisconnect( | |
|
|
||
| /** Reset module-level state. Only for use in tests. */ | ||
| export function __resetRecordingState(): void { | ||
| for (const handle of pendingStopTimeouts.values()) { | ||
| clearTimeout(handle); | ||
| } | ||
| pendingStopTimeouts.clear(); | ||
| standaloneRecordingConversationId.clear(); | ||
| recordingOwnerByConversation.clear(); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 Stop-ack timeout cancelled by non-terminal
startedstatus, defeating the safety mechanismcancelStopTimeout(recordingId)at line 184 is called unconditionally for everyrecording_statusmessage, includingstatus: 'started'. Astartedstatus is the client acknowledging it began recording — not that it stopped. This creates a race condition that defeats the entire purpose of this PR.Race condition scenario
handleRecordingStart→ sendsrecording_startto clienthandleRecordingStop→ sendsrecording_stopto client → starts 30s timeout atrecording.ts:126-131startedack arrives (was in-flight or delayed) →handleRecordingStatusruns withstatus: 'started'→cancelStopTimeout(recordingId)at line 184 cancels the timeoutstopped→ timeout was already cancelled → maps are never cleaned up → future recordings permanently blockedThe fix should only cancel the timeout for terminal statuses (
stoppedorfailed), not forstarted. ThecancelStopTimeoutcall should be moved inside thecase 'stopped'andcase 'failed'branches of the switch statement, or guarded withif (msg.status !== 'started').Impact: The exact scenario this PR aims to prevent (stale recording state blocking future recordings) can still occur when a
startedack races with a stop request.Prompt for agents
Was this helpful? React with 👍 or 👎 to provide feedback.