diff --git a/assistant/src/calls/call-controller.ts b/assistant/src/calls/call-controller.ts index 760a60ff004..fe687c2c3fd 100644 --- a/assistant/src/calls/call-controller.ts +++ b/assistant/src/calls/call-controller.ts @@ -58,6 +58,7 @@ export class CallController { private state: ControllerState = 'idle'; private abortController: AbortController = new AbortController(); private currentTurnHandle: VoiceTurnHandle | null = null; + private currentTurnPromise: Promise | null = null; private silenceTimer: ReturnType | null = null; private durationTimer: ReturnType | null = null; private durationWarningTimer: ReturnType | null = null; @@ -152,6 +153,17 @@ export class CallController { // If we're already processing or speaking, abort the in-flight generation if (interruptedInFlight) { this.abortCurrentTurn(); + this.llmRunVersion++; // Invalidate stale turn before awaiting teardown + } + + // Always await any lingering turn promise, even if handleInterrupt() already ran + if (this.currentTurnPromise) { + const teardownPromise = this.currentTurnPromise; + this.currentTurnPromise = null; + await Promise.race([ + teardownPromise.catch(() => {}), + new Promise(resolve => setTimeout(resolve, 2000)), + ]); } this.state = 'processing'; @@ -189,6 +201,16 @@ export class CallController { this.consultationTimer = null; } + // Defensive: await any lingering turn promise before starting a new one. + if (this.currentTurnPromise) { + const teardownPromise = this.currentTurnPromise; + this.currentTurnPromise = null; + await Promise.race([ + teardownPromise.catch(() => {}), + new Promise(resolve => setTimeout(resolve, 2000)), + ]); + } + this.state = 'processing'; updateCallSession(this.callSessionId, { status: 'in_progress' }); @@ -265,6 +287,7 @@ export class CallController { if (this.durationEndTimer) { clearTimeout(this.durationEndTimer); this.durationEndTimer = null; } this.llmRunVersion++; this.abortCurrentTurn(); + this.currentTurnPromise = null; unregisterCallController(this.callSessionId); log.info({ callSessionId: this.callSessionId }, 'CallController destroyed'); } @@ -296,7 +319,13 @@ export class CallController { * Execute a single voice turn through the session pipeline and stream * the response back through the relay. */ - private async runTurn(content: string): Promise { + private runTurn(content: string): Promise { + const promise = this.runTurnInner(content); + this.currentTurnPromise = promise; + return promise; + } + + private async runTurnInner(content: string): Promise { const runVersion = ++this.llmRunVersion; const runSignal = this.abortController.signal; @@ -385,6 +414,8 @@ export class CallController { content, assistantId: this.assistantId, guardianContext: this.guardianContext ?? undefined, + isInbound: this.isInbound, + task: this.task, onTextDelta, onComplete, onError, diff --git a/assistant/src/calls/voice-session-bridge.ts b/assistant/src/calls/voice-session-bridge.ts index dbbbaf6d14e..c66f5c5382b 100644 --- a/assistant/src/calls/voice-session-bridge.ts +++ b/assistant/src/calls/voice-session-bridge.ts @@ -12,6 +12,7 @@ import type { RunOrchestrator, VoiceRunEventSink } from '../runtime/run-orchestrator.js'; import type { GuardianRuntimeContext } from '../daemon/session-runtime-assembly.js'; +import { getConfig } from '../config/loader.js'; import { getLogger } from '../util/logger.js'; const log = getLogger('voice-session-bridge'); @@ -43,6 +44,10 @@ export interface VoiceTurnOptions { assistantId?: string; /** Guardian trust context for the caller. */ guardianContext?: GuardianRuntimeContext; + /** Whether this is an inbound call (no outbound task). */ + isInbound: boolean; + /** The outbound call task, if any. */ + task?: string | null; /** Called for each streaming text token from the agent loop. */ onTextDelta: (text: string) => void; /** Called when the agent loop completes a full response. */ @@ -60,6 +65,83 @@ export interface VoiceTurnHandle { abort: () => void; } +// --------------------------------------------------------------------------- +// Call-control protocol prompt builder +// --------------------------------------------------------------------------- + +/** + * Build the call-control protocol prompt injected into each voice turn. + * + * This contains the marker protocol rules that the model needs to emit + * control markers during voice calls. It intentionally omits the "You are + * on a live phone call" framing (the session system prompt already + * provides assistant identity) and guardian context (injected separately). + */ +function buildVoiceCallControlPrompt(opts: { + isInbound: boolean; + task?: string | null; +}): string { + const config = getConfig(); + const disclosureRule = config.calls.disclosure.enabled + ? `1. ${config.calls.disclosure.text}` + : '1. Begin the conversation naturally.'; + + const lines: string[] = ['']; + + if (!opts.isInbound && opts.task) { + lines.push(`Task: ${opts.task}`); + lines.push(''); + } + + lines.push( + 'CALL PROTOCOL RULES:', + '0. When introducing yourself, refer to yourself as an assistant. Avoid the phrase "AI assistant" unless directly asked.', + disclosureRule, + '2. Be concise — phone conversations should be brief and natural.', + ); + + if (opts.isInbound) { + lines.push( + '3. If the caller asks something you don\'t know or need to verify, include [ASK_GUARDIAN: your question here] in your response along with a hold message like "Let me check on that for you."', + '4. If information is provided preceded by [USER_ANSWERED: ...], use that answer naturally in the conversation.', + '5. If you see [USER_INSTRUCTION: ...], treat it as a high-priority steering directive from your user. Follow the instruction immediately, adjusting your approach or response accordingly.', + '6. When the caller indicates they are done or the conversation reaches a natural conclusion, include [END_CALL] in your response along with a polite goodbye.', + ); + } else { + lines.push( + '3. If the callee asks something you don\'t know, include [ASK_GUARDIAN: your question here] in your response along with a hold message like "Let me check on that for you."', + '4. If the callee provides information preceded by [USER_ANSWERED: ...], use that answer naturally in the conversation.', + '5. If you see [USER_INSTRUCTION: ...], treat it as a high-priority steering directive from your user. Follow the instruction immediately, adjusting your approach or response accordingly.', + '6. When the call\'s purpose is fulfilled, include [END_CALL] in your response along with a polite goodbye.', + ); + } + + lines.push( + '7. Do not make up information — ask the user if unsure.', + '8. Keep responses short — 1-3 sentences is ideal for phone conversation.', + '9. When caller text includes [SPEAKER id="..." label="..."], treat each speaker as a distinct person and personalize responses using that speaker\'s prior context in this call.', + ); + + if (opts.isInbound) { + lines.push( + '10. If the latest user turn is [CALL_OPENING], greet the caller warmly and ask how you can help. Vary the wording; do not use a fixed template.', + '11. If the latest user turn includes [CALL_OPENING_ACK], treat it as the caller acknowledging your greeting and continue the conversation naturally.', + ); + } else { + lines.push( + '10. If the latest user turn is [CALL_OPENING], generate a natural, context-specific opener: briefly introduce yourself once as an assistant, state why you are calling using the Task context, and ask a short permission/check-in question. Vary the wording; do not use a fixed template.', + '11. If the latest user turn includes [CALL_OPENING_ACK], treat it as the callee acknowledging your opener and continue the conversation naturally without re-introducing yourself or repeating the initial check-in question.', + ); + } + + lines.push( + '12. Do not repeat your introduction within the same call unless the callee explicitly asks who you are.', + '', + ); + + return lines.join('\n'); +} + // --------------------------------------------------------------------------- // startVoiceTurn // --------------------------------------------------------------------------- @@ -98,6 +180,13 @@ export async function startVoiceTurn(opts: VoiceTurnOptions): Promise; allowedToolNames?: Set; @@ -320,6 +321,7 @@ export async function runAgentLoopImpl( channelTurnContext, guardianContext: ctx.guardianContext ?? null, temporalContext, + voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null, }); // Pre-run repair @@ -430,6 +432,7 @@ export async function runAgentLoopImpl( channelTurnContext, guardianContext: ctx.guardianContext ?? null, temporalContext, + voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null, }); preRepairMessages = runMessages; preRunHistoryLength = runMessages.length; @@ -465,6 +468,7 @@ export async function runAgentLoopImpl( channelTurnContext, guardianContext: ctx.guardianContext ?? null, temporalContext, + voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null, }); preRepairMessages = runMessages; preRunHistoryLength = runMessages.length; diff --git a/assistant/src/daemon/session-runtime-assembly.ts b/assistant/src/daemon/session-runtime-assembly.ts index 70dc1f4ec81..274cf556627 100644 --- a/assistant/src/daemon/session-runtime-assembly.ts +++ b/assistant/src/daemon/session-runtime-assembly.ts @@ -261,6 +261,26 @@ export function injectActiveSurfaceContext(message: Message, ctx: ActiveSurfaceC }; } +/** + * Append voice call-control protocol instructions to the last user + * message so the model knows how to emit control markers during voice + * turns routed through the session pipeline. + */ +export function injectVoiceCallControlContext(message: Message, prompt: string): Message { + return { + ...message, + content: [ + ...message.content, + { type: 'text', text: prompt }, + ], + }; +} + +/** Strip `` blocks injected by `injectVoiceCallControlContext`. */ +export function stripVoiceCallControlContext(messages: Message[]): Message[] { + return stripUserTextBlocksByPrefix(messages, ['']); +} + /** * Prepend channel capability context to the last user message so the * model knows what the current channel can and cannot do. @@ -514,6 +534,7 @@ const RUNTIME_INJECTION_PREFIXES = [ '', '', '', + '', '', TEMPORAL_INJECTED_PREFIX, '', @@ -558,10 +579,21 @@ export function applyRuntimeInjections( channelTurnContext?: ChannelTurnContextParams | null; guardianContext?: GuardianRuntimeContext | null; temporalContext?: string | null; + voiceCallControlPrompt?: string | null; }, ): Message[] { let result = runMessages; + if (options.voiceCallControlPrompt) { + const userTail = result[result.length - 1]; + if (userTail && userTail.role === 'user') { + result = [ + ...result.slice(0, -1), + injectVoiceCallControlContext(userTail, options.voiceCallControlPrompt), + ]; + } + } + if (options.softConflictInstruction) { const userTail = result[result.length - 1]; if (userTail && userTail.role === 'user') { diff --git a/assistant/src/daemon/session.ts b/assistant/src/daemon/session.ts index 3a9b503db6f..ab76dbbc1d8 100644 --- a/assistant/src/daemon/session.ts +++ b/assistant/src/daemon/session.ts @@ -129,6 +129,7 @@ export class Session { /** @internal */ currentPage?: string; /** @internal */ channelCapabilities?: ChannelCapabilities; /** @internal */ guardianContext?: GuardianRuntimeContext; + /** @internal */ voiceCallControlPrompt?: string; /** @internal */ assistantId?: string; /** @internal */ commandIntent?: { type: string; payload?: string; languageCode?: string }; /** @internal */ pendingSurfaceActions = new Map(); @@ -336,6 +337,10 @@ export class Session { this.guardianContext = ctx ?? undefined; } + setVoiceCallControlPrompt(prompt: string | null): void { + this.voiceCallControlPrompt = prompt ?? undefined; + } + setAssistantId(assistantId: string | null): void { this.assistantId = assistantId ?? undefined; } diff --git a/assistant/src/runtime/run-orchestrator.ts b/assistant/src/runtime/run-orchestrator.ts index 04b5ef6d47f..fa834237ab5 100644 --- a/assistant/src/runtime/run-orchestrator.ts +++ b/assistant/src/runtime/run-orchestrator.ts @@ -128,6 +128,11 @@ export interface RunStartOptions { * stall for the full permission timeout (300s by default). */ voiceAutoDenyConfirmations?: boolean; + /** + * Call-control protocol prompt injected into each voice turn so the + * model knows to emit control markers ([ASK_GUARDIAN:], [END_CALL], etc.). + */ + voiceCallControlPrompt?: string; } // --------------------------------------------------------------------------- @@ -215,6 +220,7 @@ export class RunOrchestrator { // (e.g. attachment scope) match the actual transport rather than always // defaulting to 'macos'. session.setChannelCapabilities(resolveChannelCapabilities(options?.sourceChannel ?? 'macos')); + session.setVoiceCallControlPrompt(options?.voiceCallControlPrompt ?? null); // Serialized publish chain so hub subscribers observe events in order. let hubChain: Promise = Promise.resolve(); @@ -312,6 +318,7 @@ export class RunOrchestrator { session.setGuardianContext(null); session.setCommandIntent(null); session.setAssistantId('self'); + session.setVoiceCallControlPrompt(null); // Reset the session's client callback to a no-op so the stale // closure doesn't intercept events from future runs on the same session. // Set hasNoClient=true here since the run is done and no HTTP caller