Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion assistant/src/calls/call-controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ export class CallController {
private state: ControllerState = 'idle';
private abortController: AbortController = new AbortController();
private currentTurnHandle: VoiceTurnHandle | null = null;
private currentTurnPromise: Promise<void> | null = null;
private silenceTimer: ReturnType<typeof setTimeout> | null = null;
private durationTimer: ReturnType<typeof setTimeout> | null = null;
private durationWarningTimer: ReturnType<typeof setTimeout> | null = null;
Expand Down Expand Up @@ -154,6 +155,17 @@ export class CallController {
this.abortCurrentTurn();
}

// Wait for the aborted turn to finish unwinding before starting a new
// one, so RunOrchestrator's `isProcessing()` guard doesn't reject us.
if (interruptedInFlight && this.currentTurnPromise) {
const teardownPromise = this.currentTurnPromise;
this.currentTurnPromise = null;
await Promise.race([
teardownPromise.catch(() => {}),
Comment thread
noanflaherty marked this conversation as resolved.
new Promise<void>(resolve => setTimeout(resolve, 2000)),
]);
}
Comment thread
noanflaherty marked this conversation as resolved.

this.state = 'processing';
this.resetSilenceTimer();
const callerContent = this.formatCallerUtterance(transcript, speaker);
Expand Down Expand Up @@ -189,6 +201,16 @@ export class CallController {
this.consultationTimer = null;
}

// Defensive: await any lingering turn promise before starting a new one.
if (this.currentTurnPromise) {
const teardownPromise = this.currentTurnPromise;
this.currentTurnPromise = null;
await Promise.race([
teardownPromise.catch(() => {}),
new Promise<void>(resolve => setTimeout(resolve, 2000)),
]);
}

this.state = 'processing';
updateCallSession(this.callSessionId, { status: 'in_progress' });

Expand Down Expand Up @@ -265,6 +287,7 @@ export class CallController {
if (this.durationEndTimer) { clearTimeout(this.durationEndTimer); this.durationEndTimer = null; }
this.llmRunVersion++;
this.abortCurrentTurn();
this.currentTurnPromise = null;
unregisterCallController(this.callSessionId);
log.info({ callSessionId: this.callSessionId }, 'CallController destroyed');
}
Expand Down Expand Up @@ -296,7 +319,13 @@ export class CallController {
* Execute a single voice turn through the session pipeline and stream
* the response back through the relay.
*/
private async runTurn(content: string): Promise<void> {
private runTurn(content: string): Promise<void> {
  // Start the turn and remember its promise in `currentTurnPromise` so other
  // code paths can wait for this turn to settle; the same promise is handed
  // back to the caller.
  return (this.currentTurnPromise = this.runTurnInner(content));
}

private async runTurnInner(content: string): Promise<void> {
const runVersion = ++this.llmRunVersion;
const runSignal = this.abortController.signal;

Expand Down Expand Up @@ -385,6 +414,8 @@ export class CallController {
content,
assistantId: this.assistantId,
guardianContext: this.guardianContext ?? undefined,
isInbound: this.isInbound,
task: this.task,
onTextDelta,
onComplete,
onError,
Expand Down
90 changes: 90 additions & 0 deletions assistant/src/calls/voice-session-bridge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import type { RunOrchestrator, VoiceRunEventSink } from '../runtime/run-orchestrator.js';
import type { GuardianRuntimeContext } from '../daemon/session-runtime-assembly.js';
import { getConfig } from '../config/loader.js';
import { getLogger } from '../util/logger.js';

const log = getLogger('voice-session-bridge');
Expand Down Expand Up @@ -43,6 +44,10 @@ export interface VoiceTurnOptions {
assistantId?: string;
/** Guardian trust context for the caller. */
guardianContext?: GuardianRuntimeContext;
/** Whether this is an inbound call (no outbound task). */
isInbound: boolean;
/** The outbound call task, if any. */
task?: string | null;
/** Called for each streaming text token from the agent loop. */
onTextDelta: (text: string) => void;
/** Called when the agent loop completes a full response. */
Expand All @@ -60,6 +65,83 @@ export interface VoiceTurnHandle {
abort: () => void;
}

// ---------------------------------------------------------------------------
// Call-control protocol prompt builder
// ---------------------------------------------------------------------------

/**
 * Build the call-control protocol prompt injected into each voice turn.
 *
 * The result is a `<voice_call_control>` block containing the numbered
 * marker-protocol rules the model needs in order to emit and react to
 * control markers during voice calls. It intentionally omits the "You are
 * on a live phone call" framing (the session system prompt already
 * provides assistant identity) and guardian context (injected separately).
 *
 * Rule 1 comes from the `calls.disclosure` config: the configured
 * disclosure text when enabled, otherwise a generic "begin naturally" rule.
 *
 * @param opts.isInbound - Selects the inbound ("caller") or outbound
 *   ("callee") wording of the direction-specific rules.
 * @param opts.task - Outbound task description; emitted as a `Task:` line
 *   only for outbound calls with a non-empty task.
 * @returns The prompt lines joined with `\n`.
 */
function buildVoiceCallControlPrompt(opts: {
  isInbound: boolean;
  task?: string | null;
}): string {
  const config = getConfig();
  const disclosureRule = config.calls.disclosure.enabled
    ? `1. ${config.calls.disclosure.text}`
    : '1. Begin the conversation naturally.';

  // On an inbound call the person on the line is the caller (the assistant
  // is the callee); on an outbound call the person is the callee. Rules that
  // name the other party must use the matching term.
  const otherParty = opts.isInbound ? 'caller' : 'callee';

  const lines: string[] = ['<voice_call_control>'];

  // The task header only makes sense when we placed the call ourselves.
  if (!opts.isInbound && opts.task) {
    lines.push(`Task: ${opts.task}`);
    lines.push('');
  }

  lines.push(
    'CALL PROTOCOL RULES:',
    '0. When introducing yourself, refer to yourself as an assistant. Avoid the phrase "AI assistant" unless directly asked.',
    disclosureRule,
    '2. Be concise — phone conversations should be brief and natural.',
  );

  // Rules 3-6: guardian consultation, answers/instructions, and call ending.
  if (opts.isInbound) {
    lines.push(
      '3. If the caller asks something you don\'t know or need to verify, include [ASK_GUARDIAN: your question here] in your response along with a hold message like "Let me check on that for you."',
      '4. If information is provided preceded by [USER_ANSWERED: ...], use that answer naturally in the conversation.',
      '5. If you see [USER_INSTRUCTION: ...], treat it as a high-priority steering directive from your user. Follow the instruction immediately, adjusting your approach or response accordingly.',
      '6. When the caller indicates they are done or the conversation reaches a natural conclusion, include [END_CALL] in your response along with a polite goodbye.',
    );
  } else {
    lines.push(
      '3. If the callee asks something you don\'t know, include [ASK_GUARDIAN: your question here] in your response along with a hold message like "Let me check on that for you."',
      '4. If the callee provides information preceded by [USER_ANSWERED: ...], use that answer naturally in the conversation.',
      '5. If you see [USER_INSTRUCTION: ...], treat it as a high-priority steering directive from your user. Follow the instruction immediately, adjusting your approach or response accordingly.',
      '6. When the call\'s purpose is fulfilled, include [END_CALL] in your response along with a polite goodbye.',
    );
  }

  lines.push(
    '7. Do not make up information — ask the user if unsure.',
    '8. Keep responses short — 1-3 sentences is ideal for phone conversation.',
    '9. When caller text includes [SPEAKER id="..." label="..."], treat each speaker as a distinct person and personalize responses using that speaker\'s prior context in this call.',
  );

  // Rules 10-11: how to handle the call-opening markers.
  if (opts.isInbound) {
    lines.push(
      '10. If the latest user turn is [CALL_OPENING], greet the caller warmly and ask how you can help. Vary the wording; do not use a fixed template.',
      '11. If the latest user turn includes [CALL_OPENING_ACK], treat it as the caller acknowledging your greeting and continue the conversation naturally.',
    );
  } else {
    lines.push(
      '10. If the latest user turn is [CALL_OPENING], generate a natural, context-specific opener: briefly introduce yourself once as an assistant, state why you are calling using the Task context, and ask a short permission/check-in question. Vary the wording; do not use a fixed template.',
      '11. If the latest user turn includes [CALL_OPENING_ACK], treat it as the callee acknowledging your opener and continue the conversation naturally without re-introducing yourself or repeating the initial check-in question.',
    );
  }

  lines.push(
    `12. Do not repeat your introduction within the same call unless the ${otherParty} explicitly asks who you are.`,
    '</voice_call_control>',
  );

  return lines.join('\n');
}

// ---------------------------------------------------------------------------
// startVoiceTurn
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -98,6 +180,13 @@ export async function startVoiceTurn(opts: VoiceTurnOptions): Promise<VoiceTurnH
? true
: undefined;

// Build the call-control protocol prompt so the model knows how to emit
// control markers (ASK_GUARDIAN, END_CALL, CALL_OPENING, etc.).
const voiceCallControlPrompt = buildVoiceCallControlPrompt({
isInbound: opts.isInbound,
task: opts.task,
});

const { run, abort } = await orchestrator.startRun(
opts.conversationId,
opts.content,
Expand All @@ -112,6 +201,7 @@ export async function startVoiceTurn(opts: VoiceTurnOptions): Promise<VoiceTurnH
assistantMessageChannel: 'voice',
},
eventSink,
voiceCallControlPrompt,
},
);

Expand Down
4 changes: 4 additions & 0 deletions assistant/src/daemon/session-agent-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export interface AgentLoopSessionContext {
channelCapabilities?: ChannelCapabilities;
commandIntent?: { type: string; payload?: string; languageCode?: string };
guardianContext?: GuardianRuntimeContext;
voiceCallControlPrompt?: string;

readonly coreToolNames: Set<string>;
allowedToolNames?: Set<string>;
Expand Down Expand Up @@ -320,6 +321,7 @@ export async function runAgentLoopImpl(
channelTurnContext,
guardianContext: ctx.guardianContext ?? null,
temporalContext,
voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null,
});

// Pre-run repair
Expand Down Expand Up @@ -430,6 +432,7 @@ export async function runAgentLoopImpl(
channelTurnContext,
guardianContext: ctx.guardianContext ?? null,
temporalContext,
voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null,
});
preRepairMessages = runMessages;
preRunHistoryLength = runMessages.length;
Expand Down Expand Up @@ -465,6 +468,7 @@ export async function runAgentLoopImpl(
channelTurnContext,
guardianContext: ctx.guardianContext ?? null,
temporalContext,
voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null,
});
preRepairMessages = runMessages;
preRunHistoryLength = runMessages.length;
Expand Down
32 changes: 32 additions & 0 deletions assistant/src/daemon/session-runtime-assembly.ts
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,26 @@ export function injectActiveSurfaceContext(message: Message, ctx: ActiveSurfaceC
};
}

/**
 * Return a copy of `message` whose content ends with an extra text block
 * holding the voice call-control protocol `prompt`, so the model knows how
 * to emit control markers during voice turns routed through the session
 * pipeline. The input message is not mutated.
 */
export function injectVoiceCallControlContext(message: Message, prompt: string): Message {
  const controlBlock = { type: 'text' as const, text: prompt };
  const content = [...message.content, controlBlock];
  return { ...message, content };
}

/**
 * Remove `<voice_call_control>` text blocks (as added by
 * `injectVoiceCallControlContext`) by delegating to
 * `stripUserTextBlocksByPrefix` with that single prefix.
 */
export function stripVoiceCallControlContext(messages: Message[]): Message[] {
  const injectedPrefixes = ['<voice_call_control>'];
  return stripUserTextBlocksByPrefix(messages, injectedPrefixes);
}

/**
* Prepend channel capability context to the last user message so the
* model knows what the current channel can and cannot do.
Expand Down Expand Up @@ -514,6 +534,7 @@ const RUNTIME_INJECTION_PREFIXES = [
'<channel_command_context>',
'<channel_turn_context>',
'<guardian_context>',
'<voice_call_control>',
'<workspace_top_level>',
TEMPORAL_INJECTED_PREFIX,
'<active_workspace>',
Expand Down Expand Up @@ -558,10 +579,21 @@ export function applyRuntimeInjections(
channelTurnContext?: ChannelTurnContextParams | null;
guardianContext?: GuardianRuntimeContext | null;
temporalContext?: string | null;
voiceCallControlPrompt?: string | null;
},
): Message[] {
let result = runMessages;

if (options.voiceCallControlPrompt) {
const userTail = result[result.length - 1];
if (userTail && userTail.role === 'user') {
result = [
...result.slice(0, -1),
injectVoiceCallControlContext(userTail, options.voiceCallControlPrompt),
];
}
}

if (options.softConflictInstruction) {
const userTail = result[result.length - 1];
if (userTail && userTail.role === 'user') {
Expand Down
5 changes: 5 additions & 0 deletions assistant/src/daemon/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ export class Session {
/** @internal */ currentPage?: string;
/** @internal */ channelCapabilities?: ChannelCapabilities;
/** @internal */ guardianContext?: GuardianRuntimeContext;
/** @internal */ voiceCallControlPrompt?: string;
/** @internal */ assistantId?: string;
/** @internal */ commandIntent?: { type: string; payload?: string; languageCode?: string };
/** @internal */ pendingSurfaceActions = new Map<string, { surfaceType: SurfaceType }>();
Expand Down Expand Up @@ -336,6 +337,10 @@ export class Session {
this.guardianContext = ctx ?? undefined;
}

// Store the voice call-control prompt on the session; `null` clears it by
// resetting the field to `undefined`.
setVoiceCallControlPrompt(prompt: string | null): void {
  const nextPrompt = prompt === null ? undefined : prompt;
  this.voiceCallControlPrompt = nextPrompt;
}

setAssistantId(assistantId: string | null): void {
this.assistantId = assistantId ?? undefined;
}
Expand Down
7 changes: 7 additions & 0 deletions assistant/src/runtime/run-orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ export interface RunStartOptions {
* stall for the full permission timeout (300s by default).
*/
voiceAutoDenyConfirmations?: boolean;
/**
* Call-control protocol prompt injected into each voice turn so the
* model knows to emit control markers ([ASK_GUARDIAN:], [END_CALL], etc.).
*/
voiceCallControlPrompt?: string;
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -215,6 +220,7 @@ export class RunOrchestrator {
// (e.g. attachment scope) match the actual transport rather than always
// defaulting to 'macos'.
session.setChannelCapabilities(resolveChannelCapabilities(options?.sourceChannel ?? 'macos'));
session.setVoiceCallControlPrompt(options?.voiceCallControlPrompt ?? null);

// Serialized publish chain so hub subscribers observe events in order.
let hubChain: Promise<void> = Promise.resolve();
Expand Down Expand Up @@ -312,6 +318,7 @@ export class RunOrchestrator {
session.setGuardianContext(null);
session.setCommandIntent(null);
session.setAssistantId('self');
session.setVoiceCallControlPrompt(null);
// Reset the session's client callback to a no-op so the stale
// closure doesn't intercept events from future runs on the same session.
// Set hasNoClient=true here since the run is done and no HTTP caller
Expand Down