 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import type { Session } from '@google/genai';
-import * as types from '@google/genai';
 import {
+  type FunctionDeclaration,
+  type FunctionResponse,
+  type GoogleGenAIOptions,
+  type LiveClientToolResponse,
+  type Session,
   ActivityHandling,
   type AudioTranscriptionConfig,
+  Behavior,
   type ContextWindowCompressionConfig,
+  FunctionResponseScheduling,
   GoogleGenAI,
   type HttpOptions,
-  Modality,
   type RealtimeInputConfig,
-} from '@google/genai';
+  type LiveServerMessage,
+  type LiveConnectConfig,
+  type LiveServerContent,
+  type LiveServerToolCall,
+  type LiveServerToolCallCancellation,
+  type UsageMetadata,
+  type ModalityTokenCount,
+  MediaModality,
+  type LiveServerGoAway,
+  Modality,
+  type Content,
+  type LiveClientRealtimeInput
+} from "@google/genai";
 import type { APIConnectOptions } from '@livekit/agents';
 import {
   APIConnectionError,
@@ -35,6 +51,8 @@ import { toFunctionDeclarations } from '../../utils.js';
 import type * as api_proto from './api_proto.js';
 import type { LiveAPIModels, Voice } from './api_proto.js';
 
+export { Behavior, FunctionResponseScheduling, Modality };
+
 // Input audio constants (matching Python)
 const INPUT_AUDIO_SAMPLE_RATE = 16000;
 const INPUT_AUDIO_CHANNELS = 1;
@@ -102,6 +120,8 @@ interface RealtimeOptions {
   contextWindowCompression?: ContextWindowCompressionConfig;
   apiVersion?: string;
   geminiTools?: LLMTools;
+  toolBehavior?: Behavior;
+  toolResponseScheduling?: FunctionResponseScheduling;
 }
 
 /**
@@ -273,6 +293,18 @@ export class RealtimeModel extends llm.RealtimeModel {
        * Gemini-specific tools to use for the session
        */
       geminiTools?: LLMTools;
+
+      /**
+       * Tool behavior for function calls (BLOCKING or NON_BLOCKING)
+       * Defaults to NON_BLOCKING to prevent generateReply timeouts
+       */
+      toolBehavior?: Behavior;
+
+      /**
+       * Function response scheduling (SILENT, WHEN_IDLE, or INTERRUPT)
+       * Defaults to WHEN_IDLE
+       */
+      toolResponseScheduling?: FunctionResponseScheduling;
     } = {},
   ) {
     const inputAudioTranscription =
@@ -329,6 +361,9 @@ export class RealtimeModel extends llm.RealtimeModel {
       contextWindowCompression: options.contextWindowCompression,
       apiVersion: options.apiVersion,
       geminiTools: options.geminiTools,
+      toolBehavior: options.toolBehavior ?? Behavior.NON_BLOCKING,
+      toolResponseScheduling:
+        options.toolResponseScheduling ?? FunctionResponseScheduling.WHEN_IDLE,
     };
   }
 
@@ -372,7 +407,7 @@ export class RealtimeSession extends llm.RealtimeSession {
   private _chatCtx = llm.ChatContext.empty();
 
   private options: RealtimeOptions;
-  private geminiDeclarations: types.FunctionDeclaration[] = [];
+  private geminiDeclarations: FunctionDeclaration[] = [];
   private messageChannel = new Queue<api_proto.ClientEvents>();
   private inputResampler?: AudioResampler;
   private inputResamplerInputRate?: number;
@@ -421,7 +456,7 @@ export class RealtimeSession extends llm.RealtimeSession {
       timeout: this.options.connOptions.timeoutMs,
     };
 
-    const clientOptions: types.GoogleGenAIOptions = vertexai
+    const clientOptions: GoogleGenAIOptions = vertexai
       ? {
           vertexai: true,
           project,
@@ -463,15 +498,18 @@ export class RealtimeSession extends llm.RealtimeSession {
   private getToolResultsForRealtime(
     ctx: llm.ChatContext,
     vertexai: boolean,
-  ): types.LiveClientToolResponse | undefined {
-    const toolResponses: types.FunctionResponse[] = [];
+  ): LiveClientToolResponse | undefined {
+    const toolResponses: FunctionResponse[] = [];
 
     for (const item of ctx.items) {
       if (item.type === 'function_call_output') {
-        const response: types.FunctionResponse = {
+        const response: FunctionResponse = {
           id: item.callId,
           name: item.name,
-          response: { output: item.output },
+          response: {
+            output: item.output,
+            scheduling: this.options.toolResponseScheduling,
+          },
         };
 
         if (!vertexai) {
@@ -552,7 +590,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     this.sendClientEvent({
       type: 'content',
       value: {
-        turns: turns as types.Content[],
+        turns: turns as Content[],
         turnComplete: false,
       },
     });
@@ -572,7 +610,7 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   async updateTools(tools: llm.ToolContext): Promise<void> {
-    const newDeclarations = toFunctionDeclarations(tools);
+    const newDeclarations = toFunctionDeclarations(tools, this.options.toolBehavior);
     const currentToolNames = new Set(this.geminiDeclarations.map((f) => f.name));
     const newToolNames = new Set(newDeclarations.map((f) => f.name));
 
@@ -601,7 +639,7 @@ export class RealtimeSession extends llm.RealtimeSession {
 
     for (const f of this.resampleAudio(frame)) {
       for (const nf of this.bstream.write(f.data.buffer)) {
-        const realtimeInput: types.LiveClientRealtimeInput = {
+        const realtimeInput: LiveClientRealtimeInput = {
           mediaChunks: [
             {
               mimeType: 'audio/pcm',
@@ -648,7 +686,7 @@ export class RealtimeSession extends llm.RealtimeSession {
 
     // Gemini requires the last message to end with user's turn
     // so we need to add a placeholder user turn in order to trigger a new generation
-    const turns: types.Content[] = [];
+    const turns: Content[] = [];
     if (instructions !== undefined) {
       turns.push({
         parts: [{ text: instructions }],
@@ -752,7 +790,7 @@ export class RealtimeSession extends llm.RealtimeSession {
       model: this.options.model,
       callbacks: {
         onopen: () => sessionOpened.set(),
-        onmessage: (message: types.LiveServerMessage) => {
+        onmessage: (message: LiveServerMessage) => {
           this.onReceiveMessage(session, message);
         },
         onerror: (error: ErrorEvent) => {
@@ -846,7 +884,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     }
   }
 
-  private async sendTask(session: types.Session, controller: AbortController): Promise<void> {
+  private async sendTask(session: Session, controller: AbortController): Promise<void> {
     try {
       while (!this.#closed && !this.sessionShouldClose.isSet && !controller.signal.aborted) {
         const msg = await this.messageChannel.get();
@@ -912,8 +950,8 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private async onReceiveMessage(
-    session: types.Session,
-    response: types.LiveServerMessage,
+    session: Session,
+    response: LiveServerMessage,
   ): Promise<void> {
     // Skip logging verbose audio data events
     const hasAudioData = response.serverContent?.modelTurn?.parts?.some(
@@ -1006,7 +1044,7 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private loggableServerMessage(
-    message: types.LiveServerMessage,
+    message: LiveServerMessage,
     maxLength: number = 30,
   ): Record<string, unknown> {
     const obj: any = { ...message };
@@ -1090,10 +1128,10 @@ export class RealtimeSession extends llm.RealtimeSession {
     });
   }
 
-  private buildConnectConfig(): types.LiveConnectConfig {
+  private buildConnectConfig(): LiveConnectConfig {
     const opts = this.options;
 
-    const config: types.LiveConnectConfig = {
+    const config: LiveConnectConfig = {
       responseModalities: opts.responseModalities,
       systemInstruction: opts.instructions
         ? {
@@ -1214,7 +1252,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     } as llm.InputSpeechStoppedEvent);
   }
 
-  private handleServerContent(serverContent: types.LiveServerContent): void {
+  private handleServerContent(serverContent: LiveServerContent): void {
     if (!this.currentGeneration) {
       this.#logger.warn('received server content but no active generation.');
       return;
@@ -1298,7 +1336,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     }
   }
 
-  private handleToolCall(toolCall: types.LiveServerToolCall): void {
+  private handleToolCall(toolCall: LiveServerToolCall): void {
     if (!this.currentGeneration) {
       this.#logger.warn('received tool call but no active generation.');
       return;
@@ -1317,7 +1355,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     this.markCurrentGenerationDone();
   }
 
-  private handleToolCallCancellation(cancellation: types.LiveServerToolCallCancellation): void {
+  private handleToolCallCancellation(cancellation: LiveServerToolCallCancellation): void {
     this.#logger.warn(
       {
         functionCallIds: cancellation.ids,
@@ -1326,7 +1364,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     );
   }
 
-  private handleUsageMetadata(usage: types.UsageMetadata): void {
+  private handleUsageMetadata(usage: UsageMetadata): void {
     if (!this.currentGeneration) {
       this.#logger.debug('Received usage metadata but no active generation');
       return;
@@ -1371,7 +1409,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     this.emit('metrics_collected', realtimeMetrics);
   }
 
-  private tokenDetailsMap(tokenDetails: types.ModalityTokenCount[] | undefined): {
+  private tokenDetailsMap(tokenDetails: ModalityTokenCount[] | undefined): {
     audioTokens: number;
     textTokens: number;
     imageTokens: number;
@@ -1386,18 +1424,18 @@ export class RealtimeSession extends llm.RealtimeSession {
         continue;
       }
 
-      if (tokenDetail.modality === types.MediaModality.AUDIO) {
+      if (tokenDetail.modality === MediaModality.AUDIO) {
         tokenDetailsMap.audioTokens += tokenDetail.tokenCount;
-      } else if (tokenDetail.modality === types.MediaModality.TEXT) {
+      } else if (tokenDetail.modality === MediaModality.TEXT) {
         tokenDetailsMap.textTokens += tokenDetail.tokenCount;
-      } else if (tokenDetail.modality === types.MediaModality.IMAGE) {
+      } else if (tokenDetail.modality === MediaModality.IMAGE) {
         tokenDetailsMap.imageTokens += tokenDetail.tokenCount;
       }
     }
     return tokenDetailsMap;
   }
 
-  private handleGoAway(goAway: types.LiveServerGoAway): void {
+  private handleGoAway(goAway: LiveServerGoAway): void {
     this.#logger.warn({ timeLeft: goAway.timeLeft }, 'Gemini server indicates disconnection soon.');
     // TODO(brian): this isn't a seamless reconnection just yet
     this.sessionShouldClose.set();
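
For reference, a minimal usage sketch of the two options introduced in this diff. The import path and the `beta.realtime` namespace are assumptions about how the plugin package is consumed (adjust to the package's actual entry point); the option names, the exported `Behavior` and `FunctionResponseScheduling` enums, and the defaults (`NON_BLOCKING`, `WHEN_IDLE`) come from the diff itself.

```ts
// Usage sketch (assumed import path and namespace; option names and defaults are taken from the diff above).
import * as google from '@livekit/agents-plugin-google';

const model = new google.beta.realtime.RealtimeModel({
  // Declare tools as NON_BLOCKING so a slow tool call does not stall reply generation.
  toolBehavior: google.beta.realtime.Behavior.NON_BLOCKING,
  // Deliver tool results only when the model is idle instead of interrupting its output.
  toolResponseScheduling: google.beta.realtime.FunctionResponseScheduling.WHEN_IDLE,
});
```

With these settings, `updateTools` passes the behavior into `toFunctionDeclarations`, and `getToolResultsForRealtime` attaches the scheduling value to each `FunctionResponse` sent back to the Live API.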