From 1e6f05b1f647c80225bca171b89adc0fa6ff3611 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 28 Nov 2025 15:01:41 -0800 Subject: [PATCH 1/3] added inworld TTS plugin and example --- examples/package.json | 1 + examples/src/inworld_tts.ts | 112 +++++ plugins/inworld/api-extractor.json | 21 + plugins/inworld/package.json | 52 +++ plugins/inworld/src/index.ts | 20 + plugins/inworld/src/tts.ts | 655 +++++++++++++++++++++++++++++ plugins/inworld/tsconfig.json | 17 + plugins/inworld/tsup.config.ts | 8 + pnpm-lock.yaml | 64 ++- 9 files changed, 914 insertions(+), 36 deletions(-) create mode 100644 examples/src/inworld_tts.ts create mode 100644 plugins/inworld/api-extractor.json create mode 100644 plugins/inworld/package.json create mode 100644 plugins/inworld/src/index.ts create mode 100644 plugins/inworld/src/tts.ts create mode 100644 plugins/inworld/tsconfig.json create mode 100644 plugins/inworld/tsup.config.ts diff --git a/examples/package.json b/examples/package.json index edbf7796..1bcac74c 100644 --- a/examples/package.json +++ b/examples/package.json @@ -29,6 +29,7 @@ "@livekit/agents-plugin-deepgram": "workspace:*", "@livekit/agents-plugin-elevenlabs": "workspace:*", "@livekit/agents-plugin-google": "workspace:*", + "@livekit/agents-plugin-inworld": "workspace:*", "@livekit/agents-plugin-livekit": "workspace:*", "@livekit/agents-plugin-neuphonic": "workspace:*", "@livekit/agents-plugin-openai": "workspace:*", diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts new file mode 100644 index 00000000..faedf829 --- /dev/null +++ b/examples/src/inworld_tts.ts @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type JobContext, + type JobProcess, + WorkerOptions, + cli, + defineAgent, + metrics, + voice, + } from '@livekit/agents'; + import * as livekit from '@livekit/agents-plugin-livekit'; + import * as inworld from '@livekit/agents-plugin-inworld'; + import * as silero from '@livekit/agents-plugin-silero'; + import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; + import { fileURLToPath } from 'node:url'; + + export default defineAgent({ + prewarm: async (proc: JobProcess) => { + proc.userData.vad = await silero.VAD.load(); + }, + entry: async (ctx: JobContext) => { + const agent = new voice.Agent({ + instructions: + "You are a helpful assistant, you can hear the user's message and respond to it in 1-2 short sentences." + }); + + // Create TTS instance + const tts = new inworld.TTS({ + timestampType: 'WORD', + voice: 'Hades', + model: 'inworld-tts-1', + encoding: 'LINEAR16', + textNormalization: 'ON', + bitRate: 64000, + sampleRate: 24000, + speakingRate: 1.0, + temperature: 1.1, + bufferCharThreshold: 100, + maxBufferDelayMs: 3000 + }); + + // List available voices (non-blocking) + tts.listVoices().then((voices: inworld.Voice[]) => { + console.log(`[Inworld TTS] ${voices.length} voices available in this workspace`); + if (voices.length > 0) { + console.log('[Inworld TTS] Logging information for first voice:', JSON.stringify(voices[0], null, 2)); + } + }).catch((err: Error) => { + console.error('[Inworld TTS] Failed to list voices:', err); + }); + + const session = new voice.AgentSession({ + // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand + // See all available models at https://docs.livekit.io/agents/models/stt/ + stt: 'assemblyai/universal-streaming:en', + // A Large Language Model (LLM) is your agent's brain, processing user input and generating a response + // See all available models at https://docs.livekit.io/agents/models/llm/ + llm: 'openai/gpt-4.1-mini', + // Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear + // See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/ + tts, + // VAD and turn detection are used to determine when the user is speaking and when the agent should respond + // See more at https://docs.livekit.io/agents/build/turns + vad: ctx.proc.userData.vad! as silero.VAD, + turnDetection: new livekit.turnDetector.MultilingualModel(), + // to use realtime model, replace the stt, llm, tts and vad with the following + // llm: new openai.realtime.RealtimeModel(), + voiceOptions: { + // allow the LLM to generate a response while waiting for the end of turn + preemptiveGeneration: true, + }, + }); + + // timestamp handling for inworld TTS + session.tts!.on('alignment' as any, (data: any) => { + if (data.wordAlignment) { + const { words, starts, ends } = data.wordAlignment; + for (let i = 0; i < words.length; i++) { + console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); + } + } + if (data.characterAlignment) { + const { chars, starts, ends } = data.characterAlignment; + for (let i = 0; i < chars.length; i++) { + console.log(`[Inworld TTS] Char: "${chars[i]}", Start: ${starts[i]}, End: ${ends[i]}`); + } + } + }); + + const usageCollector = new metrics.UsageCollector(); + + session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { + metrics.logMetrics(ev.metrics); + usageCollector.collect(ev.metrics); + }); + + await session.start({ + agent, + room: ctx.room, + inputOptions: { + noiseCancellation: BackgroundVoiceCancellation(), + }, + }); + + session.say('Hello, how can I help you today?'); + }, + }); + + cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); + \ No newline at end of file diff --git a/plugins/inworld/api-extractor.json b/plugins/inworld/api-extractor.json new file mode 100644 index 00000000..488aaf03 --- /dev/null +++ b/plugins/inworld/api-extractor.json @@ -0,0 +1,21 @@ +/** + * Config file for API Extractor. For more info, please visit: https://api-extractor.com + */ +{ + "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json", + + /** + * Optionally specifies another JSON config file that this file extends from. This provides a way for + * standard settings to be shared across multiple projects. + * + * If the path starts with "./" or "../", the path is resolved relative to the folder of the file that contains + * the "extends" field. Otherwise, the first path segment is interpreted as an NPM package name, and will be + * resolved using NodeJS require(). + * + * SUPPORTED TOKENS: none + * DEFAULT VALUE: "" + */ + "extends": "../../api-extractor-shared.json", + "mainEntryPointFilePath": "./dist/index.d.ts" +} + diff --git a/plugins/inworld/package.json b/plugins/inworld/package.json new file mode 100644 index 00000000..f2c13066 --- /dev/null +++ b/plugins/inworld/package.json @@ -0,0 +1,52 @@ +{ + "name": "@livekit/agents-plugin-inworld", + "version": "0.1.0", + "description": "Inworld plugin for LiveKit Node Agents", + "main": "dist/index.js", + "require": "dist/index.cjs", + "types": "dist/index.d.ts", + "exports": { + "import": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + }, + "require": { + "types": "./dist/index.d.cts", + "default": "./dist/index.cjs" + } + }, + "author": "LiveKit", + "type": "module", + "repository": "git@github.com:livekit/agents-js.git", + "license": "Apache-2.0", + "files": [ + "dist", + "src", + "README.md" + ], + "scripts": { + "build": "tsup --onSuccess \"pnpm build:types\"", + "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js", + "clean": "rm -rf dist", + "clean:build": "pnpm clean && pnpm build", + "lint": "eslint -f unix \"src/**/*.{ts,js}\"", + "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript", + "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose" + }, + "devDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "^0.13.12", + "@microsoft/api-extractor": "^7.35.0", + "@types/ws": "^8.5.10", + "tsup": "^8.3.5", + "typescript": "^5.0.0" + }, + "dependencies": { + "ws": "^8.16.0" + }, + "peerDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "^0.13.12" + } +} + diff --git a/plugins/inworld/src/index.ts b/plugins/inworld/src/index.ts new file mode 100644 index 00000000..932950e2 --- /dev/null +++ b/plugins/inworld/src/index.ts @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Plugin } from '@livekit/agents'; +import { TTS } from './tts.js'; + +export * from './tts.js'; + +class InworldPlugin extends Plugin { + constructor() { + super({ + title: 'Inworld', + version: '0.1.0', + package: '@livekit/agents-plugin-inworld', + }); + } +} + +Plugin.registerPlugin(new InworldPlugin()); + diff --git a/plugins/inworld/src/tts.ts b/plugins/inworld/src/tts.ts new file mode 100644 index 00000000..ee0f7f78 --- /dev/null +++ b/plugins/inworld/src/tts.ts @@ -0,0 +1,655 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + AudioByteStream, + log, + shortuuid, + tts, + tokenize, +} from '@livekit/agents'; +import { type RawData, WebSocket } from 'ws'; + +const DEFAULT_BIT_RATE = 64000; +const DEFAULT_ENCODING = 'LINEAR16'; +const DEFAULT_MODEL = 'inworld-tts-1'; +const DEFAULT_SAMPLE_RATE = 24000; +const DEFAULT_URL = 'https://api.inworld.ai/'; +const DEFAULT_WS_URL = 'wss://api.inworld.ai/'; +const DEFAULT_VOICE = 'Ashley'; +const DEFAULT_TEMPERATURE = 1.1; +const DEFAULT_SPEAKING_RATE = 1.0; +const DEFAULT_BUFFER_CHAR_THRESHOLD = 100; +const DEFAULT_MAX_BUFFER_DELAY_MS = 3000; +const NUM_CHANNELS = 1; + +export type Encoding = 'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'ALAW' | 'MULAW' | 'FLAC' | string; +export type TimestampType = 'TIMESTAMP_TYPE_UNSPECIFIED' | 'WORD' | 'CHARACTER'; +export type TextNormalization = 'APPLY_TEXT_NORMALIZATION_UNSPECIFIED' | 'ON' | 'OFF'; + +export interface TTSOptions { + apiKey?: string; + voice: string; + model: string; + encoding: Encoding; + bitRate: number; + sampleRate: number; + speakingRate: number; + temperature: number; + timestampType?: TimestampType; + textNormalization?: TextNormalization; + bufferCharThreshold: number; + maxBufferDelayMs: number; + baseURL: string; + wsURL: string; + tokenizer?: tokenize.SentenceTokenizer; +} + +// API request/response types +interface AudioConfig { + audioEncoding: Encoding; + sampleRateHertz: number; + bitrate: number; + speakingRate: number; + temperature?: number; +} + +interface SynthesizeRequest { + text: string; + voiceId: string; + modelId: string; + audioConfig: AudioConfig; + temperature: number; + timestampType?: TimestampType; + applyTextNormalization?: TextNormalization; +} + +interface CreateContextConfig { + voiceId: string; + modelId: string; + audioConfig: AudioConfig; + temperature: number; + bufferCharThreshold: number; + maxBufferDelayMs: number; + timestampType?: TimestampType; + applyTextNormalization?: TextNormalization; +} + +interface WordAlignment { + words: string[]; + wordStartTimeSeconds: number[]; + wordEndTimeSeconds: number[]; +} + +interface CharacterAlignment { + characters: string[]; + characterStartTimeSeconds: number[]; + characterEndTimeSeconds: number[]; +} + +interface TimestampInfo { + wordAlignment?: WordAlignment; + characterAlignment?: CharacterAlignment; +} + +interface AudioChunk { + audioContent?: string; + timestampInfo?: TimestampInfo; +} + +interface InworldResult { + contextId?: string; + contextCreated?: boolean; + contextClosed?: boolean; + audioChunk?: AudioChunk; + audioContent?: string; + status?: { code: number; message: string }; +} + +interface InworldMessage { + result?: InworldResult; + contextId?: string; + error?: { message: string }; +} + +export interface Voice { + voiceId: string; + displayName: string; + description: string; + languages: string[]; + tags: string[]; +} + +interface ListVoicesResponse { + voices: Voice[]; +} + +const defaultTTSOptionsBase: Omit = { + apiKey: process.env.INWORLD_API_KEY, + voice: DEFAULT_VOICE, + model: DEFAULT_MODEL, + encoding: DEFAULT_ENCODING as Encoding, + bitRate: DEFAULT_BIT_RATE, + sampleRate: DEFAULT_SAMPLE_RATE, + speakingRate: DEFAULT_SPEAKING_RATE, + temperature: DEFAULT_TEMPERATURE, + bufferCharThreshold: DEFAULT_BUFFER_CHAR_THRESHOLD, + maxBufferDelayMs: DEFAULT_MAX_BUFFER_DELAY_MS, + baseURL: DEFAULT_URL, + wsURL: DEFAULT_WS_URL, +}; + +const MAX_RETRIES = 2; +const BASE_DELAY_MS = 1000; + +class WSConnectionPool { + #ws?: WebSocket; + #url: string; + #auth: string; + #connecting?: Promise; + #listeners: Map void> = new Map(); + #logger = log(); + + constructor(url: string, auth: string) { + this.#url = url; + this.#auth = auth; + } + + async getConnection(): Promise { + if (this.#ws && this.#ws.readyState === WebSocket.OPEN) { + return this.#ws; + } + + if (this.#connecting) { + return this.#connecting; + } + + this.#connecting = this.#connectWithRetry(); + return this.#connecting; + } + + async #connectWithRetry(): Promise { + let lastError: Error | undefined; + + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + try { + return await this.#attemptConnection(); + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + this.#connecting = undefined; + + if (attempt < MAX_RETRIES) { + // Exponential backoff: 1s, 2s + const delayMs = BASE_DELAY_MS * Math.pow(2, attempt); + this.#logger.warn( + { error: lastError, attempt: attempt + 1, maxRetries: MAX_RETRIES + 1, delayMs }, + `Failed to connect to Inworld, retrying in ${delayMs}ms`, + ); + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + } + } + + throw new Error( + `Failed to connect to Inworld after ${MAX_RETRIES + 1} attempts: ${lastError?.message}`, + ); + } + + #attemptConnection(): Promise { + return new Promise((resolve, reject) => { + const wsUrl = new URL('tts/v1/voice:streamBidirectional', this.#url); + // Ensure protocol is wss + if (wsUrl.protocol === 'https:') wsUrl.protocol = 'wss:'; + else if (wsUrl.protocol === 'http:') wsUrl.protocol = 'ws:'; + + const ws = new WebSocket(wsUrl.toString(), { + headers: { Authorization: this.#auth }, + }); + + ws.on('open', () => { + this.#ws = ws; + this.#connecting = undefined; + resolve(ws); + }); + + ws.on('error', (err) => { + if (this.#connecting) { + reject(err); + } else { + this.#logger.error({ err }, 'Inworld WebSocket error'); + } + }); + + ws.on('close', () => { + this.#ws = undefined; + this.#connecting = undefined; + }); + + ws.on('message', (data: RawData) => { + try { + const json = JSON.parse(data.toString()) as InworldMessage; + const result = json.result; + if (result) { + // Try to find contextId in result or top level + const contextId = result.contextId || json.contextId; + if (contextId && this.#listeners.has(contextId)) { + this.#listeners.get(contextId)!(json); + } + } else if (json.error) { + this.#logger.warn({ error: json.error }, 'Inworld received error message'); + } + } catch (e) { + this.#logger.warn({ error: e }, 'Failed to parse Inworld WebSocket message'); + } + }); + }); + } + + registerListener(contextId: string, cb: (msg: InworldMessage) => void) { + this.#listeners.set(contextId, cb); + } + + unregisterListener(contextId: string) { + this.#listeners.delete(contextId); + } + + close() { + if (this.#ws) { + this.#ws.close(); + this.#ws = undefined; + } + } +} + +export class TTS extends tts.TTS { + #opts: TTSOptions; + #pool: WSConnectionPool; + #authorization: string; + label = 'inworld.TTS'; + + constructor(opts: Partial = {}) { + const mergedOpts = { ...defaultTTSOptionsBase, ...opts }; + if (!mergedOpts.apiKey) { + throw new Error('Inworld API key required. Set INWORLD_API_KEY or provide apiKey.'); + } + + super(mergedOpts.sampleRate, NUM_CHANNELS, { + streaming: true, + }); + + this.#opts = mergedOpts as TTSOptions; + if (!this.#opts.tokenizer) { + this.#opts.tokenizer = new tokenize.basic.SentenceTokenizer(); + } + this.#authorization = `Basic ${mergedOpts.apiKey}`; + this.#pool = new WSConnectionPool(this.#opts.wsURL, this.#authorization); + } + + get pool(): WSConnectionPool { + return this.#pool; + } + + get authorization(): string { + return this.#authorization; + } + + /** + * List all available voices. + * @param language - Optional ISO 639-1 language code to filter voices (e.g., 'en', 'es', 'fr') + */ + async listVoices(language?: string): Promise { + const url = new URL('tts/v1/voices', this.#opts.baseURL); + if (language) { + url.searchParams.set('filter', `language=${language}`); + } + + const response = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: this.#authorization, + }, + }); + + if (!response.ok) { + const errorBody = await response.json().catch(() => ({})); + throw new Error( + `Inworld API error: ${response.status} ${response.statusText}${errorBody.message ? ` - ${errorBody.message}` : ''}`, + ); + } + + const data = (await response.json()) as ListVoicesResponse; + return data.voices; + } + + synthesize(text: string): tts.ChunkedStream { + return new ChunkedStream(this, text, this.#opts); + } + + stream(): tts.SynthesizeStream { + return new SynthesizeStream(this, this.#opts); + } + + updateOptions(opts: Partial) { + this.#opts = { ...this.#opts, ...opts }; + if (opts.apiKey) { + this.#authorization = `Basic ${opts.apiKey}`; + // If API key changes, we might need to reset the pool or create a new one? + // For now, assume WS url doesn't change or we just create new pool if needed. + // But existing pool has hardcoded auth in constructor. + // Re-creating pool is safer. + this.#pool.close(); + this.#pool = new WSConnectionPool(this.#opts.wsURL, this.#authorization); + } + } + + async close() { + this.#pool.close(); + } +} + +class ChunkedStream extends tts.ChunkedStream { + #opts: TTSOptions; + #tts: TTS; + label = 'inworld.ChunkedStream'; + + constructor(ttsInstance: TTS, text: string, opts: TTSOptions) { + super(text, ttsInstance); + this.#tts = ttsInstance; + this.#opts = opts; + } + + protected async run() { + const audioConfig: AudioConfig = { + audioEncoding: this.#opts.encoding, + bitrate: this.#opts.bitRate, + sampleRateHertz: this.#opts.sampleRate, + temperature: this.#opts.temperature, + speakingRate: this.#opts.speakingRate, + }; + + const bodyParams: SynthesizeRequest = { + text: this.inputText, + voiceId: this.#opts.voice, + modelId: this.#opts.model, + audioConfig: audioConfig, + temperature: this.#opts.temperature, + timestampType: this.#opts.timestampType, + applyTextNormalization: this.#opts.textNormalization, + }; + + const url = new URL('tts/v1/voice:stream', this.#opts.baseURL); + + const response = await fetch(url.toString(), { + method: 'POST', + headers: { + Authorization: this.#tts.authorization, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(bodyParams), + }); + + if (!response.ok) { + throw new Error(`Inworld API error: ${response.status} ${response.statusText}`); + } + + if (!response.body) { + throw new Error('No response body'); + } + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + + const requestId = shortuuid(); + const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS); + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; + + for (const line of lines) { + if (line.trim()) { + try { + const data = JSON.parse(line); + if (data.result) { + if (data.result.audioContent) { + const audio = Buffer.from(data.result.audioContent, 'base64'); + + let pcmData = audio; + if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { + // This is a WAV header, skip 44 bytes + pcmData = audio.subarray(44); + } + + for (const frame of bstream.write( + pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), + )) { + this.queue.put({ + requestId, + segmentId: requestId, + frame, + final: false, + }); + } + } + } else if (data.error) { + throw new Error(data.error.message); + } + } catch (e) { + log().warn({ error: e, line }, 'Failed to parse Inworld chunk'); + } + } + } + } + + // Flush remaining frames + for (const frame of bstream.flush()) { + this.queue.put({ + requestId, + segmentId: requestId, + frame, + final: false, + }); + } + } +} + +class SynthesizeStream extends tts.SynthesizeStream { + #opts: TTSOptions; + #tts: TTS; + #contextId: string; + label = 'inworld.SynthesizeStream'; + + constructor(ttsInstance: TTS, opts: TTSOptions) { + super(ttsInstance); + this.#tts = ttsInstance; + this.#opts = opts; + this.#contextId = shortuuid(); + } + + protected async run() { + const ws = await this.#tts.pool.getConnection(); + const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS); + const tokenizerStream = this.#opts.tokenizer!.stream(); + + let resolveProcessing: () => void; + let rejectProcessing: (err: Error) => void; + const processing = new Promise((resolve, reject) => { + resolveProcessing = resolve; + rejectProcessing = reject; + }); + + const handleMessage = (msg: InworldMessage) => { + const result = msg.result; + if (!result) return; + + if (result.contextCreated) { + // context created + } else if (result.contextClosed) { + resolveProcessing(); + } else if (result.audioChunk) { + if (result.audioChunk.timestampInfo) { + // Log word timestamps if available + const tsInfo = result.audioChunk.timestampInfo; + if (tsInfo.wordAlignment) { + const words = tsInfo.wordAlignment.words || []; + const starts = tsInfo.wordAlignment.wordStartTimeSeconds || []; + const ends = tsInfo.wordAlignment.wordEndTimeSeconds || []; + + for (let i = 0; i < words.length; i++) { + // console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); + } + + (this.#tts as any).emit('alignment', { + requestId: this.#contextId, + segmentId: this.#contextId, + wordAlignment: { words, starts, ends } + }); + } + + if (tsInfo.characterAlignment) { + const chars = tsInfo.characterAlignment.characters || []; + const starts = tsInfo.characterAlignment.characterStartTimeSeconds || []; + const ends = tsInfo.characterAlignment.characterEndTimeSeconds || []; + + (this.#tts as any).emit('alignment', { + requestId: this.#contextId, + segmentId: this.#contextId, + characterAlignment: { chars, starts, ends } + }); + } + } + + if (result.audioChunk.audioContent) { + // Some servers may return either nested audioContent or top-level + const b64Content = result.audioChunk.audioContent || result.audioContent; + if (b64Content) { + const audio = Buffer.from(b64Content, 'base64'); + let pcmData = audio; + if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { + // This is a WAV header, skip 44 bytes + pcmData = audio.subarray(44); + } + for (const frame of bstream.write( + pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), + )) { + this.queue.put({ + requestId: this.#contextId, + segmentId: this.#contextId, + frame, + final: false, + }); + } + } + } + } else if (result.status && result.status.code !== 0) { + const error = new Error(`Inworld stream error: ${result.status.message}`); + rejectProcessing(error); + } + }; + + this.#tts.pool.registerListener(this.#contextId, handleMessage); + + const sendLoop = async () => { + for await (const ev of tokenizerStream) { + await this.#sendText(ws, ev.token); + } + }; + const sendPromise = sendLoop(); + + try { + await this.#createContext(ws); + + for await (const text of this.input) { + if (text === tts.SynthesizeStream.FLUSH_SENTINEL) { + tokenizerStream.flush(); + } else { + tokenizerStream.pushText(text); + } + } + tokenizerStream.endInput(); + await sendPromise; + + await this.#flushContext(ws); + await this.#closeContext(ws); + await processing; + + // Flush remaining frames + for (const frame of bstream.flush()) { + this.queue.put({ + requestId: this.#contextId, + segmentId: this.#contextId, + frame, + final: false, + }); + } + + } catch (e) { + log().error({ error: e }, 'Error in SynthesizeStream run'); + throw e; + } finally { + this.#tts.pool.unregisterListener(this.#contextId); + } + } + + #send(ws: WebSocket, data: object): Promise { + return new Promise((resolve, reject) => { + if (ws.readyState !== WebSocket.OPEN) { + reject(new Error('WebSocket is not open')); + return; + } + ws.send(JSON.stringify(data), (err) => { + if (err) { + reject(err); + } else { + resolve(); + } + }); + }); + } + + #createContext(ws: WebSocket): Promise { + const config: CreateContextConfig = { + voiceId: this.#opts.voice, + modelId: this.#opts.model, + audioConfig: { + audioEncoding: this.#opts.encoding, + sampleRateHertz: this.#opts.sampleRate, + bitrate: this.#opts.bitRate, + speakingRate: this.#opts.speakingRate, + }, + temperature: this.#opts.temperature, + bufferCharThreshold: this.#opts.bufferCharThreshold, + maxBufferDelayMs: this.#opts.maxBufferDelayMs, + timestampType: this.#opts.timestampType, + applyTextNormalization: this.#opts.textNormalization, + }; + + return this.#send(ws, { create: config, contextId: this.#contextId }); + } + + #sendText(ws: WebSocket, text: string): Promise { + return this.#send(ws, { + send_text: { text }, + contextId: this.#contextId, + }); + } + + #flushContext(ws: WebSocket): Promise { + return this.#send(ws, { + flush_context: {}, + contextId: this.#contextId, + }); + } + + #closeContext(ws: WebSocket): Promise { + return this.#send(ws, { + close_context: {}, + contextId: this.#contextId, + }); + } +} diff --git a/plugins/inworld/tsconfig.json b/plugins/inworld/tsconfig.json new file mode 100644 index 00000000..a1617574 --- /dev/null +++ b/plugins/inworld/tsconfig.json @@ -0,0 +1,17 @@ +{ + "extends": "../../tsconfig.json", + "include": ["./src"], + "compilerOptions": { + // match output dir to input dir. e.g. dist/index instead of dist/src/index + "rootDir": "./src", + "declarationDir": "./dist", + "outDir": "./dist" + }, + "typedocOptions": { + "name": "plugins/agents-plugin-inworld", + "entryPointStrategy": "resolve", + "readme": "none", + "entryPoints": ["src/index.ts"] + } +} + diff --git a/plugins/inworld/tsup.config.ts b/plugins/inworld/tsup.config.ts new file mode 100644 index 00000000..96bdaec8 --- /dev/null +++ b/plugins/inworld/tsup.config.ts @@ -0,0 +1,8 @@ +import { defineConfig } from 'tsup'; + +import defaults from '../../tsup.config'; + +export default defineConfig({ + ...defaults, +}); + diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3a81f39b..39b53189 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -236,6 +236,9 @@ importers: '@livekit/agents-plugin-google': specifier: workspace:* version: link:../plugins/google + '@livekit/agents-plugin-inworld': + specifier: workspace:* + version: link:../plugins/inworld '@livekit/agents-plugin-livekit': specifier: workspace:* version: link:../plugins/livekit @@ -499,6 +502,31 @@ importers: specifier: ^5.0.0 version: 5.4.5 + plugins/inworld: + dependencies: + ws: + specifier: ^8.16.0 + version: 8.18.3 + devDependencies: + '@livekit/agents': + specifier: workspace:* + version: link:../../agents + '@livekit/rtc-node': + specifier: ^0.13.12 + version: 0.13.13 + '@microsoft/api-extractor': + specifier: ^7.35.0 + version: 7.43.7(@types/node@22.15.30) + '@types/ws': + specifier: ^8.5.10 + version: 8.5.10 + tsup: + specifier: ^8.3.5 + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@22.15.30))(postcss@8.4.38)(tsx@4.20.4)(typescript@5.4.5) + typescript: + specifier: ^5.0.0 + version: 5.4.5 + plugins/livekit: dependencies: '@huggingface/hub': @@ -1437,92 +1465,78 @@ packages: resolution: {integrity: sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.0': resolution: {integrity: sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.0': resolution: {integrity: sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.0': resolution: {integrity: sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.0': resolution: {integrity: sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.0': resolution: {integrity: sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.0': resolution: {integrity: sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-linux-arm64@0.34.3': resolution: {integrity: sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-linux-arm@0.34.3': resolution: {integrity: sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-linux-ppc64@0.34.3': resolution: {integrity: sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-linux-s390x@0.34.3': resolution: {integrity: sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-linux-x64@0.34.3': resolution: {integrity: sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-linuxmusl-arm64@0.34.3': resolution: {integrity: sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-linuxmusl-x64@0.34.3': resolution: {integrity: sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-wasm32@0.34.3': resolution: {integrity: sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==} @@ -1647,14 +1661,12 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@livekit/rtc-node-linux-x64-gnu@0.13.13': resolution: {integrity: sha512-B/SgbeBRobpA5LqmDEoBJHpRXePpoF4RO4F0zJf9BdkDhOR0j77p6hD0ZiOuPTRoBzUqukpsTszp+lZnHoNmiA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@livekit/rtc-node-win32-x64-msvc@0.13.13': resolution: {integrity: sha512-ygVYV4eHczs3QdaW/p0ADhhm7InUDhFaCYk8OzzIn056ZibZPXzvPizCThZqs8VsDniA01MraZF3qhZZb8IyRg==} @@ -1981,121 +1993,101 @@ packages: resolution: {integrity: sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-gnueabihf@4.40.0': resolution: {integrity: sha512-y/qUMOpJxBMy8xCXD++jeu8t7kzjlOCkoxxajL58G62PJGBZVl/Gwpm7JK9+YvlB701rcQTzjUZ1JgUoPTnoQA==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.17.2': resolution: {integrity: sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm-musleabihf@4.40.0': resolution: {integrity: sha512-GoCsPibtVdJFPv/BOIvBKO/XmwZLwaNWdyD8TKlXuqp0veo2sHE+A/vpMQ5iSArRUz/uaoj4h5S6Pn0+PdhRjg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.17.2': resolution: {integrity: sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-gnu@4.40.0': resolution: {integrity: sha512-L5ZLphTjjAD9leJzSLI7rr8fNqJMlGDKlazW2tX4IUF9P7R5TMQPElpH82Q7eNIDQnQlAyiNVfRPfP2vM5Avvg==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.17.2': resolution: {integrity: sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-musl@4.40.0': resolution: {integrity: sha512-ATZvCRGCDtv1Y4gpDIXsS+wfFeFuLwVxyUBSLawjgXK2tRE6fnsQEkE4csQQYWlBlsFztRzCnBvWVfcae/1qxQ==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.40.0': resolution: {integrity: sha512-wG9e2XtIhd++QugU5MD9i7OnpaVb08ji3P1y/hNbxrQ3sYEelKJOq1UJ5dXczeo6Hj2rfDEL5GdtkMSVLa/AOg==} cpu: [loong64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.17.2': resolution: {integrity: sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.40.0': resolution: {integrity: sha512-vgXfWmj0f3jAUvC7TZSU/m/cOE558ILWDzS7jBhiCAFpY2WEBn5jqgbqvmzlMjtp8KlLcBlXVD2mkTSEQE6Ixw==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.17.2': resolution: {integrity: sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.40.0': resolution: {integrity: sha512-uJkYTugqtPZBS3Z136arevt/FsKTF/J9dEMTX/cwR7lsAW4bShzI2R0pJVw+hcBTWF4dxVckYh72Hk3/hWNKvA==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.40.0': resolution: {integrity: sha512-rKmSj6EXQRnhSkE22+WvrqOqRtk733x3p5sWpZilhmjnkHkpeCgWsFFo0dGnUGeA+OZjRl3+VYq+HyCOEuwcxQ==} cpu: [riscv64] os: [linux] - libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.17.2': resolution: {integrity: sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-s390x-gnu@4.40.0': resolution: {integrity: sha512-SpnYlAfKPOoVsQqmTFJ0usx0z84bzGOS9anAC0AZ3rdSo3snecihbhFTlJZ8XMwzqAcodjFU4+/SM311dqE5Sw==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.17.2': resolution: {integrity: sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.40.0': resolution: {integrity: sha512-RcDGMtqF9EFN8i2RYN2W+64CdHruJ5rPqrlYw+cgM3uOVPSsnAQps7cpjXe9be/yDp8UC7VLoCoKC8J3Kn2FkQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-musl@4.17.2': resolution: {integrity: sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-linux-x64-musl@4.40.0': resolution: {integrity: sha512-HZvjpiUmSNx5zFgwtQAV1GaGazT2RWvqeDi0hV+AtC8unqqDSsaFjPxfsO6qPtKRRg25SisACWnJ37Yio8ttaw==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.17.2': resolution: {integrity: sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA==} From e09963f14beeb8cf605c4350846e940558631393 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 28 Nov 2025 15:16:14 -0800 Subject: [PATCH 2/3] chore: lint fix and format --- examples/src/inworld_tts.ts | 203 +++++++++++++++--------------- plugins/inworld/src/index.ts | 3 +- plugins/inworld/src/tts.ts | 237 +++++++++++++++++------------------ turbo.json | 3 +- 4 files changed, 223 insertions(+), 223 deletions(-) diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts index faedf829..20960d04 100644 --- a/examples/src/inworld_tts.ts +++ b/examples/src/inworld_tts.ts @@ -2,111 +2,116 @@ // // SPDX-License-Identifier: Apache-2.0 import { - type JobContext, - type JobProcess, - WorkerOptions, - cli, - defineAgent, - metrics, - voice, - } from '@livekit/agents'; - import * as livekit from '@livekit/agents-plugin-livekit'; - import * as inworld from '@livekit/agents-plugin-inworld'; - import * as silero from '@livekit/agents-plugin-silero'; - import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; - import { fileURLToPath } from 'node:url'; - - export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, - entry: async (ctx: JobContext) => { - const agent = new voice.Agent({ - instructions: - "You are a helpful assistant, you can hear the user's message and respond to it in 1-2 short sentences." - }); + type JobContext, + type JobProcess, + WorkerOptions, + cli, + defineAgent, + metrics, + voice, +} from '@livekit/agents'; +import * as inworld from '@livekit/agents-plugin-inworld'; +import * as livekit from '@livekit/agents-plugin-livekit'; +import * as silero from '@livekit/agents-plugin-silero'; +import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; +import { fileURLToPath } from 'node:url'; - // Create TTS instance - const tts = new inworld.TTS({ - timestampType: 'WORD', - voice: 'Hades', - model: 'inworld-tts-1', - encoding: 'LINEAR16', - textNormalization: 'ON', - bitRate: 64000, - sampleRate: 24000, - speakingRate: 1.0, - temperature: 1.1, - bufferCharThreshold: 100, - maxBufferDelayMs: 3000 - }); +export default defineAgent({ + prewarm: async (proc: JobProcess) => { + proc.userData.vad = await silero.VAD.load(); + }, + entry: async (ctx: JobContext) => { + const agent = new voice.Agent({ + instructions: + "You are a helpful assistant, you can hear the user's message and respond to it in 1-2 short sentences.", + }); - // List available voices (non-blocking) - tts.listVoices().then((voices: inworld.Voice[]) => { + // Create TTS instance + const tts = new inworld.TTS({ + timestampType: 'WORD', + voice: 'Hades', + model: 'inworld-tts-1', + encoding: 'LINEAR16', + textNormalization: 'ON', + bitRate: 64000, + sampleRate: 24000, + speakingRate: 1.0, + temperature: 1.1, + bufferCharThreshold: 100, + maxBufferDelayMs: 3000, + }); + + // List available voices (non-blocking) + tts + .listVoices() + .then((voices: inworld.Voice[]) => { console.log(`[Inworld TTS] ${voices.length} voices available in this workspace`); if (voices.length > 0) { - console.log('[Inworld TTS] Logging information for first voice:', JSON.stringify(voices[0], null, 2)); + console.log( + '[Inworld TTS] Logging information for first voice:', + JSON.stringify(voices[0], null, 2), + ); } - }).catch((err: Error) => { + }) + .catch((err: Error) => { console.error('[Inworld TTS] Failed to list voices:', err); }); - - const session = new voice.AgentSession({ - // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand - // See all available models at https://docs.livekit.io/agents/models/stt/ - stt: 'assemblyai/universal-streaming:en', - // A Large Language Model (LLM) is your agent's brain, processing user input and generating a response - // See all available models at https://docs.livekit.io/agents/models/llm/ - llm: 'openai/gpt-4.1-mini', - // Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear - // See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/ - tts, - // VAD and turn detection are used to determine when the user is speaking and when the agent should respond - // See more at https://docs.livekit.io/agents/build/turns - vad: ctx.proc.userData.vad! as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), - // to use realtime model, replace the stt, llm, tts and vad with the following - // llm: new openai.realtime.RealtimeModel(), - voiceOptions: { - // allow the LLM to generate a response while waiting for the end of turn - preemptiveGeneration: true, - }, - }); - - // timestamp handling for inworld TTS - session.tts!.on('alignment' as any, (data: any) => { - if (data.wordAlignment) { - const { words, starts, ends } = data.wordAlignment; - for (let i = 0; i < words.length; i++) { - console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); - } + + const session = new voice.AgentSession({ + // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand + // See all available models at https://docs.livekit.io/agents/models/stt/ + stt: 'assemblyai/universal-streaming:en', + // A Large Language Model (LLM) is your agent's brain, processing user input and generating a response + // See all available models at https://docs.livekit.io/agents/models/llm/ + llm: 'openai/gpt-4.1-mini', + // Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear + // See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/ + tts, + // VAD and turn detection are used to determine when the user is speaking and when the agent should respond + // See more at https://docs.livekit.io/agents/build/turns + vad: ctx.proc.userData.vad! as silero.VAD, + turnDetection: new livekit.turnDetector.MultilingualModel(), + // to use realtime model, replace the stt, llm, tts and vad with the following + // llm: new openai.realtime.RealtimeModel(), + voiceOptions: { + // allow the LLM to generate a response while waiting for the end of turn + preemptiveGeneration: true, + }, + }); + + // timestamp handling for inworld TTS + session.tts!.on('alignment' as any, (data: any) => { + if (data.wordAlignment) { + const { words, starts, ends } = data.wordAlignment; + for (let i = 0; i < words.length; i++) { + console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); } - if (data.characterAlignment) { - const { chars, starts, ends } = data.characterAlignment; - for (let i = 0; i < chars.length; i++) { - console.log(`[Inworld TTS] Char: "${chars[i]}", Start: ${starts[i]}, End: ${ends[i]}`); - } + } + if (data.characterAlignment) { + const { chars, starts, ends } = data.characterAlignment; + for (let i = 0; i < chars.length; i++) { + console.log(`[Inworld TTS] Char: "${chars[i]}", Start: ${starts[i]}, End: ${ends[i]}`); } - }); - - const usageCollector = new metrics.UsageCollector(); - - session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { - metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); - }); - - await session.start({ - agent, - room: ctx.room, - inputOptions: { - noiseCancellation: BackgroundVoiceCancellation(), - }, - }); - - session.say('Hello, how can I help you today?'); - }, - }); - - cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); - \ No newline at end of file + } + }); + + const usageCollector = new metrics.UsageCollector(); + + session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { + metrics.logMetrics(ev.metrics); + usageCollector.collect(ev.metrics); + }); + + await session.start({ + agent, + room: ctx.room, + inputOptions: { + noiseCancellation: BackgroundVoiceCancellation(), + }, + }); + + session.say('Hello, how can I help you today?'); + }, +}); + +cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); diff --git a/plugins/inworld/src/index.ts b/plugins/inworld/src/index.ts index 932950e2..5d0b0b5b 100644 --- a/plugins/inworld/src/index.ts +++ b/plugins/inworld/src/index.ts @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import { Plugin } from '@livekit/agents'; -import { TTS } from './tts.js'; +import './tts.js'; export * from './tts.js'; @@ -17,4 +17,3 @@ class InworldPlugin extends Plugin { } Plugin.registerPlugin(new InworldPlugin()); - diff --git a/plugins/inworld/src/tts.ts b/plugins/inworld/src/tts.ts index ee0f7f78..fa2b56c2 100644 --- a/plugins/inworld/src/tts.ts +++ b/plugins/inworld/src/tts.ts @@ -1,19 +1,13 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { - AudioByteStream, - log, - shortuuid, - tts, - tokenize, -} from '@livekit/agents'; +import { AudioByteStream, log, shortuuid, tokenize, tts } from '@livekit/agents'; import { type RawData, WebSocket } from 'ws'; const DEFAULT_BIT_RATE = 64000; const DEFAULT_ENCODING = 'LINEAR16'; const DEFAULT_MODEL = 'inworld-tts-1'; -const DEFAULT_SAMPLE_RATE = 24000; +const DEFAULT_SAMPLE_RATE = 24000; const DEFAULT_URL = 'https://api.inworld.ai/'; const DEFAULT_WS_URL = 'wss://api.inworld.ai/'; const DEFAULT_VOICE = 'Ashley'; @@ -236,7 +230,7 @@ class WSConnectionPool { this.#listeners.get(contextId)!(json); } } else if (json.error) { - this.#logger.warn({ error: json.error }, 'Inworld received error message'); + this.#logger.warn({ error: json.error }, 'Inworld received error message'); } } catch (e) { this.#logger.warn({ error: e }, 'Failed to parse Inworld WebSocket message'); @@ -332,18 +326,18 @@ export class TTS extends tts.TTS { updateOptions(opts: Partial) { this.#opts = { ...this.#opts, ...opts }; if (opts.apiKey) { - this.#authorization = `Basic ${opts.apiKey}`; - // If API key changes, we might need to reset the pool or create a new one? - // For now, assume WS url doesn't change or we just create new pool if needed. - // But existing pool has hardcoded auth in constructor. - // Re-creating pool is safer. - this.#pool.close(); - this.#pool = new WSConnectionPool(this.#opts.wsURL, this.#authorization); + this.#authorization = `Basic ${opts.apiKey}`; + // If API key changes, we might need to reset the pool or create a new one? + // For now, assume WS url doesn't change or we just create new pool if needed. + // But existing pool has hardcoded auth in constructor. + // Re-creating pool is safer. + this.#pool.close(); + this.#pool = new WSConnectionPool(this.#opts.wsURL, this.#authorization); } } async close() { - this.#pool.close(); + this.#pool.close(); } } @@ -378,7 +372,7 @@ class ChunkedStream extends tts.ChunkedStream { }; const url = new URL('tts/v1/voice:stream', this.#opts.baseURL); - + const response = await fetch(url.toString(), { method: 'POST', headers: { @@ -399,7 +393,7 @@ class ChunkedStream extends tts.ChunkedStream { const reader = response.body.getReader(); const decoder = new TextDecoder(); let buffer = ''; - + const requestId = shortuuid(); const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS); @@ -412,40 +406,40 @@ class ChunkedStream extends tts.ChunkedStream { buffer = lines.pop() || ''; for (const line of lines) { - if (line.trim()) { - try { - const data = JSON.parse(line); - if (data.result) { - if (data.result.audioContent) { - const audio = Buffer.from(data.result.audioContent, 'base64'); - - let pcmData = audio; - if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { - // This is a WAV header, skip 44 bytes - pcmData = audio.subarray(44); - } - - for (const frame of bstream.write( - pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), - )) { - this.queue.put({ - requestId, - segmentId: requestId, - frame, - final: false, - }); - } - } - } else if (data.error) { - throw new Error(data.error.message); + if (line.trim()) { + try { + const data = JSON.parse(line); + if (data.result) { + if (data.result.audioContent) { + const audio = Buffer.from(data.result.audioContent, 'base64'); + + let pcmData = audio; + if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { + // This is a WAV header, skip 44 bytes + pcmData = audio.subarray(44); + } + + for (const frame of bstream.write( + pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), + )) { + this.queue.put({ + requestId, + segmentId: requestId, + frame, + final: false, + }); } - } catch (e) { - log().warn({ error: e, line }, 'Failed to parse Inworld chunk'); } + } else if (data.error) { + throw new Error(data.error.message); } + } catch (e) { + log().warn({ error: e, line }, 'Failed to parse Inworld chunk'); + } + } } } - + // Flush remaining frames for (const frame of bstream.flush()) { this.queue.put({ @@ -479,8 +473,8 @@ class SynthesizeStream extends tts.SynthesizeStream { let resolveProcessing: () => void; let rejectProcessing: (err: Error) => void; const processing = new Promise((resolve, reject) => { - resolveProcessing = resolve; - rejectProcessing = reject; + resolveProcessing = resolve; + rejectProcessing = reject; }); const handleMessage = (msg: InworldMessage) => { @@ -488,87 +482,89 @@ class SynthesizeStream extends tts.SynthesizeStream { if (!result) return; if (result.contextCreated) { - // context created + // context created } else if (result.contextClosed) { - resolveProcessing(); + resolveProcessing(); } else if (result.audioChunk) { - if (result.audioChunk.timestampInfo) { - // Log word timestamps if available - const tsInfo = result.audioChunk.timestampInfo; - if (tsInfo.wordAlignment) { - const words = tsInfo.wordAlignment.words || []; - const starts = tsInfo.wordAlignment.wordStartTimeSeconds || []; - const ends = tsInfo.wordAlignment.wordEndTimeSeconds || []; - - for (let i = 0; i < words.length; i++) { - // console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); - } - - (this.#tts as any).emit('alignment', { - requestId: this.#contextId, - segmentId: this.#contextId, - wordAlignment: { words, starts, ends } - }); - } - - if (tsInfo.characterAlignment) { - const chars = tsInfo.characterAlignment.characters || []; - const starts = tsInfo.characterAlignment.characterStartTimeSeconds || []; - const ends = tsInfo.characterAlignment.characterEndTimeSeconds || []; - - (this.#tts as any).emit('alignment', { - requestId: this.#contextId, - segmentId: this.#contextId, - characterAlignment: { chars, starts, ends } - }); - } + if (result.audioChunk.timestampInfo) { + // Log word timestamps if available + const tsInfo = result.audioChunk.timestampInfo; + if (tsInfo.wordAlignment) { + const words = tsInfo.wordAlignment.words || []; + const starts = tsInfo.wordAlignment.wordStartTimeSeconds || []; + const ends = tsInfo.wordAlignment.wordEndTimeSeconds || []; + + for (let i = 0; i < words.length; i++) { + // console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (this.#tts as any).emit('alignment', { + requestId: this.#contextId, + segmentId: this.#contextId, + wordAlignment: { words, starts, ends }, + }); } - if (result.audioChunk.audioContent) { - // Some servers may return either nested audioContent or top-level - const b64Content = result.audioChunk.audioContent || result.audioContent; - if (b64Content) { - const audio = Buffer.from(b64Content, 'base64'); - let pcmData = audio; - if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { - // This is a WAV header, skip 44 bytes - pcmData = audio.subarray(44); - } - for (const frame of bstream.write( - pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), - )) { - this.queue.put({ - requestId: this.#contextId, - segmentId: this.#contextId, - frame, - final: false, - }); - } - } + if (tsInfo.characterAlignment) { + const chars = tsInfo.characterAlignment.characters || []; + const starts = tsInfo.characterAlignment.characterStartTimeSeconds || []; + const ends = tsInfo.characterAlignment.characterEndTimeSeconds || []; + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (this.#tts as any).emit('alignment', { + requestId: this.#contextId, + segmentId: this.#contextId, + characterAlignment: { chars, starts, ends }, + }); + } + } + + if (result.audioChunk.audioContent) { + // Some servers may return either nested audioContent or top-level + const b64Content = result.audioChunk.audioContent || result.audioContent; + if (b64Content) { + const audio = Buffer.from(b64Content, 'base64'); + let pcmData = audio; + if (audio.length > 44 && audio.subarray(0, 4).toString() === 'RIFF') { + // This is a WAV header, skip 44 bytes + pcmData = audio.subarray(44); + } + for (const frame of bstream.write( + pcmData.buffer.slice(pcmData.byteOffset, pcmData.byteOffset + pcmData.byteLength), + )) { + this.queue.put({ + requestId: this.#contextId, + segmentId: this.#contextId, + frame, + final: false, + }); + } } + } } else if (result.status && result.status.code !== 0) { - const error = new Error(`Inworld stream error: ${result.status.message}`); - rejectProcessing(error); + const error = new Error(`Inworld stream error: ${result.status.message}`); + rejectProcessing(error); } }; this.#tts.pool.registerListener(this.#contextId, handleMessage); const sendLoop = async () => { - for await (const ev of tokenizerStream) { - await this.#sendText(ws, ev.token); - } + for await (const ev of tokenizerStream) { + await this.#sendText(ws, ev.token); + } }; const sendPromise = sendLoop(); try { await this.#createContext(ws); - + for await (const text of this.input) { if (text === tts.SynthesizeStream.FLUSH_SENTINEL) { - tokenizerStream.flush(); + tokenizerStream.flush(); } else { - tokenizerStream.pushText(text); + tokenizerStream.pushText(text); } } tokenizerStream.endInput(); @@ -577,20 +573,19 @@ class SynthesizeStream extends tts.SynthesizeStream { await this.#flushContext(ws); await this.#closeContext(ws); await processing; - + // Flush remaining frames for (const frame of bstream.flush()) { - this.queue.put({ - requestId: this.#contextId, - segmentId: this.#contextId, - frame, - final: false, - }); + this.queue.put({ + requestId: this.#contextId, + segmentId: this.#contextId, + frame, + final: false, + }); } - } catch (e) { - log().error({ error: e }, 'Error in SynthesizeStream run'); - throw e; + log().error({ error: e }, 'Error in SynthesizeStream run'); + throw e; } finally { this.#tts.pool.unregisterListener(this.#contextId); } diff --git a/turbo.json b/turbo.json index 6d1d09b5..6c420a36 100644 --- a/turbo.json +++ b/turbo.json @@ -47,7 +47,8 @@ "ANAM_AVATAR_ID", "RIME_API_KEY", "LK_OPENAI_DEBUG", - "OVHCLOUD_API_KEY" + "OVHCLOUD_API_KEY", + "INWORLD_API_KEY" ], "pipeline": { "build": { From 687075c2264801651985633cc8fe1b27ec7eb269 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 28 Nov 2025 15:23:23 -0800 Subject: [PATCH 3/3] clean up code and comments --- examples/src/inworld_tts.ts | 4 ++-- plugins/inworld/src/tts.ts | 15 +-------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts index 20960d04..bb365c4c 100644 --- a/examples/src/inworld_tts.ts +++ b/examples/src/inworld_tts.ts @@ -41,7 +41,7 @@ export default defineAgent({ maxBufferDelayMs: 3000, }); - // List available voices (non-blocking) + // List available voices tts .listVoices() .then((voices: inworld.Voice[]) => { @@ -79,7 +79,7 @@ export default defineAgent({ }, }); - // timestamp handling for inworld TTS + // timestamp handling (if enabled) session.tts!.on('alignment' as any, (data: any) => { if (data.wordAlignment) { const { words, starts, ends } = data.wordAlignment; diff --git a/plugins/inworld/src/tts.ts b/plugins/inworld/src/tts.ts index fa2b56c2..c924b64f 100644 --- a/plugins/inworld/src/tts.ts +++ b/plugins/inworld/src/tts.ts @@ -192,7 +192,6 @@ class WSConnectionPool { #attemptConnection(): Promise { return new Promise((resolve, reject) => { const wsUrl = new URL('tts/v1/voice:streamBidirectional', this.#url); - // Ensure protocol is wss if (wsUrl.protocol === 'https:') wsUrl.protocol = 'wss:'; else if (wsUrl.protocol === 'http:') wsUrl.protocol = 'ws:'; @@ -224,7 +223,6 @@ class WSConnectionPool { const json = JSON.parse(data.toString()) as InworldMessage; const result = json.result; if (result) { - // Try to find contextId in result or top level const contextId = result.contextId || json.contextId; if (contextId && this.#listeners.has(contextId)) { this.#listeners.get(contextId)!(json); @@ -288,7 +286,7 @@ export class TTS extends tts.TTS { } /** - * List all available voices. + * List all available voices in the workspace associated with the API key. * @param language - Optional ISO 639-1 language code to filter voices (e.g., 'en', 'es', 'fr') */ async listVoices(language?: string): Promise { @@ -327,10 +325,6 @@ export class TTS extends tts.TTS { this.#opts = { ...this.#opts, ...opts }; if (opts.apiKey) { this.#authorization = `Basic ${opts.apiKey}`; - // If API key changes, we might need to reset the pool or create a new one? - // For now, assume WS url doesn't change or we just create new pool if needed. - // But existing pool has hardcoded auth in constructor. - // Re-creating pool is safer. this.#pool.close(); this.#pool = new WSConnectionPool(this.#opts.wsURL, this.#authorization); } @@ -482,22 +476,16 @@ class SynthesizeStream extends tts.SynthesizeStream { if (!result) return; if (result.contextCreated) { - // context created } else if (result.contextClosed) { resolveProcessing(); } else if (result.audioChunk) { if (result.audioChunk.timestampInfo) { - // Log word timestamps if available const tsInfo = result.audioChunk.timestampInfo; if (tsInfo.wordAlignment) { const words = tsInfo.wordAlignment.words || []; const starts = tsInfo.wordAlignment.wordStartTimeSeconds || []; const ends = tsInfo.wordAlignment.wordEndTimeSeconds || []; - for (let i = 0; i < words.length; i++) { - // console.log(`[Inworld TTS] Word: "${words[i]}", Start: ${starts[i]}, End: ${ends[i]}`); - } - // eslint-disable-next-line @typescript-eslint/no-explicit-any (this.#tts as any).emit('alignment', { requestId: this.#contextId, @@ -521,7 +509,6 @@ class SynthesizeStream extends tts.SynthesizeStream { } if (result.audioChunk.audioContent) { - // Some servers may return either nested audioContent or top-level const b64Content = result.audioChunk.audioContent || result.audioContent; if (b64Content) { const audio = Buffer.from(b64Content, 'base64');