From 068ae5ae52c506ae95de8fbdebf16f885da63a9e Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 16 Nov 2025 20:02:10 +0100 Subject: [PATCH 1/2] markdown and emoji filters + advanced preprocessing --- agents/src/voice/agent_activity.ts | 3 + agents/src/voice/agent_session.ts | 3 + agents/src/voice/generation.ts | 10 +- agents/src/voice/index.ts | 1 + agents/src/voice/transcription/index.ts | 4 + .../voice/transcription/transforms.test.ts | 103 ++++ agents/src/voice/transcription/transforms.ts | 295 +++++++++++ .../transcription/transforms_agnostic.test.ts | 243 +++++++++ .../transcription/transforms_agnostic.ts | 439 ++++++++++++++++ .../voice/transcription/transforms_de.test.ts | 244 +++++++++ .../src/voice/transcription/transforms_de.ts | 401 +++++++++++++++ .../voice/transcription/transforms_en.test.ts | 267 ++++++++++ .../src/voice/transcription/transforms_en.ts | 477 ++++++++++++++++++ 13 files changed, 2489 insertions(+), 1 deletion(-) create mode 100644 agents/src/voice/transcription/transforms.test.ts create mode 100644 agents/src/voice/transcription/transforms.ts create mode 100644 agents/src/voice/transcription/transforms_agnostic.test.ts create mode 100644 agents/src/voice/transcription/transforms_agnostic.ts create mode 100644 agents/src/voice/transcription/transforms_de.test.ts create mode 100644 agents/src/voice/transcription/transforms_de.ts create mode 100644 agents/src/voice/transcription/transforms_en.test.ts create mode 100644 agents/src/voice/transcription/transforms_en.ts diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 799de0e4..5ba068ff 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1201,6 +1201,7 @@ export class AgentActivity implements RecognitionHooks { audioSource, modelSettings, replyAbortController, + this.agentSession.options.ttsTextTransforms || null, ); tasks.push(ttsTask); @@ -1314,6 +1315,7 @@ export class AgentActivity implements RecognitionHooks 
{ ttsTextInput, modelSettings, replyAbortController, + this.agentSession.options.ttsTextTransforms || null, ); tasks.push(ttsTask); } @@ -1700,6 +1702,7 @@ export class AgentActivity implements RecognitionHooks { ttsTextInput, modelSettings, abortController, + this.agentSession.options.ttsTextTransforms || null, ); tasks.push(ttsTask); realtimeAudioResult = ttsStream; diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index b005a2f1..44e4ab80 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -49,6 +49,7 @@ import { AgentInput, AgentOutput } from './io.js'; import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js'; import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; +import { DEFAULT_TTS_TEXT_TRANSFORMS, type TextTransformSpec } from './transcription/transforms.js'; export interface VoiceOptions { allowInterruptions: boolean; @@ -60,6 +61,7 @@ export interface VoiceOptions { maxToolSteps: number; preemptiveGeneration: boolean; userAwayTimeout?: number | null; + ttsTextTransforms?: TextTransformSpec[] | null; } const defaultVoiceOptions: VoiceOptions = { @@ -72,6 +74,7 @@ const defaultVoiceOptions: VoiceOptions = { maxToolSteps: 3, preemptiveGeneration: false, userAwayTimeout: 15.0, + ttsTextTransforms: DEFAULT_TTS_TEXT_TRANSFORMS, } as const; export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index ca0a96b2..817774c4 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -27,6 +27,7 @@ import type { AgentSession } from './agent_session.js'; import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js'; import { RunContext } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; +import { type TextTransformSpec, 
applyTextTransforms } from './transcription/index.js'; /** @internal */ export class _LLMGenerationData { @@ -474,6 +475,7 @@ export function performTTSInference( text: ReadableStream, modelSettings: ModelSettings, controller: AbortController, + textTransforms?: readonly TextTransformSpec[] | null, ): [Task, ReadableStream] { const audioStream = new IdentityTransform(); const outputWriter = audioStream.writable.getWriter(); @@ -484,7 +486,13 @@ export function performTTSInference( let ttsStream: ReadableStream | null = null; try { - ttsStream = await node(text, modelSettings); + // Apply text transforms + let transformedText = text; + if (textTransforms && textTransforms.length > 0) { + transformedText = await applyTextTransforms(text, textTransforms); + } + + ttsStream = await node(transformedText, modelSettings); if (ttsStream === null) { await outputWriter.close(); return; diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index decd22bb..78cf0228 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -9,3 +9,4 @@ export * from './events.js'; export * from './report.js'; export * from './room_io/index.js'; export { RunContext } from './run_context.js'; +export * from './transcription/index.js'; diff --git a/agents/src/voice/transcription/index.ts b/agents/src/voice/transcription/index.ts index ae60aa3d..defddf3e 100644 --- a/agents/src/voice/transcription/index.ts +++ b/agents/src/voice/transcription/index.ts @@ -2,3 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 export * from './_utils.js'; +export * from './transforms.js'; +export * from './transforms_agnostic.js'; +export * from './transforms_en.js'; +export * from './transforms_de.js'; diff --git a/agents/src/voice/transcription/transforms.test.ts b/agents/src/voice/transcription/transforms.test.ts new file mode 100644 index 00000000..1d07baca --- /dev/null +++ b/agents/src/voice/transcription/transforms.test.ts @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, 
Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { ReadableStream } from 'node:stream/web'; +import { describe, expect, it } from 'vitest'; +import { + DEFAULT_TTS_TEXT_TRANSFORMS, + applyTextTransforms, + getAvailableTransforms, +} from './transforms.js'; + +/** + * Helper to convert a string to a ReadableStream + */ +function stringToStream(text: string): ReadableStream { + return new ReadableStream({ + start(controller) { + controller.enqueue(text); + controller.close(); + }, + }); +} + +/** + * Helper to read a stream to a string + */ +async function streamToString(stream: ReadableStream): Promise { + const reader = stream.getReader(); + let result = ''; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + result += value; + } + return result; +} + +describe('Text Transforms Core', () => { + it('should export DEFAULT_TTS_TEXT_TRANSFORMS', () => { + expect(DEFAULT_TTS_TEXT_TRANSFORMS).toBeDefined(); + expect(DEFAULT_TTS_TEXT_TRANSFORMS).toEqual(['filter_markdown', 'filter_emoji']); + }); + + it('should list available transforms for English', () => { + const transforms = getAvailableTransforms('en'); + expect(transforms.has('filter_markdown')).toBe(true); + expect(transforms.has('filter_emoji')).toBe(true); + expect(transforms.has('format_numbers')).toBe(true); + expect(transforms.has('format_dollar_amounts')).toBe(true); + }); + + it('should list available transforms for German', () => { + const transforms = getAvailableTransforms('de'); + expect(transforms.has('filter_markdown')).toBe(true); + expect(transforms.has('filter_emoji')).toBe(true); + expect(transforms.has('format_numbers_de')).toBe(true); + expect(transforms.has('format_euro_amounts')).toBe(true); + }); + + it('should throw error for invalid transform name', async () => { + const stream = stringToStream('test'); + await expect( + applyTextTransforms(stream, ['invalid_transform' as any], { language: 'en' }), + ).rejects.toThrow('Invalid transform'); + }); + + 
it('should apply custom transform function', async () => { + const customTransform = (text: ReadableStream) => { + return new ReadableStream({ + async start(controller) { + const reader = text.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (done) { + controller.close(); + break; + } + controller.enqueue(value.toUpperCase()); + } + }, + }); + }; + + const stream = stringToStream('hello world'); + const result = await applyTextTransforms(stream, [customTransform]); + const output = await streamToString(result); + expect(output).toBe('HELLO WORLD'); + }); + + it('should apply multiple transforms in sequence', async () => { + const stream = stringToStream('**Price: $5** 🎉'); + const result = await applyTextTransforms( + stream, + ['filter_markdown', 'filter_emoji', 'format_dollar_amounts'], + { language: 'en' }, + ); + const output = await streamToString(result); + expect(output).toContain('Price:'); + expect(output).toContain('five dollars'); + expect(output).not.toContain('**'); + expect(output).not.toContain('🎉'); + }); +}); diff --git a/agents/src/voice/transcription/transforms.ts b/agents/src/voice/transcription/transforms.ts new file mode 100644 index 00000000..5166a3b1 --- /dev/null +++ b/agents/src/voice/transcription/transforms.ts @@ -0,0 +1,295 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { ReadableStream } from 'node:stream/web'; +// Import transform implementations (will be created next) +import { languageAgnosticTransforms } from './transforms_agnostic.js'; +import { germanTransforms } from './transforms_de.js'; +import { englishTransforms } from './transforms_en.js'; + +/** + * Language codes supported by the transform system + */ +export type Language = 'en' | 'de' | string; + +/** + * Base type for all text transforms + */ +export type TextTransform = (text: ReadableStream) => ReadableStream; + +/** + * Configuration for language-specific transforms + */ +export interface LanguageTransformConfig { + /** + * The language code for language-specific transforms + * Defaults to 'en' (English) + */ + language?: Language; +} + +/** + * Built-in language-agnostic transform names + */ +export type LanguageAgnosticTransformName = + | 'filter_markdown' + | 'filter_emoji' + | 'remove_angle_bracket_content' + | 'replace_newlines_with_periods' + | 'format_emails' + | 'format_phone_numbers' + | 'format_times'; + +/** + * Built-in English-specific transform names + */ +export type EnglishTransformName = + | 'format_numbers' + | 'format_dollar_amounts' + | 'format_percentages' + | 'format_distances' + | 'format_units' + | 'format_dates' + | 'format_acronyms'; + +/** + * Built-in German-specific transform names + */ +export type GermanTransformName = + | 'format_numbers_de' + | 'format_euro_amounts' + | 'format_percentages_de' + | 'format_distances_de' + | 'format_units_de' + | 'format_dates_de'; + +/** + * Union of all built-in transform names + */ +export type BuiltInTransformName = + | LanguageAgnosticTransformName + | EnglishTransformName + | GermanTransformName; + +/** + * Text transform specification - can be a built-in name or a custom function + */ +export type TextTransformSpec = BuiltInTransformName | TextTransform; + +/** + * Default transforms applied to TTS text + */ +export const 
DEFAULT_TTS_TEXT_TRANSFORMS: BuiltInTransformName[] = [ + 'filter_markdown', + 'filter_emoji', +]; + +/** + * Get recommended TTS text transforms for a specific language + * + * This helper returns a curated set of transforms that work well for TTS + * in the specified language, including both language-agnostic and + * language-specific transforms. + * + * @param language - The language code (e.g., 'en', 'de') + * @returns Array of recommended transform names + * + * @example + * ```typescript + * // Get transforms for English + * const transforms = getRecommendedTTSTransforms('en'); + * // Returns: ['filter_markdown', 'filter_emoji', 'format_numbers', 'format_dollar_amounts', ...] + * + * // Get transforms for German + * const transforms = getRecommendedTTSTransforms('de'); + * // Returns: ['filter_markdown', 'filter_emoji', 'format_numbers_de', 'format_euro_amounts', ...] + * ``` + */ +export function getRecommendedTTSTransforms(language: Language = 'en'): BuiltInTransformName[] { + const baseTransforms: BuiltInTransformName[] = ['filter_markdown', 'filter_emoji']; + + const languageSpecificRecommendations: Record = { + en: [ + 'format_numbers', + 'format_dollar_amounts', + 'format_percentages', + 'format_distances', + 'format_units', + 'format_dates', + 'format_acronyms', + ], + de: [ + 'format_numbers_de', + 'format_euro_amounts', + 'format_percentages_de', + 'format_distances_de', + 'format_units_de', + 'format_dates_de', + ], + }; + + const langSpecific = languageSpecificRecommendations[language] || []; + return [...baseTransforms, ...langSpecific]; +} + +/** + * Apply a sequence of text transforms to a text stream + * + * @param text - Input text stream + * @param transforms - Array of transform names or custom transform functions + * @param config - Configuration for language-specific transforms + * @returns Transformed text stream + */ +export async function applyTextTransforms( + text: ReadableStream, + transforms: readonly TextTransformSpec[], + config: 
LanguageTransformConfig = {}, +): Promise> { + const { language = 'en' } = config; + let result = text; + + for (const transform of transforms) { + if (typeof transform === 'function') { + // Custom transform function + result = transform(result); + } else { + // Built-in transform name + const transformFn = getBuiltInTransform(transform, language); + if (!transformFn) { + throw new Error( + `Invalid transform: ${transform}. ` + + `Available transforms: ${Array.from(getAvailableTransforms(language)).join(', ')}`, + ); + } + result = transformFn(result); + } + } + + return result; +} + +/** + * Get a built-in transform function by name + */ +function getBuiltInTransform(name: BuiltInTransformName, language: Language): TextTransform | null { + // Check language-agnostic transforms first + const agnostic = languageAgnosticTransforms.get(name as LanguageAgnosticTransformName); + if (agnostic) { + return agnostic; + } + + // Check language-specific transforms + const langTransforms = languageSpecificTransforms.get(language); + if (langTransforms) { + return langTransforms.get(name) || null; + } + + return null; +} + +/** + * Get all available transform names for a given language + */ +export function getAvailableTransforms(language: Language = 'en'): Set { + const available = new Set(); + + // Add language-agnostic transforms + for (const name of languageAgnosticTransforms.keys()) { + available.add(name); + } + + // Add language-specific transforms + const langTransforms = languageSpecificTransforms.get(language); + if (langTransforms) { + for (const name of langTransforms.keys()) { + available.add(name); + } + } + + return available; +} + +/** + * Helper to create a transform function with buffering for sentence boundaries + * + * This is useful for transforms that need to see complete sentences or tokens + * before processing them. It buffers input until a sentence boundary is reached, + * then applies the regex pattern with optional preprocessing. 
+ * + * @param pattern - Regex pattern to match + * @param replacement - Replacement string or function + * @param options - Additional options for buffering and preprocessing + */ +export function createBufferedRegexTransform( + pattern: RegExp, + replacement: string | ((match: string, ...args: any[]) => string), + options: { + /** Buffer until these characters are encountered (sentence boundaries) */ + sentenceBoundaries?: string[]; + /** Preprocessing function applied before regex matching */ + preprocess?: (text: string) => string; + /** Minimum buffer size before attempting to process */ + minBufferSize?: number; + } = {}, +): TextTransform { + const { sentenceBoundaries = ['.', '!', '?', '\n'], preprocess, minBufferSize = 0 } = options; + + return (text: ReadableStream): ReadableStream => { + let buffer = ''; + + return new ReadableStream({ + async start(controller) { + try { + const reader = text.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + // Process remaining buffer + if (buffer.length > 0) { + let processed = preprocess ? preprocess(buffer) : buffer; + processed = + typeof replacement === 'function' + ? processed.replace(pattern, replacement as any) + : processed.replace(pattern, replacement); + controller.enqueue(processed); + } + controller.close(); + break; + } + + buffer += value; + + // Check if we have a sentence boundary + let lastBoundaryPos = -1; + for (const boundary of sentenceBoundaries) { + const pos = buffer.lastIndexOf(boundary); + lastBoundaryPos = Math.max(lastBoundaryPos, pos); + } + + // Process if we found a boundary and have enough buffer + if (lastBoundaryPos > 0 && buffer.length >= minBufferSize) { + const processable = buffer.substring(0, lastBoundaryPos + 1); + buffer = buffer.substring(lastBoundaryPos + 1); + + let processed = preprocess ? preprocess(processable) : processable; + processed = + typeof replacement === 'function' + ? 
processed.replace(pattern, replacement as any) + : processed.replace(pattern, replacement); + controller.enqueue(processed); + } + } + } catch (error) { + controller.error(error); + } + }, + }); + }; +} + +// Build the language-specific transforms registry +const languageSpecificTransforms = new Map>(); +languageSpecificTransforms.set('en', englishTransforms); +languageSpecificTransforms.set('de', germanTransforms); diff --git a/agents/src/voice/transcription/transforms_agnostic.test.ts b/agents/src/voice/transcription/transforms_agnostic.test.ts new file mode 100644 index 00000000..e494c577 --- /dev/null +++ b/agents/src/voice/transcription/transforms_agnostic.test.ts @@ -0,0 +1,243 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { ReadableStream } from 'node:stream/web'; +import { describe, expect, it } from 'vitest'; +import { + filterEmoji, + filterMarkdown, + formatEmails, + formatPhoneNumbers, + formatTimes, + removeAngleBracketContent, + replaceNewlinesWithPeriods, +} from './transforms_agnostic.js'; + +/** + * Helper to apply a transform and get the result + */ +async function applyTransform( + transform: (text: ReadableStream) => ReadableStream, + input: string, +): Promise { + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(input); + controller.close(); + }, + }); + + const result = transform(stream); + const reader = result.getReader(); + let output = ''; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + output += value; + } + return output; +} + +describe('filterMarkdown', () => { + it('should remove headers', async () => { + const result = await applyTransform(filterMarkdown, '# Header\n## Subheader\n'); + expect(result).toBe('Header\nSubheader\n'); + }); + + it('should remove bold asterisks', async () => { + const result = await applyTransform(filterMarkdown, 'This is **bold** text'); + expect(result).toBe('This is bold text'); + }); 
+ + it('should remove italic asterisks', async () => { + const result = await applyTransform(filterMarkdown, 'This is *italic* text'); + expect(result).toBe('This is italic text'); + }); + + it('should remove bold underscores', async () => { + const result = await applyTransform(filterMarkdown, 'This is __bold__ text'); + expect(result).toBe('This is bold text'); + }); + + it('should remove italic underscores', async () => { + const result = await applyTransform(filterMarkdown, 'This is _italic_ text'); + expect(result).toBe('This is italic text'); + }); + + it('should remove inline code', async () => { + const result = await applyTransform(filterMarkdown, 'Use `console.log()` function'); + expect(result).toBe('Use console.log() function'); + }); + + it('should remove code blocks', async () => { + const result = await applyTransform(filterMarkdown, '```javascript\ncode\n```'); + expect(result).toBe('\ncode\n'); + }); + + it('should extract link text', async () => { + const result = await applyTransform(filterMarkdown, 'Click [here](https://example.com)'); + expect(result).toBe('Click here'); + }); + + it('should extract image alt text', async () => { + const result = await applyTransform(filterMarkdown, '![Logo](logo.png)'); + expect(result).toBe('Logo'); + }); + + it('should remove list markers', async () => { + const result = await applyTransform(filterMarkdown, '- Item 1\n* Item 2\n+ Item 3\n'); + expect(result).toBe('Item 1\nItem 2\nItem 3\n'); + }); + + it('should remove block quotes', async () => { + const result = await applyTransform(filterMarkdown, '> Quote\n'); + expect(result).toBe('Quote\n'); + }); + + it('should remove strikethrough', async () => { + const result = await applyTransform(filterMarkdown, 'This is ~~crossed~~ text'); + expect(result).toBe('This is text'); + }); + + it('should handle complex mixed markdown', async () => { + const input = '# Title\n\nThis is **bold** and *italic* with `code` and [link](url).'; + const result = await 
applyTransform(filterMarkdown, input); + expect(result).not.toContain('**'); + expect(result).not.toContain('*'); + expect(result).not.toContain('`'); + expect(result).not.toContain('['); + expect(result).not.toContain(']'); + expect(result).toContain('Title'); + expect(result).toContain('bold'); + expect(result).toContain('italic'); + expect(result).toContain('code'); + expect(result).toContain('link'); + }); +}); + +describe('filterEmoji', () => { + it('should remove emoji', async () => { + const result = await applyTransform(filterEmoji, 'Hello 👋 World 🌍'); + expect(result).toBe('Hello World '); + }); + + it('should remove multiple emoji types', async () => { + const result = await applyTransform(filterEmoji, 'Party 🎉🎊🎈'); + expect(result).toBe('Party '); + }); + + it('should preserve text without emoji', async () => { + const result = await applyTransform(filterEmoji, 'Hello World'); + expect(result).toBe('Hello World'); + }); + + it('should handle text with mixed emoji', async () => { + const result = await applyTransform(filterEmoji, 'I ❤️ coding 💻 with ☕'); + expect(result).not.toContain('❤'); + expect(result).not.toContain('💻'); + expect(result).not.toContain('☕'); + expect(result).toContain('I'); + expect(result).toContain('coding'); + expect(result).toContain('with'); + }); +}); + +describe('removeAngleBracketContent', () => { + it('should remove HTML tags', async () => { + const result = await applyTransform(removeAngleBracketContent, '
text
'); + expect(result).toBe('text'); + }); + + it('should remove multiple tags', async () => { + const result = await applyTransform( + removeAngleBracketContent, + '

Hello World

', + ); + expect(result).toBe('Hello World'); + }); + + it('should preserve TTS tags', async () => { + const result = await applyTransform(removeAngleBracketContent, 'Say this'); + expect(result).toContain(' { + const result = await applyTransform(removeAngleBracketContent, 'Plain text'); + expect(result).toBe('Plain text'); + }); +}); + +describe('replaceNewlinesWithPeriods', () => { + it('should replace multiple newlines with period', async () => { + const result = await applyTransform(replaceNewlinesWithPeriods, 'Line 1\n\nLine 2'); + expect(result).toBe('Line 1. Line 2'); + }); + + it('should replace single newlines with space', async () => { + const result = await applyTransform(replaceNewlinesWithPeriods, 'Line 1\nLine 2'); + expect(result).toBe('Line 1 Line 2'); + }); + + it('should handle multiple consecutive newlines', async () => { + const result = await applyTransform(replaceNewlinesWithPeriods, 'A\n\n\nB'); + expect(result).toBe('A. B'); + }); +}); + +describe('formatEmails', () => { + it('should format email addresses', async () => { + const result = await applyTransform(formatEmails, 'Contact: john.doe@example.com'); + expect(result).toContain('john dot doe at example dot com'); + }); + + it('should handle multiple email addresses', async () => { + const result = await applyTransform(formatEmails, 'user1@test.com and user2@test.com'); + expect(result).toContain('user1 at test dot com'); + expect(result).toContain('user2 at test dot com'); + }); + + it('should preserve non-email text', async () => { + const result = await applyTransform(formatEmails, 'No email here'); + expect(result).toBe('No email here'); + }); +}); + +describe('formatPhoneNumbers', () => { + it('should format phone number with dashes', async () => { + const result = await applyTransform(formatPhoneNumbers, 'Call 555-123-4567'); + expect(result).toContain('5 5 5 1 2 3 4 5 6 7'); + }); + + it('should format phone number with parentheses', async () => { + const result = await 
applyTransform(formatPhoneNumbers, 'Call (555) 123-4567'); + expect(result).toContain('5 5 5 1 2 3 4 5 6 7'); + }); + + it('should format phone number with dots', async () => { + const result = await applyTransform(formatPhoneNumbers, 'Call 555.123.4567'); + expect(result).toContain('5 5 5 1 2 3 4 5 6 7'); + }); + + it('should preserve non-phone text', async () => { + const result = await applyTransform(formatPhoneNumbers, 'No phone here'); + expect(result).toBe('No phone here'); + }); +}); + +describe('formatTimes', () => { + it('should simplify times with 00 minutes', async () => { + const result = await applyTransform(formatTimes, 'Meeting at 14:00'); + expect(result).toBe('Meeting at 14'); + }); + + it('should preserve times with non-zero minutes', async () => { + const result = await applyTransform(formatTimes, 'Meeting at 14:30'); + expect(result).toBe('Meeting at 14:30'); + }); + + it('should handle multiple times', async () => { + const result = await applyTransform(formatTimes, '9:00 to 10:00 or 14:30'); + expect(result).toContain('9 to 10'); + expect(result).toContain('14:30'); + }); +}); diff --git a/agents/src/voice/transcription/transforms_agnostic.ts b/agents/src/voice/transcription/transforms_agnostic.ts new file mode 100644 index 00000000..880c7cf3 --- /dev/null +++ b/agents/src/voice/transcription/transforms_agnostic.ts @@ -0,0 +1,439 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { ReadableStream } from 'node:stream/web'; +import type { LanguageAgnosticTransformName, TextTransform } from './transforms.js'; + +/** + * Filter out markdown syntax from text + * + * Removes common markdown formatting like: + * - Headers (# text) + * - List markers (-, *, +) + * - Bold (**text**, __text__) + * - Italic (*text*, _text_) + * - Links ([text](url)) + * - Images (![alt](url)) + * - Code blocks (```code```) + * - Inline code (`code`) + * - Strikethrough (~~text~~) + * - Block quotes (> text) + */ +export const filterMarkdown: TextTransform = ( + text: ReadableStream, +): ReadableStream => { + // Line-level patterns (applied at start of lines) + const linePatterns: Array<[RegExp, string]> = [ + [/^#{1,6}\s+/gm, ''], // headers + [/^\s*[-+*]\s+/gm, ''], // list markers + [/^\s*>\s+/gm, ''], // block quotes + ]; + + // Inline patterns (applied anywhere in text) + const inlinePatterns: Array<[RegExp, string]> = [ + [/!\[([^\]]*)\]\([^)]*\)/g, '$1'], // images: keep alt text + [/\[([^\]]*)\]\([^)]*\)/g, '$1'], // links: keep text + [/(?!`~ ]$/.test(buffer)) { + return true; + } + + // Check for unpaired bold/italic asterisks + const doubleAsterisks = (buffer.match(/\*\*/g) || []).length; + if (doubleAsterisks % 2 === 1) return true; + + const singleAsterisks = (buffer.match(/\*/g) || []).length - doubleAsterisks * 2; + if (singleAsterisks % 2 === 1) return true; + + // Check for unpaired underscores + const doubleUnderscores = (buffer.match(/__/g) || []).length; + if (doubleUnderscores % 2 === 1) return true; + + const singleUnderscores = (buffer.match(/_/g) || []).length - doubleUnderscores * 2; + if (singleUnderscores % 2 === 1) return true; + + // Check for unpaired backticks + const backticks = (buffer.match(/`/g) || []).length; + if (backticks % 2 === 1) return true; + + // Check for unpaired tildes + const doubleTildes = (buffer.match(/~~/g) || []).length; + if (doubleTildes % 2 === 1) return true; + 
+ // Check for incomplete links/images + const openBrackets = (buffer.match(/\[/g) || []).length; + const completeLinks = (buffer.match(completeLinksPattern) || []).length; + const completeImages = (buffer.match(completeImagesPattern) || []).length; + + if (openBrackets - completeLinks - completeImages > 0) { + return true; + } + + return false; + } + + function processCompleteText(textToProcess: string, isNewline: boolean): string { + let processed = textToProcess; + + if (isNewline) { + for (const [pattern, replacement] of linePatterns) { + processed = processed.replace(pattern, replacement); + } + } + + for (const [pattern, replacement] of inlinePatterns) { + processed = processed.replace(pattern, replacement); + } + + return processed; + } + + return new ReadableStream({ + async start(controller) { + let buffer = ''; + let bufferIsNewline = true; + + try { + const reader = text.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + if (buffer.length > 0) { + controller.enqueue(processCompleteText(buffer, bufferIsNewline)); + } + controller.close(); + break; + } + + buffer += value; + + // Handle newlines + if (buffer.includes('\n')) { + const lines = buffer.split('\n'); + buffer = lines[lines.length - 1] || ''; + + for (let i = 0; i < lines.length - 1; i++) { + const isNewline = i === 0 ? 
bufferIsNewline : true;
+            const line = lines[i];
+            if (line !== undefined) {
+              const processedLine = processCompleteText(line, isNewline);
+              controller.enqueue(processedLine + '\n');
+            }
+          }
+
+          bufferIsNewline = true;
+          continue;
+        }
+
+        // Find last split token
+        let lastSplitPos = -1;
+        for (let i = buffer.length - 1; i >= 0; i--) {
+          const char = buffer[i];
+          if (char && splitTokens.has(char)) {
+            lastSplitPos = i;
+            break;
+          }
+        }
+
+        if (lastSplitPos >= 0) {
+          const processable = buffer.substring(0, lastSplitPos);
+          const rest = buffer.substring(lastSplitPos);
+
+          if (!hasIncompletePattern(processable)) {
+            controller.enqueue(processCompleteText(processable, bufferIsNewline));
+            buffer = rest;
+            bufferIsNewline = false;
+          }
+        }
+      }
+    } catch (error) {
+      controller.error(error);
+    }
+  },
+  });
+};
+
+/**
+ * Pipe every chunk of `text` through `mapChunk` and forward the result.
+ *
+ * Shared plumbing for the chunk-wise transforms below: it reads the source
+ * until it is exhausted, forwards read errors to the consumer and closes the
+ * output afterwards. Empty results are not enqueued.
+ *
+ * @param text - source stream
+ * @param mapChunk - called once per chunk, returns the text to emit for it
+ * @param flush - optional; called once after the last chunk so that stateful
+ *   transforms can emit text they were still holding back
+ */
+function mapTextStream(
+  text: ReadableStream<string>,
+  mapChunk: (chunk: string) => string,
+  flush?: () => string,
+): ReadableStream<string> {
+  return new ReadableStream<string>({
+    async start(controller) {
+      try {
+        const reader = text.getReader();
+
+        while (true) {
+          const { done, value } = await reader.read();
+
+          if (done) {
+            const tail = flush?.() ?? '';
+            if (tail.length > 0) {
+              controller.enqueue(tail);
+            }
+            controller.close();
+            break;
+          }
+
+          const mapped = mapChunk(value);
+          if (mapped.length > 0) {
+            controller.enqueue(mapped);
+          }
+        }
+      } catch (error) {
+        controller.error(error);
+      }
+    },
+  });
+}
+
+/**
+ * Filter out emoji characters from text
+ *
+ * Removes emoji characters from Unicode blocks including:
+ * - Emoji symbols and pictographs
+ * - Miscellaneous symbols
+ * - Dingbats
+ * - Variation selectors
+ * - Zero-width joiners and keycaps
+ */
+export const filterEmoji: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // Unicode emoji pattern covering major emoji blocks
+  const emojiPattern =
+    /[\u{1F000}-\u{1FBFF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{2B00}-\u{2BFF}]|[\u{FE00}-\u{FE0F}]|\u{200D}|\u{20E3}/gu;
+
+  return mapTextStream(text, (chunk) => chunk.replace(emojiPattern, ''));
+};
+
+/**
+ * Remove HTML-like angle bracket content from text
+ *
+ * Strips tags such as the `<div>` and `</div>` in `<div>text</div>` (the text
+ * between them is kept). Tags whose name is one of `break`, `spell`, `say-as`,
+ * `phoneme`, `prosody` or `emphasis` are passed through untouched so that TTS
+ * engines still receive them.
+ *
+ * Text is emitted as soon as it is known not to be part of a tag. Only a tag
+ * that has been opened but not yet closed at the end of a chunk is held back
+ * until the next chunk (or the end of the stream) arrives.
+ */
+export const removeAngleBracketContent: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // Preserve these TTS-specific tags
+  const preservedTags = new Set(['break', 'spell', 'say-as', 'phoneme', 'prosody', 'emphasis']);
+  const completeTag = /<\/?([a-zA-Z][a-zA-Z0-9-]*)[^>]*>/g;
+  // "<", "</" or "<name ..." with no closing ">" at the very end of the buffer.
+  // A "<" followed by anything else (e.g. "3 < 5") is ordinary text.
+  const openTagAtEnd = /<\/?(?:[a-zA-Z][^<>]*)?$/;
+  // Upper bound for a held-back fragment, so a stray "<word" cannot stall the stream
+  const maxHeldChars = 256;
+  let held = '';
+
+  const stripTags = (input: string): string =>
+    input.replace(completeTag, (tag: string, tagName: string) =>
+      preservedTags.has(tagName.toLowerCase()) ? tag : '',
+    );
+
+  return mapTextStream(
+    text,
+    (chunk) => {
+      const buffer = held + chunk;
+      const open = openTagAtEnd.exec(buffer);
+      const cut =
+        open !== null && buffer.length - open.index <= maxHeldChars ? open.index : buffer.length;
+
+      held = buffer.slice(cut);
+      return stripTags(buffer.slice(0, cut));
+    },
+    () => {
+      // Whatever is left never became a complete tag; emit it unchanged
+      const rest = held;
+      held = '';
+      return rest;
+    },
+  );
+};
+
+/**
+ * Replace newlines with periods for better TTS flow
+ *
+ * - Multiple consecutive newlines → ". "
+ * - Single newlines → " "
+ *
+ * Newlines at the end of a chunk are held back until the next chunk arrives,
+ * so a blank line that is split across two chunks is still recognised.
+ */
+export const replaceNewlinesWithPeriods: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  let heldNewlines = '';
+
+  const convert = (input: string): string =>
+    // Multiple newlines to period, then single newlines to space
+    input.replace(/\n{2,}/g, '. ').replace(/\n/g, ' ');
+
+  return mapTextStream(
+    text,
+    (chunk) => {
+      const buffer = heldNewlines + chunk;
+      const trailing = /\n+$/.exec(buffer);
+      const cut = trailing !== null ? trailing.index : buffer.length;
+
+      heldNewlines = buffer.slice(cut);
+      return convert(buffer.slice(0, cut));
+    },
+    () => {
+      const rest = convert(heldNewlines);
+      heldNewlines = '';
+      return rest;
+    },
+  );
+};
+
+/**
+ * Format email addresses for TTS
+ *
+ * Example: "john.doe@example.com" → "john dot doe at example dot com"
+ */
+export const formatEmails: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  const emailPattern = /\b[\w.+-]+@[\w.-]+\.[a-zA-Z]{2,}\b/g;
+
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(emailPattern, (email) => {
+      return email.replace(/\./g, ' dot ').replace(/@/g, ' at ');
+    }),
+  );
+};
+
+/**
+ * Format phone numbers for TTS
+ *
+ * Example: "555-123-4567" → "5 5 5 1 2 3 4 5 6 7"
+ */
+export const formatPhoneNumbers: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // Match phone number patterns
+  const phonePattern =
+    /\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b/g;
+
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(phonePattern, (phone) => {
+      // Extract only digits
+      const digits = phone.replace(/\D/g, '');
+      // Space them out
+      return digits.split('').join(' ');
+    }),
+  );
+};
+
+/**
+ * Format times for TTS
+ *
+ * Example: "14:00" → "14" (simplify when minutes are 00)
+ * Other times remain unchanged, including times with a seconds part:
+ * "14:00:30" must not collapse to "14:30".
+ */
+export const formatTimes: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // (?<!\d:) - not the minutes/seconds part of a longer time ("00:14:00")
+  // (?!:\d)  - not followed by a seconds part ("14:00:30")
+  const timePattern = /\b(?<!\d:)(\d{1,2}):00\b(?!:\d)/g;
+
+  return mapTextStream(text, (chunk) => chunk.replace(timePattern, '$1'));
+};
+
+/**
+ * Registry of all language-agnostic transforms
+ */
+export const languageAgnosticTransforms = new Map<string, TextTransform>([
+  ['filter_markdown', filterMarkdown],
+  ['filter_emoji', filterEmoji],
+  ['remove_angle_bracket_content', removeAngleBracketContent],
+  ['replace_newlines_with_periods', replaceNewlinesWithPeriods],
+  ['format_emails', formatEmails],
+  ['format_phone_numbers', formatPhoneNumbers],
+  ['format_times', formatTimes],
+]);
diff --git a/agents/src/voice/transcription/transforms_de.test.ts b/agents/src/voice/transcription/transforms_de.test.ts
new file mode 100644
index 00000000..462c5609
--- /dev/null
+++ b/agents/src/voice/transcription/transforms_de.test.ts
@@ -0,0 +1,244 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream } from 'node:stream/web';
+import { describe, expect, it } from 'vitest';
+import {
+  formatDatesDE,
+  formatDistancesDE,
+  formatEuroAmounts,
+  formatNumbersDE,
+  formatPercentagesDE,
+  formatUnitsDE,
+} from './transforms_de.js';
+
+/**
+ * Helper to apply a transform and get the result
+ */
+async function applyTransform(
+  transform: (text: ReadableStream<string>) => ReadableStream<string>,
+  input: string,
+): Promise<string> {
+  const stream = new ReadableStream<string>({
+    start(controller) {
+      controller.enqueue(input);
+      controller.close();
+    },
+  });
+
+  const result = transform(stream);
+  const reader = result.getReader();
+  let output = '';
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    output += value;
+  }
+  return output;
+}
+
+describe('formatNumbersDE (German)', () => {
+  it('should convert single digit numbers to German words', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Ich habe 5 Artikel');
+    expect(result).toBe('Ich habe fünf Artikel');
+  });
+
+  it('should convert teen numbers to German words', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Es sind 15 Leute');
+    expect(result).toBe('Es sind fünfzehn Leute');
+  });
+
+  it('should convert compound numbers with German reversed format', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Ich bin 21 Jahre alt');
+    expect(result).toBe('Ich bin einundzwanzig Jahre alt');
+  });
+
+  it('should handle 42 correctly', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Zahl 42');
+    expect(result).toBe('Zahl zweiundvierzig');
+  });
+
+  it('should preserve years', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Geboren 1995');
+    expect(result).toBe('Geboren 1995');
+  });
+
+  it('should preserve large numbers', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Bevölkerung: 150 Millionen');
+    expect(result).toBe('Bevölkerung: 150 Millionen');
+  });
+
+  it('should format decimal numbers with Komma', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Pi ist 3,14');
+    // Decimal digits are converted to words too
+    expect(result).toBe('Pi ist drei Komma eins vier');
+  });
+
+  it('should remove German thousands separators', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Gesamt: 1.234');
+    expect(result).toBe('Gesamt: 1234');
+  });
+
+  it('should remove every group of a multi-group number', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Gesamt: 1.234.567');
+    expect(result).toBe('Gesamt: 1234567');
+  });
+
+  it('should handle zero', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Anzahl: 0');
+    expect(result).toBe('Anzahl: null');
+  });
+
+  it('should handle dreißig (30)', async () => {
+    const result = await applyTransform(formatNumbersDE, 'Alter: 30');
+    expect(result).toBe('Alter: dreißig');
+  });
+});
+
+describe('formatEuroAmounts (German)', () => {
+  it('should format whole Euro amounts', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Preis: 5€');
+    expect(result).toBe('Preis: fünf Euro');
+  });
+
+  it('should format Euro amounts with cents', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Preis: 12,50€');
+    expect(result).toBe('Preis: zwölf Euro und fünfzig Cent');
+  });
+
+  it('should read a single decimal digit as tens of cents', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Preis: 12,5€');
+    expect(result).toBe('Preis: zwölf Euro und fünfzig Cent');
+  });
+
+  it('should handle one Euro', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Nur 1€');
+    expect(result).toBe('Nur ein Euro');
+  });
+
+  it('should handle zero Euro', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Gratis: 0€');
+    expect(result).toBe('Gratis: null Euro');
+  });
+
+  it('should format large amounts', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Gesamt: 99€');
+    expect(result).toContain('Euro');
+  });
+
+  it('should handle Euro with space before symbol', async () => {
+    const result = await applyTransform(formatEuroAmounts, 'Preis: 10 €');
+    expect(result).toBe('Preis: zehn Euro');
+  });
+});
+
+describe('formatPercentagesDE (German)', () => {
+  it('should format whole number percentages', async () => {
+    const result = await applyTransform(formatPercentagesDE, 'Rabatt: 50%');
+    expect(result).toBe('Rabatt: 50 Prozent');
+  });
+
+  it('should format decimal percentages with comma', async () => {
+    const result = await applyTransform(formatPercentagesDE, 'Rate: 3,5%');
+    expect(result).toBe('Rate: 3,5 Prozent');
+  });
+
+  it('should handle multiple percentages', async () => {
+    const result = await applyTransform(formatPercentagesDE, '10% bis 20%');
+    expect(result).toBe('10 Prozent bis 20 Prozent');
+  });
+});
+
+describe('formatDistancesDE (German)', () => {
+  it('should format kilometers', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Entfernung: 5 km');
+    expect(result).toBe('Entfernung: 5 Kilometer');
+  });
+
+  it('should format Meilen (miles)', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Lauf 10 mi');
+    expect(result).toBe('Lauf 10 Meilen');
+  });
+
+  it('should use the singular for exactly one mile', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Noch 1 mi');
+    expect(result).toBe('Noch 1 Meile');
+  });
+
+  it('should format Meter', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Höhe: 100 m');
+    expect(result).toBe('Höhe: 100 Meter');
+  });
+
+  it('should format Fuß (feet)', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Tiefe: 20 ft');
+    expect(result).toBe('Tiefe: 20 Fuß');
+  });
+
+  it('should handle decimal distances with comma', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Strecke: 3,5 km');
+    expect(result).toBe('Strecke: 3,5 Kilometer');
+  });
+
+  it('should remove German thousands separators', async () => {
+    const result = await applyTransform(formatDistancesDE, 'Weit: 1.000 km');
+    expect(result).toBe('Weit: 1000 Kilometer');
+  });
+});
+
+describe('formatUnitsDE (German)', () => {
+  it('should format Kilogramm', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Gewicht: 10 kg');
+    expect(result).toBe('Gewicht: zehn Kilogramm');
+  });
+
+  it('should use "ein" for exactly one unit', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Gewicht: 1 kg');
+    expect(result).toBe('Gewicht: ein Kilogramm');
+  });
+
+  it('should keep decimal amounts as digits', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Gewicht: 2,5 kg');
+    expect(result).toBe('Gewicht: 2,5 Kilogramm');
+  });
+
+  it('should not read 5G as grams', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Das 5G Netz');
+    expect(result).toBe('Das 5G Netz');
+  });
+
+  it('should format Pfund (pounds)', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Gewicht: 5 lb');
+    expect(result).toBe('Gewicht: fünf Pfund');
+  });
+
+  it('should format Gramm', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Masse: 50 g');
+    expect(result).toBe('Masse: fünfzig Gramm');
+  });
+
+  it('should format Liter', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Volumen: 2 l');
+    expect(result).toBe('Volumen: zwei Liter');
+  });
+
+  it('should format Milliliter', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Dosis: 10 ml');
+    expect(result).toBe('Dosis: zehn Milliliter');
+  });
+
+  it('should format Gallonen', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Tank: 15 gal');
+    expect(result).toBe('Tank: fünfzehn Gallonen');
+  });
+
+  it('should handle plural lbs', async () => {
+    const result = await applyTransform(formatUnitsDE, 'Gewicht: 10 lbs');
+    expect(result).toBe('Gewicht: zehn Pfund');
+  });
+});
+
+describe('formatDatesDE (German)', () => {
+  it('should format ISO dates in German', async () => {
+    const result = await applyTransform(formatDatesDE, 'Datum: 2024-12-25');
+    expect(result).toContain('Dezember');
+    expect(result).toContain('2024');
+  });
+
+  it('should include German day of week', async () => {
+    const result = await applyTransform(formatDatesDE, 'Datum: 2024-12-25');
+    expect(result).toContain('Mittwoch');
+  });
+
+  it('should use German date format (DD. Month YYYY)', async () => {
+    const result = await applyTransform(formatDatesDE, 'Datum: 2024-12-25');
+    expect(result).toContain('25. Dezember 2024');
+  });
+
+  it('should format multiple dates', async () => {
+    const result = await applyTransform(formatDatesDE, '2024-01-01 bis 2024-12-31');
+    expect(result).toContain('Januar');
+    expect(result).toContain('Dezember');
+  });
+
+  it('should handle leap years', async () => {
+    const result = await applyTransform(formatDatesDE, 'Schalttag: 2024-02-29');
+    expect(result).toContain('Februar');
+    expect(result).toContain('29');
+  });
+
+  it('should use German month names', async () => {
+    const result = await applyTransform(formatDatesDE, '2024-03-15');
+    expect(result).toContain('März');
+  });
+
+  it('should leave impossible dates untouched', async () => {
+    const result = await applyTransform(formatDatesDE, 'Datum: 2024-13-45');
+    expect(result).toBe('Datum: 2024-13-45');
+  });
+});
diff --git a/agents/src/voice/transcription/transforms_de.ts b/agents/src/voice/transcription/transforms_de.ts
new file mode 100644
index 00000000..c6613edb
--- /dev/null
+++ b/agents/src/voice/transcription/transforms_de.ts
@@ -0,0 +1,401 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream } from 'node:stream/web';
+import type { TextTransform } from './transforms.js';
+
+/**
+ * Number to word mappings for German (0-20 and the round tens up to 90)
+ */
+const NUMBER_TO_WORDS_DE: Record<number, string> = {
+  0: 'null',
+  1: 'eins',
+  2: 'zwei',
+  3: 'drei',
+  4: 'vier',
+  5: 'fünf',
+  6: 'sechs',
+  7: 'sieben',
+  8: 'acht',
+  9: 'neun',
+  10: 'zehn',
+  11: 'elf',
+  12: 'zwölf',
+  13: 'dreizehn',
+  14: 'vierzehn',
+  15: 'fünfzehn',
+  16: 'sechzehn',
+  17: 'siebzehn',
+  18: 'achtzehn',
+  19: 'neunzehn',
+  20: 'zwanzig',
+  30: 'dreißig',
+  40: 'vierzig',
+  50: 'fünfzig',
+  60: 'sechzig',
+  70: 'siebzig',
+  80: 'achtzig',
+  90: 'neunzig',
+};
+
+/**
+ * Unit digits as they appear inside compound numbers, where 1 is "ein"
+ * ("einundzwanzig") rather than "eins"
+ */
+const ONES_DE: Record<number, string> = {
+  1: 'ein',
+  2: 'zwei',
+  3: 'drei',
+  4: 'vier',
+  5: 'fünf',
+  6: 'sechs',
+  7: 'sieben',
+  8: 'acht',
+  9: 'neun',
+};
+
+/**
+ * Spell out an integer from 0 to 99 in German.
+ * Anything outside that range is returned as digits.
+ */
+function numberToWordsDE(num: number): string {
+  const word = NUMBER_TO_WORDS_DE[num];
+  if (word !== undefined) {
+    return word;
+  }
+  if (num < 100) {
+    const tens = Math.floor(num / 10) * 10;
+    const ones = num % 10;
+    const onesWord = ONES_DE[ones];
+    const tensWord = NUMBER_TO_WORDS_DE[tens];
+    // German numbers are reversed: 21 = "einundzwanzig" (one-and-twenty)
+    if (onesWord && tensWord) {
+      return `${onesWord}und${tensWord}`;
+    }
+  }
+  return num.toString();
+}
+
+/**
+ * Spell out a count that stands in front of a masculine or neuter noun:
+ * exactly one is "ein" ("ein Euro"), not "eins". Counts above 99 stay digits.
+ */
+function countToWordsDE(num: number): string {
+  if (num === 1) {
+    return 'ein';
+  }
+  return num <= 99 ? numberToWordsDE(num) : num.toString();
+}
+
+/**
+ * A number written with German thousands separators: one to three digits
+ * followed by groups of exactly three digits, all separated by dots or all by
+ * (non-breaking) spaces. The lookarounds keep the pattern away from longer
+ * digit runs such as dates ("25.12.2024") and version numbers.
+ */
+const THOUSANDS_GROUPS_DE =
+  /(?<![\w.,])\d{1,3}(?:(?:\.\d{3})+|(?:[ \u00A0\u202F]\d{3})+)(?!\d|\.\d)/g;
+
+/** Remove German thousands separators: "1.234.567" → "1234567" */
+function stripThousandsSeparatorsDE(input: string): string {
+  return input.replace(THOUSANDS_GROUPS_DE, (grouped) =>
+    grouped.replace(/[. \u00A0\u202F]/g, ''),
+  );
+}
+
+/**
+ * Pipe every chunk of `text` through `mapChunk` and forward the result.
+ *
+ * Shared plumbing for the transforms below: it reads the source until it is
+ * exhausted, forwards read errors to the consumer and closes the output
+ * afterwards. Empty results are not enqueued.
+ */
+function mapTextStream(
+  text: ReadableStream<string>,
+  mapChunk: (chunk: string) => string,
+): ReadableStream<string> {
+  return new ReadableStream<string>({
+    async start(controller) {
+      try {
+        const reader = text.getReader();
+
+        while (true) {
+          const { done, value } = await reader.read();
+
+          if (done) {
+            controller.close();
+            break;
+          }
+
+          const mapped = mapChunk(value);
+          if (mapped.length > 0) {
+            controller.enqueue(mapped);
+          }
+        }
+      } catch (error) {
+        controller.error(error);
+      }
+    },
+  });
+}
+
+/**
+ * Format numbers in German text for TTS
+ *
+ * - Small numbers (0-99) → German words
+ * - Larger numbers, which includes years → preserved as digits
+ * - Decimals → "X Komma Y Z..." (individual digits after decimal)
+ * - Removes dots/spaces from thousands separators
+ */
+export const formatNumbersDE: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  return mapTextStream(text, (chunk) => {
+    let processed = stripThousandsSeparatorsDE(chunk);
+
+    // Format decimal numbers (German uses comma as decimal separator)
+    processed = processed.replace(
+      /\b(\d+),(\d+)\b/g,
+      (match: string, whole: string, decimal: string) => {
+        const wholeNum = parseInt(whole, 10);
+        // Don't format years
+        if (wholeNum >= 1900 && wholeNum <= 2099) {
+          return match;
+        }
+
+        const wholePart = wholeNum <= 99 ? numberToWordsDE(wholeNum) : wholeNum.toString();
+        const decimalPart = decimal
+          .split('')
+          .map((digit) => numberToWordsDE(parseInt(digit, 10)))
+          .join(' ');
+        return `${wholePart} Komma ${decimalPart}`;
+      },
+    );
+
+    // Format whole numbers; anything above 99 (years included) stays as digits
+    return processed.replace(/\b\d+\b/g, (match) => {
+      const num = parseInt(match, 10);
+      return num > 99 ? match : numberToWordsDE(num);
+    });
+  });
+};
+
+/**
+ * Format Euro amounts for TTS in German
+ *
+ * Examples:
+ * - "5€" → "fünf Euro"
+ * - "12,50€" → "zwölf Euro und fünfzig Cent"
+ * - "12,5€" → "zwölf Euro und fünfzig Cent"
+ * - "12,00€" → "zwölf Euro"
+ * - "1€" → "ein Euro" (singular)
+ */
+export const formatEuroAmounts: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(/(\d+)(?:,(\d+))?\s*€/g, (_match: string, euros: string, cents?: string) => {
+      // Euro and Cent don't change in plural in German
+      const euroPart = `${countToWordsDE(parseInt(euros, 10))} Euro`;
+      if (cents === undefined) {
+        return euroPart;
+      }
+
+      // The fraction is a decimal, not a count: ",5" is 50 Cent, ",05" is 5 Cent
+      const centsNum = parseInt(cents.padEnd(2, '0').slice(0, 2), 10);
+      if (centsNum === 0) {
+        return euroPart;
+      }
+      return `${euroPart} und ${countToWordsDE(centsNum)} Cent`;
+    }),
+  );
+};
+
+/**
+ * Format percentages for TTS in German
+ *
+ * Examples: "67%" → "67 Prozent", "67 %" → "67 Prozent"
+ */
+export const formatPercentagesDE: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(/(\d+(?:,\d+)?)[ \u00A0\u202F]?%/g, '$1 Prozent'),
+  );
+};
+
+/**
+ * Format distance measurements for TTS in German
+ *
+ * Examples:
+ * - "5 km" → "5 Kilometer"
+ * - "10 mi" → "10 Meilen"
+ * - "1 mi" → "1 Meile"
+ * - "3,5 m" → "3,5 Meter"
+ */
+export const formatDistancesDE: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  const unitMap: Record<string, { one: string; many: string }> = {
+    km: { one: 'Kilometer', many: 'Kilometer' },
+    mi: { one: 'Meile', many: 'Meilen' },
+    m: { one: 'Meter', many: 'Meter' },
+    ft: { one: 'Fuß', many: 'Fuß' },
+    yd: { one: 'Yard', many: 'Yards' },
+  };
+  const distancePattern = /\b(\d+(?:,\d+)?)\s*(km|mi|m|ft|yd)\b/gi;
+
+  return mapTextStream(text, (chunk) =>
+    stripThousandsSeparatorsDE(chunk).replace(
+      distancePattern,
+      (match: string, amount: string, unit: string) => {
+        // A capital "M" after a number is not the metre symbol
+        if (unit === 'M') {
+          return match;
+        }
+        const names = unitMap[unit.toLowerCase()];
+        if (names === undefined) {
+          return match;
+        }
+        return `${amount} ${amount === '1' ? names.one : names.many}`;
+      },
+    ),
+  );
+};
+
+/**
+ * Format weight and volume units for TTS in German
+ *
+ * Examples:
+ * - "10 kg" → "zehn Kilogramm"
+ * - "1 kg" → "ein Kilogramm"
+ * - "2,5 lb" → "2,5 Pfund"
+ * - "500 ml" → "500 Milliliter"
+ */
+export const formatUnitsDE: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // `article` is the word used for exactly one unit ("ein Liter", "eine Gallone")
+  const unitMap: Record<string, { article: string; one: string; many: string }> = {
+    lb: { article: 'ein', one: 'Pfund', many: 'Pfund' },
+    lbs: { article: 'ein', one: 'Pfund', many: 'Pfund' },
+    oz: { article: 'eine', one: 'Unze', many: 'Unzen' },
+    kg: { article: 'ein', one: 'Kilogramm', many: 'Kilogramm' },
+    g: { article: 'ein', one: 'Gramm', many: 'Gramm' },
+    mg: { article: 'ein', one: 'Milligramm', many: 'Milligramm' },
+    l: { article: 'ein', one: 'Liter', many: 'Liter' },
+    ml: { article: 'ein', one: 'Milliliter', many: 'Milliliter' },
+    gal: { article: 'eine', one: 'Gallone', many: 'Gallonen' },
+  };
+  const unitPattern = /\b(\d+)(?:,(\d+))?\s*(lbs|lb|oz|kg|mg|ml|gal|g|l)\b/gi;
+
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(
+      unitPattern,
+      (match: string, whole: string, fraction: string | undefined, unit: string) => {
+        // A capital "G" after a number is not grams ("5G")
+        if (unit === 'G') {
+          return match;
+        }
+        const names = unitMap[unit.toLowerCase()];
+        if (names === undefined) {
+          return match;
+        }
+        // Decimal amounts keep their digits
+        if (fraction !== undefined) {
+          return `${whole},${fraction} ${names.many}`;
+        }
+
+        const count = parseInt(whole, 10);
+        if (count === 1) {
+          return `${names.article} ${names.one}`;
+        }
+        const word = count <= 99 ? numberToWordsDE(count) : whole;
+        return `${word} ${names.many}`;
+      },
+    ),
+  );
+};
+
+/**
+ * Format dates for TTS in German
+ *
+ * Example: "2024-12-25" → "Mittwoch, 25. Dezember 2024"
+ *
+ * Strings that look like ISO dates but are not real calendar dates
+ * (e.g. "2024-13-45") are left unchanged.
+ */
+export const formatDatesDE: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  const monthNames = [
+    'Januar',
+    'Februar',
+    'März',
+    'April',
+    'Mai',
+    'Juni',
+    'Juli',
+    'August',
+    'September',
+    'Oktober',
+    'November',
+    'Dezember',
+  ];
+
+  const dayNames = [
+    'Sonntag',
+    'Montag',
+    'Dienstag',
+    'Mittwoch',
+    'Donnerstag',
+    'Freitag',
+    'Samstag',
+  ];
+
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(
+      /\b(\d{4})-(\d{2})-(\d{2})\b/g,
+      (match: string, year: string, month: string, day: string) => {
+        const yearNum = parseInt(year, 10);
+        const monthNum = parseInt(month, 10);
+        const dayNum = parseInt(day, 10);
+        // UTC keeps the weekday independent of the host time zone
+        const date = new Date(Date.UTC(yearNum, monthNum - 1, dayNum));
+
+        // Date never throws on out-of-range parts, it rolls them over
+        // (month 13 → January of the next year). Detect that and bail out.
+        if (
+          date.getUTCFullYear() !== yearNum ||
+          date.getUTCMonth() !== monthNum - 1 ||
+          date.getUTCDate() !== dayNum
+        ) {
+          return match;
+        }
+
+        const dayName = dayNames[date.getUTCDay()];
+        const monthName = monthNames[monthNum - 1];
+        if (dayName === undefined || monthName === undefined) {
+          return match;
+        }
+        // German date format: "Mittwoch, 25. Dezember 2024"
+        return `${dayName}, ${dayNum}. ${monthName} ${year}`;
+      },
+    ),
+  );
+};
+
+/**
+ * Registry of all German-specific transforms
+ */
+export const germanTransforms = new Map<string, TextTransform>([
+  ['format_numbers_de', formatNumbersDE],
+  ['format_euro_amounts', formatEuroAmounts],
+  ['format_percentages_de', formatPercentagesDE],
+  ['format_distances_de', formatDistancesDE],
+  ['format_units_de', formatUnitsDE],
+  ['format_dates_de', formatDatesDE],
+]);
diff --git a/agents/src/voice/transcription/transforms_en.test.ts b/agents/src/voice/transcription/transforms_en.test.ts
new file mode 100644
index 00000000..d6c335c1
--- /dev/null
+++ b/agents/src/voice/transcription/transforms_en.test.ts
@@ -0,0 +1,267 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream } from 'node:stream/web';
+import { describe, expect, it } from 'vitest';
+import {
+  formatAcronyms,
+  formatDates,
+  formatDistances,
+  formatDollarAmounts,
+  formatNumbers,
+  formatPercentages,
+  formatUnits,
+} from './transforms_en.js';
+
+/**
+ * Helper to apply a transform and get the result
+ *
+ * Feeds `input` to the transform as a single chunk and returns the
+ * concatenation of everything the transform emits.
+ */
+async function applyTransform(
+  transform: (text: ReadableStream<string>) => ReadableStream<string>,
+  input: string,
+): Promise<string> {
+  const stream = new ReadableStream<string>({
+    start(controller) {
+      controller.enqueue(input);
+      controller.close();
+    },
+  });
+
+  const result = transform(stream);
+  const reader = result.getReader();
+  let output = '';
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    output += value;
+  }
+  return output;
+}
+
+describe('formatNumbers (English)', () => {
+  it('should convert single digit numbers to words', async () => {
+    const result = await applyTransform(formatNumbers, 'I have 5 items');
+    expect(result).toBe('I have five items');
+  });
+
+  it('should convert teen numbers to words', async () => {
+    const result = await applyTransform(formatNumbers, 'There are 15 people');
+    expect(result).toBe('There are fifteen people');
+  });
+
+  it('should convert tens to words', async () => {
+    const result = await applyTransform(formatNumbers, 'Count to 20 and 30');
+    expect(result).toBe('Count to twenty and thirty');
+  });
+
+  it('should convert compound numbers to words', async () => {
+    const result = await applyTransform(formatNumbers, 'I am 42 years old');
+    expect(result).toBe('I am forty-two years old');
+  });
+
+  it('should preserve years', async () => {
+    const result = await applyTransform(formatNumbers, 'Born in 1995');
+    expect(result).toBe('Born in 1995');
+  });
+
+  it('should preserve large numbers', async () => {
+    const result = await applyTransform(formatNumbers, 'Population: 150 million');
+    expect(result).toBe('Population: 150 million');
+  });
+
+  it('should format decimal numbers', async () => {
+    const result = await applyTransform(formatNumbers, 'Pi is 3.14');
+    expect(result).toBe('Pi is three point one four');
+  });
+
+  it('should remove commas from numbers', async () => {
+    const result = await applyTransform(formatNumbers, 'Total: 1,234');
+    expect(result).toBe('Total: 1234');
+  });
+
+  it('should handle zero', async () => {
+    const result = await applyTransform(formatNumbers, 'Count: 0');
+    expect(result).toBe('Count: zero');
+  });
+});
+
+describe('formatDollarAmounts (English)', () => {
+  it('should format whole dollar amounts', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Price: $5');
+    expect(result).toBe('Price: five dollars');
+  });
+
+  it('should format dollar amounts with cents', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Price: $12.50');
+    expect(result).toBe('Price: twelve dollars and fifty cents');
+  });
+
+  it('should use singular for one dollar', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Only $1');
+    expect(result).toBe('Only one dollar');
+  });
+
+  it('should use singular for one cent', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Cost: $0.01');
+    expect(result).toBe('Cost: zero dollars and one cent');
+  });
+
+  it('should handle large amounts', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Total: $999');
+    expect(result).toContain('dollars');
+  });
+
+  it('should format zero dollars', async () => {
+    const result = await applyTransform(formatDollarAmounts, 'Free: $0');
+    expect(result).toBe('Free: zero dollars');
+  });
+});
+
+describe('formatPercentages (English)', () => {
+  it('should format whole number percentages', async () => {
+    const result = await applyTransform(formatPercentages, 'Discount: 50%');
+    expect(result).toBe('Discount: 50 percent');
+  });
+
+  it('should format decimal percentages', async () => {
+    const result = await applyTransform(formatPercentages, 'Rate: 3.5%');
+    expect(result).toBe('Rate: 3.5 percent');
+  });
+
+  it('should handle multiple percentages', async () => {
+    const result = await applyTransform(formatPercentages, '10% to 20%');
+    expect(result).toBe('10 percent to 20 percent');
+  });
+});
+
+describe('formatDistances (English)', () => {
+  it('should format kilometers', async () => {
+    const result = await applyTransform(formatDistances, 'Distance: 5 km');
+    expect(result).toBe('Distance: 5 kilometers');
+  });
+
+  it('should format miles', async () => {
+    const result = await applyTransform(formatDistances, 'Run 10 mi');
+    expect(result).toBe('Run 10 miles');
+  });
+
+  it('should format meters', async () => {
+    const result = await applyTransform(formatDistances, 'Height: 100 m');
+    expect(result).toBe('Height: 100 meters');
+  });
+
+  it('should format feet', async () => {
+    const result = await applyTransform(formatDistances, 'Depth: 20 ft');
+    expect(result).toBe('Depth: 20 feet');
+  });
+
+  it('should format yards', async () => {
+    const result = await applyTransform(formatDistances, 'Length: 50 yd');
+    expect(result).toBe('Length: 50 yards');
+  });
+
+  it('should handle decimal distances', async () => {
+    const result = await applyTransform(formatDistances, 'Distance: 3.5 km');
+    expect(result).toBe('Distance: 3.5 kilometers');
+  });
+
+  it('should remove commas from distances', async () => {
+    const result = await applyTransform(formatDistances, 'Far: 1,000 km');
+    expect(result).toBe('Far: 1000 kilometers');
+  });
+});
+
+describe('formatUnits (English)', () => {
+  it('should format kilograms', async () => {
+    const result = await applyTransform(formatUnits, 'Weight: 10 kg');
+    expect(result).toBe('Weight: ten kilograms');
+  });
+
+  it('should format pounds', async () => {
+    const result = await applyTransform(formatUnits, 'Weight: 5 lb');
+    expect(result).toBe('Weight: five pounds');
+  });
+
+  it('should format grams', async () => {
+    const result = await applyTransform(formatUnits, 'Mass: 50 g');
+    expect(result).toBe('Mass: fifty grams');
+  });
+
+  it('should format liters', async () => {
+    const result = await applyTransform(formatUnits, 'Volume: 2 l');
+    expect(result).toBe('Volume: two liters');
+  });
+
+  it('should format milliliters', async () => {
+    const result = await applyTransform(formatUnits, 'Dose: 10 ml');
+    expect(result).toBe('Dose: ten milliliters');
+  });
+
+  it('should format gallons', async () => {
+    const result = await applyTransform(formatUnits, 'Tank: 15 gal');
+    expect(result).toBe('Tank: fifteen gallons');
+  });
+
+  it('should handle plural lbs', async () => {
+    const result = await applyTransform(formatUnits, 'Weight: 10 lbs');
+    expect(result).toBe('Weight: ten pounds');
+  });
+});
+
+describe('formatDates (English)', () => {
+  it('should format ISO dates', async () => {
+    const result = await applyTransform(formatDates, 'Date: 2024-12-25');
+    expect(result).toContain('December 25, 2024');
+  });
+
+  it('should include day of week', async () => {
+    const result = await applyTransform(formatDates, 'Date: 2024-12-25');
+    expect(result).toContain('Wednesday');
+  });
+
+  it('should format multiple dates', async () => {
+    const result = await applyTransform(formatDates, '2024-01-01 to 2024-12-31');
+    expect(result).toContain('January');
+    expect(result).toContain('December');
+  });
+
+  it('should handle leap years', async () => {
+    const result = await applyTransform(formatDates, 'Leap day: 2024-02-29');
+    expect(result).toContain('February 29');
+  });
+});
+
+describe('formatAcronyms (English)', () => {
+  it('should lowercase known acronyms', async () => {
+    const result = await applyTransform(formatAcronyms, 'NASA launched');
+    expect(result).toBe('nasa launched');
+  });
+
+  it('should lowercase acronyms with vowels', async () => {
+    const result = await applyTransform(formatAcronyms, 'SCUBA diving');
+    expect(result).toBe('scuba diving');
+  });
+
+  it('should space out consonant-only acronyms', async () => {
+    const result = await applyTransform(formatAcronyms, 'XYZ Corp');
+    expect(result).toBe('X Y Z Corp');
+  });
+
+  it('should handle API', async () => {
+    const result = await applyTransform(formatAcronyms, 'REST API');
+    // REST has vowels, so it becomes lowercase
+    expect(result).toBe('rest api');
+  });
+
+  it('should handle multiple acronyms', async () => {
+    const result = await applyTransform(formatAcronyms, 'NASA and FBI');
+    expect(result).toContain('nasa');
+    // FBI has vowel 'I', so it becomes lowercase
+    expect(result).toContain('fbi');
+  });
+
+  it('should preserve regular words', async () => {
+    const result = await applyTransform(formatAcronyms, 'Hello World');
+    expect(result).toBe('Hello World');
+  });
+});
diff --git a/agents/src/voice/transcription/transforms_en.ts b/agents/src/voice/transcription/transforms_en.ts
new file mode 100644
index 00000000..36f85c01
--- /dev/null
+++ b/agents/src/voice/transcription/transforms_en.ts
@@ -0,0 +1,477 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream } from 'node:stream/web';
+import type { TextTransform } from './transforms.js';
+
+/**
+ * Number to word mappings for 0-20 and the round tens up to 90
+ */
+const NUMBER_TO_WORDS: Record<number, string> = {
+  0: 'zero',
+  1: 'one',
+  2: 'two',
+  3: 'three',
+  4: 'four',
+  5: 'five',
+  6: 'six',
+  7: 'seven',
+  8: 'eight',
+  9: 'nine',
+  10: 'ten',
+  11: 'eleven',
+  12: 'twelve',
+  13: 'thirteen',
+  14: 'fourteen',
+  15: 'fifteen',
+  16: 'sixteen',
+  17: 'seventeen',
+  18: 'eighteen',
+  19: 'nineteen',
+  20: 'twenty',
+  30: 'thirty',
+  40: 'forty',
+  50: 'fifty',
+  60: 'sixty',
+  70: 'seventy',
+  80: 'eighty',
+  90: 'ninety',
+};
+
+/**
+ * Spell out an integer from 0 to 99 in English ("forty-two").
+ * Anything outside that range is returned as digits.
+ */
+function numberToWords(num: number): string {
+  const word = NUMBER_TO_WORDS[num];
+  if (word !== undefined) {
+    return word;
+  }
+  if (num < 100) {
+    const tens = Math.floor(num / 10) * 10;
+    const ones = num % 10;
+    const tensWord = NUMBER_TO_WORDS[tens];
+    const onesWord = NUMBER_TO_WORDS[ones];
+    if (tensWord && onesWord) {
+      return `${tensWord}-${onesWord}`;
+    }
+  }
+  return num.toString();
+}
+
+/**
+ * A number written with thousands commas: one to three digits followed by
+ * comma-separated groups of exactly three digits. Comma-separated lists such
+ * as "1,2,3" do not match.
+ */
+const THOUSANDS_GROUPS = /(?<![\w.,])\d{1,3}(?:,\d{3})+(?!\d|,\d)/g;
+
+/** Remove thousands commas: "1,234,567" → "1234567" */
+function stripThousandsCommas(input: string): string {
+  return input.replace(THOUSANDS_GROUPS, (grouped) => grouped.replace(/,/g, ''));
+}
+
+/**
+ * Pipe every chunk of `text` through `mapChunk` and forward the result.
+ *
+ * Shared plumbing for the transforms below: it reads the source until it is
+ * exhausted, forwards read errors to the consumer and closes the output
+ * afterwards. Empty results are not enqueued.
+ */
+function mapTextStream(
+  text: ReadableStream<string>,
+  mapChunk: (chunk: string) => string,
+): ReadableStream<string> {
+  return new ReadableStream<string>({
+    async start(controller) {
+      try {
+        const reader = text.getReader();
+
+        while (true) {
+          const { done, value } = await reader.read();
+
+          if (done) {
+            controller.close();
+            break;
+          }
+
+          const mapped = mapChunk(value);
+          if (mapped.length > 0) {
+            controller.enqueue(mapped);
+          }
+        }
+      } catch (error) {
+        controller.error(error);
+      }
+    },
+  });
+}
+
+/**
+ * Format numbers in text for TTS
+ *
+ * - Small numbers (0-99) → words
+ * - Larger numbers, which includes years → preserved as digits
+ * - Decimals → "X point Y Z..." (individual digits after decimal)
+ * - Removes thousands commas from numbers
+ */
+export const formatNumbers: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  return mapTextStream(text, (chunk) => {
+    let processed = stripThousandsCommas(chunk);
+
+    // Format decimal numbers
+    processed = processed.replace(
+      /\b(\d+)\.(\d+)\b/g,
+      (match: string, whole: string, decimal: string) => {
+        const wholeNum = parseInt(whole, 10);
+        // Don't format years
+        if (wholeNum >= 1900 && wholeNum <= 2099) {
+          return match;
+        }
+
+        const wholePart = wholeNum <= 99 ? numberToWords(wholeNum) : wholeNum.toString();
+        const decimalPart = decimal
+          .split('')
+          .map((digit) => numberToWords(parseInt(digit, 10)))
+          .join(' ');
+        return `${wholePart} point ${decimalPart}`;
+      },
+    );
+
+    // Format whole numbers; anything above 99 (years included) stays as digits
+    return processed.replace(/\b\d+\b/g, (match) => {
+      const num = parseInt(match, 10);
+      return num > 99 ? match : numberToWords(num);
+    });
+  });
+};
+
+/**
+ * Format dollar amounts for TTS
+ *
+ * Examples:
+ * - "$5" → "five dollars"
+ * - "$12.50" → "twelve dollars and fifty cents"
+ * - "$12.5" → "twelve dollars and fifty cents"
+ * - "$12.00" → "twelve dollars"
+ * - "$1,234" → "1234 dollars"
+ * - "$1" → "one dollar" (singular)
+ */
+export const formatDollarAmounts: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  // The dollar part is either comma-grouped ("1,234") or a plain digit run
+  const dollarPattern = /\$(\d{1,3}(?:,\d{3})+|\d+)(?:\.(\d+))?/g;
+
+  return mapTextStream(text, (chunk) =>
+    chunk.replace(dollarPattern, (_match: string, dollars: string, cents?: string) => {
+      const dollarNum = parseInt(dollars.replace(/,/g, ''), 10);
+      const dollarWord = dollarNum <= 99 ? numberToWords(dollarNum) : dollarNum.toString();
+      const dollarPart = `${dollarWord} ${dollarNum === 1 ? 'dollar' : 'dollars'}`;
+      if (cents === undefined) {
+        return dollarPart;
+      }
+
+      // The fraction is a decimal, not a count: ".5" is 50 cents, ".05" is 5 cents
+      const centsNum = parseInt(cents.padEnd(2, '0').slice(0, 2), 10);
+      if (centsNum === 0) {
+        return dollarPart;
+      }
+      const centsUnit = centsNum === 1 ? 'cent' : 'cents';
+      return `${dollarPart} and ${numberToWords(centsNum)} ${centsUnit}`;
+    }),
+  );
+};
+
+/**
+ * Format percentages for TTS
+ *
+ * Example: "67%" → "67 percent"
+ */
+export const formatPercentages: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  return mapTextStream(text, (chunk) => chunk.replace(/(\d+(?:\.\d+)?)%/g, '$1 percent'));
+};
+
+/**
+ * Format distance measurements for TTS
+ *
+ * Examples:
+ * - "5 km" → "5 kilometers"
+ * - "10 mi" → "10 miles"
+ * - "1 mi" → "1 mile"
+ * - "3.5 m" → "3.5 meters"
+ */
+export const formatDistances: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  const unitMap: Record<string, { one: string; many: string }> = {
+    km: { one: 'kilometer', many: 'kilometers' },
+    mi: { one: 'mile', many: 'miles' },
+    m: { one: 'meter', many: 'meters' },
+    ft: { one: 'foot', many: 'feet' },
+    yd: { one: 'yard', many: 'yards' },
+  };
+  const distancePattern = /\b(\d+(?:\.\d+)?)\s*(km|mi|m|ft|yd)\b/gi;
+
+  return mapTextStream(text, (chunk) =>
+    stripThousandsCommas(chunk).replace(
+      distancePattern,
+      (match: string, amount: string, unit: string) => {
+        // A capital "M" after a number usually means million ("5M users")
+        if (unit === 'M') {
+          return match;
+        }
+        const names = unitMap[unit.toLowerCase()];
+        if (names === undefined) {
+          return match;
+        }
+        return `${amount} ${amount === '1' ? names.one : names.many}`;
+      },
+    ),
+  );
+};
+
+/**
+ * Format weight and volume units for TTS
+ *
+ * Examples:
+ * - "10 kg" → "ten kilograms"
+ * - "2.5 lb" → "2.5 pounds"
+ * - "500 ml" → "500 milliliters"
+ */
+export const formatUnits: TextTransform = (
+  text: ReadableStream<string>,
+): ReadableStream<string> => {
+  const unitMap: Record<string, string> = {
+    lb: 'pounds',
+    lbs: 'pounds',
+    oz: 'ounces',
+    kg: 'kilograms',
+    g: 'grams',
+    mg: 'milligrams',
+    l: 'liters',
+    ml: 'milliliters',
+    gal: 'gallons',
+  };
+
+  return new ReadableStream({
+    async start(controller) {
+      try {
+        const reader = text.getReader();
+
+        while (true) {
+          const { done, value } = await reader.read();
+
+          if (done) {
+            controller.close();
+            break;
+          }
+
+          let processed = value;
+
+          // Format units
+          for (const [abbrev, full] of Object.entries(unitMap)) {
+            const pattern = new RegExp(`\\b(\\d+)\\s*${abbrev}\\b`, 'gi');
+            processed = processed.replace(pattern, (match, num) => {
+              const number = parseInt(num, 10);
+              const word = number <= 99 ?
numberToWords(number) : num; + return `${word} ${full}`; + }); + } + + controller.enqueue(processed); + } + } catch (error) { + controller.error(error); + } + }, + }); +}; + +/** + * Format dates for TTS + * + * Example: "2024-12-25" → "Wednesday, December 25, 2024" + */ +export const formatDates: TextTransform = ( + text: ReadableStream, +): ReadableStream => { + const monthNames = [ + 'January', + 'February', + 'March', + 'April', + 'May', + 'June', + 'July', + 'August', + 'September', + 'October', + 'November', + 'December', + ]; + + const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']; + + return new ReadableStream({ + async start(controller) { + try { + const reader = text.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + controller.close(); + break; + } + + const processed = value.replace( + /\b(\d{4})-(\d{2})-(\d{2})\b/g, + (match, year, month, day) => { + try { + const date = new Date(parseInt(year), parseInt(month) - 1, parseInt(day)); + const dayName = dayNames[date.getDay()]; + const monthName = monthNames[date.getMonth()]; + return `${dayName}, ${monthName} ${parseInt(day)}, ${year}`; + } catch { + return match; // Return original if parsing fails + } + }, + ); + + controller.enqueue(processed); + } + } catch (error) { + controller.error(error); + } + }, + }); +}; + +/** + * Common acronyms that should be spoken as words + */ +const KNOWN_ACRONYMS = new Set([ + 'NASA', + 'NATO', + 'UNICEF', + 'UNESCO', + 'SCUBA', + 'RADAR', + 'LASER', + 'API', + 'SDK', + 'JSON', + 'XML', + 'HTML', + 'CSS', + 'HTTP', + 'HTTPS', + 'FTP', + 'SQL', + 'URL', + 'URI', + 'PDF', + 'JPG', + 'JPEG', + 'PNG', + 'GIF', + 'MP3', + 'MP4', + 'CPU', + 'GPU', + 'RAM', + 'ROM', + 'SSD', + 'HDD', + 'USB', + 'DVD', + 'CD', + 'AI', + 'ML', + 'VR', + 'AR', +]); + +/** + * Format acronyms for TTS + * + * - Known acronyms (NASA, API, etc.) 
→ lowercase + * - Acronyms with vowels → lowercase + * - Consonant-only acronyms → space-separated letters + * + * Example: "XYZ" → "X Y Z", "NASA" → "nasa" + */ +export const formatAcronyms: TextTransform = ( + text: ReadableStream, +): ReadableStream => { + const vowels = new Set(['A', 'E', 'I', 'O', 'U']); + + return new ReadableStream({ + async start(controller) { + try { + const reader = text.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + controller.close(); + break; + } + + const processed = value.replace(/\b[A-Z]{2,}\b/g, (match) => { + // Known acronyms -> lowercase + if (KNOWN_ACRONYMS.has(match)) { + return match.toLowerCase(); + } + + // Has vowels -> lowercase + const hasVowel = match.split('').some((char) => vowels.has(char)); + if (hasVowel) { + return match.toLowerCase(); + } + + // Consonants only -> space-separated + return match.split('').join(' '); + }); + + controller.enqueue(processed); + } + } catch (error) { + controller.error(error); + } + }, + }); +}; + +/** + * Registry of all English-specific transforms + */ +export const englishTransforms = new Map([ + ['format_numbers', formatNumbers], + ['format_dollar_amounts', formatDollarAmounts], + ['format_percentages', formatPercentages], + ['format_distances', formatDistances], + ['format_units', formatUnits], + ['format_dates', formatDates], + ['format_acronyms', formatAcronyms], +]); From 74180802e7ce66a318381a75b49507276b25cc8d Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 17 Nov 2025 12:39:39 +0100 Subject: [PATCH 2/2] language fix --- .../voice/transcription/transforms.test.ts | 53 +++++++++-- agents/src/voice/transcription/transforms.ts | 93 ++++++++++++++----- 2 files changed, 116 insertions(+), 30 deletions(-) diff --git a/agents/src/voice/transcription/transforms.test.ts b/agents/src/voice/transcription/transforms.test.ts index 1d07baca..27770248 100644 --- a/agents/src/voice/transcription/transforms.test.ts +++ 
b/agents/src/voice/transcription/transforms.test.ts @@ -6,6 +6,7 @@ import { describe, expect, it } from 'vitest'; import { DEFAULT_TTS_TEXT_TRANSFORMS, applyTextTransforms, + getAllAvailableTransforms, getAvailableTransforms, } from './transforms.js'; @@ -57,11 +58,24 @@ describe('Text Transforms Core', () => { expect(transforms.has('format_euro_amounts')).toBe(true); }); + it('should list all available transforms across all languages', () => { + const transforms = getAllAvailableTransforms(); + // Language-agnostic transforms + expect(transforms.has('filter_markdown')).toBe(true); + expect(transforms.has('filter_emoji')).toBe(true); + // English transforms + expect(transforms.has('format_numbers')).toBe(true); + expect(transforms.has('format_dollar_amounts')).toBe(true); + // German transforms + expect(transforms.has('format_numbers_de')).toBe(true); + expect(transforms.has('format_euro_amounts')).toBe(true); + }); + it('should throw error for invalid transform name', async () => { const stream = stringToStream('test'); - await expect( - applyTextTransforms(stream, ['invalid_transform' as any], { language: 'en' }), - ).rejects.toThrow('Invalid transform'); + await expect(applyTextTransforms(stream, ['invalid_transform' as any])).rejects.toThrow( + 'Invalid transform', + ); }); it('should apply custom transform function', async () => { @@ -89,15 +103,38 @@ describe('Text Transforms Core', () => { it('should apply multiple transforms in sequence', async () => { const stream = stringToStream('**Price: $5** 🎉'); - const result = await applyTextTransforms( - stream, - ['filter_markdown', 'filter_emoji', 'format_dollar_amounts'], - { language: 'en' }, - ); + const result = await applyTextTransforms(stream, [ + 'filter_markdown', + 'filter_emoji', + 'format_dollar_amounts', + ]); const output = await streamToString(result); expect(output).toContain('Price:'); expect(output).toContain('five dollars'); expect(output).not.toContain('**'); expect(output).not.toContain('🎉'); 
}); + + it('should find transforms across all languages without specifying language', async () => { + // Test that English transform can be found without language config + const stream1 = stringToStream('$5'); + const result1 = await applyTextTransforms(stream1, ['format_dollar_amounts']); + const output1 = await streamToString(result1); + expect(output1).toBe('five dollars'); + + // Test that German transform can be found without language config + const stream2 = stringToStream('5€'); + const result2 = await applyTextTransforms(stream2, ['format_euro_amounts']); + const output2 = await streamToString(result2); + expect(output2).toBe('fünf Euro'); + + // Test that mixed language transforms can be used together + const stream3 = stringToStream('$5 and 5€'); + const result3 = await applyTextTransforms(stream3, [ + 'format_dollar_amounts', + 'format_euro_amounts', + ]); + const output3 = await streamToString(result3); + expect(output3).toBe('five dollars and fünf Euro'); + }); }); diff --git a/agents/src/voice/transcription/transforms.ts b/agents/src/voice/transcription/transforms.ts index 5166a3b1..0b765c2f 100644 --- a/agents/src/voice/transcription/transforms.ts +++ b/agents/src/voice/transcription/transforms.ts @@ -17,17 +17,6 @@ export type Language = 'en' | 'de' | string; */ export type TextTransform = (text: ReadableStream) => ReadableStream; -/** - * Configuration for language-specific transforms - */ -export interface LanguageTransformConfig { - /** - * The language code for language-specific transforms - * Defaults to 'en' (English) - */ - language?: Language; -} - /** * Built-in language-agnostic transform names */ @@ -135,17 +124,36 @@ export function getRecommendedTTSTransforms(language: Language = 'en'): BuiltInT /** * Apply a sequence of text transforms to a text stream * + * Transforms can be specified either as built-in transform names (strings) + * or as custom transform functions. 
Built-in transforms are looked up across + * all registered transforms (language-agnostic and all language-specific). + * * @param text - Input text stream * @param transforms - Array of transform names or custom transform functions - * @param config - Configuration for language-specific transforms * @returns Transformed text stream + * + * @example + * ```typescript + * // Use built-in transforms (names) + * const text = new ReadableStream(...); + * const transformed = await applyTextTransforms(text, [ + * 'filter_markdown', + * 'format_numbers', // English + * 'format_euro_amounts', // German + * ]); + * + * // Mix built-in and custom transforms + * const customTransform: TextTransform = (stream) => ...; + * const transformed = await applyTextTransforms(text, [ + * 'filter_markdown', + * customTransform, + * ]); + * ``` */ export async function applyTextTransforms( text: ReadableStream, transforms: readonly TextTransformSpec[], - config: LanguageTransformConfig = {}, ): Promise> { - const { language = 'en' } = config; let result = text; for (const transform of transforms) { @@ -153,12 +161,12 @@ export async function applyTextTransforms( // Custom transform function result = transform(result); } else { - // Built-in transform name - const transformFn = getBuiltInTransform(transform, language); + // Built-in transform name - search across all registries + const transformFn = getBuiltInTransform(transform); if (!transformFn) { throw new Error( `Invalid transform: ${transform}. ` + - `Available transforms: ${Array.from(getAvailableTransforms(language)).join(', ')}`, + `Available transforms: ${Array.from(getAllAvailableTransforms()).join(', ')}`, ); } result = transformFn(result); @@ -170,25 +178,66 @@ export async function applyTextTransforms( /** * Get a built-in transform function by name + * + * Searches across all registered transforms: + * 1. First checks language-agnostic transforms + * 2. 
Then searches through all language-specific transform registries + * + * @param name - The built-in transform name + * @returns The transform function, or null if not found */ -function getBuiltInTransform(name: BuiltInTransformName, language: Language): TextTransform | null { +function getBuiltInTransform(name: BuiltInTransformName): TextTransform | null { // Check language-agnostic transforms first const agnostic = languageAgnosticTransforms.get(name as LanguageAgnosticTransformName); if (agnostic) { return agnostic; } - // Check language-specific transforms - const langTransforms = languageSpecificTransforms.get(language); - if (langTransforms) { - return langTransforms.get(name) || null; + // Check all language-specific transform registries + for (const langTransforms of languageSpecificTransforms.values()) { + const transform = langTransforms.get(name); + if (transform) { + return transform; + } } return null; } +/** + * Get all available transform names across all languages + * + * Returns a set containing all registered transform names, including + * language-agnostic and all language-specific transforms. + * + * @returns Set of all available transform names + */ +export function getAllAvailableTransforms(): Set { + const available = new Set(); + + // Add language-agnostic transforms + for (const name of languageAgnosticTransforms.keys()) { + available.add(name); + } + + // Add all language-specific transforms + for (const langTransforms of languageSpecificTransforms.values()) { + for (const name of langTransforms.keys()) { + available.add(name); + } + } + + return available; +} + /** * Get all available transform names for a given language + * + * Returns a set containing language-agnostic transforms plus transforms + * specific to the requested language. 
+ * + * @param language - The language code (e.g., 'en', 'de') + * @returns Set of available transform names for the language */ export function getAvailableTransforms(language: Language = 'en'): Set { const available = new Set();