From b6c2251c144f31f5cdfa99e6f407c2b1f73ede12 Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Fri, 17 Oct 2025 01:16:12 +0300 Subject: [PATCH 1/2] feat(audio): add text-to-speech support to OpenAI provider - Implement VoiceProvider and TextToSpeechProvider interfaces - Add getVoices() method to fetch available speech voices - Add postSpeech() method to generate audio from text - Include caching for voice list with 5-minute expiration - Add TTS configuration options to provider settings - Create audio types for speech parameters and voice definitions --- src/api/providers/BaseOpenAIProvider.ts | 104 ++++++++++++++++++++++-- src/config/config-default.json | 4 + src/types/audio.ts | 40 +++++++++ src/types/configuration.ts | 7 ++ src/types/provider.ts | 43 ++++++++++ 5 files changed, 189 insertions(+), 9 deletions(-) create mode 100644 src/types/audio.ts diff --git a/src/api/providers/BaseOpenAIProvider.ts b/src/api/providers/BaseOpenAIProvider.ts index 07c32ac..41b4a60 100644 --- a/src/api/providers/BaseOpenAIProvider.ts +++ b/src/api/providers/BaseOpenAIProvider.ts @@ -6,7 +6,10 @@ import { LLMProvider, ModelProvider, SSEChatCompletionMessage, + TextToSpeechProvider, + VoiceProvider, } from '../../types'; +import { SpeechCreateParams, SpeechVoice } from '../../types/audio'; import { normalizeUrl } from '../../utils'; import { noResponse, processSSEStream } from '../utils'; @@ -36,7 +39,12 @@ import { noResponse, processSSEStream } from '../utils'; * ``` */ export class BaseOpenAIProvider - implements LLMProvider, ModelProvider, ChatCompletionProvider + implements + LLMProvider, + ModelProvider, + ChatCompletionProvider, + VoiceProvider, + TextToSpeechProvider { /** * The base URL of the OpenAI-compatible API endpoint. @@ -61,7 +69,20 @@ export class BaseOpenAIProvider * Used to determine cache expiration (5 minutes). * @internal */ - protected lastUpdated: number; + protected lastUpdatedModels: number; + + /** + * Cached list of available voices fetched from the API. + * @internal + */ + private voices: SpeechVoice[] = []; + + /** + * Timestamp of the last successful voice list fetch. + * Used to determine cache expiration (5 minutes). + * @internal + */ + protected lastUpdatedVoices: number; /** * Constructs a new BaseOpenAIProvider instance. @@ -76,7 +97,8 @@ export class BaseOpenAIProvider if (!baseUrl) throw new Error(`Base URL is not specified`); this.baseUrl = baseUrl; this.apiKey = apiKey; - this.lastUpdated = Date.now(); + this.lastUpdatedModels = 0; + this.lastUpdatedVoices = 0; } /** @@ -113,7 +135,7 @@ export class BaseOpenAIProvider async getModels(): Promise { if (isDev) console.debug('v1Models', this.models); - if (this.models.length > 0 && !this.isExpired()) { + if (this.models.length > 0 && !this.isExpired(this.lastUpdatedModels)) { return this.models; } @@ -134,7 +156,7 @@ export class BaseOpenAIProvider const json = await fetchResponse.json(); this.models = this.jsonToModels(json.data); - if (this.models.length > 0) this.lastUpdated = Date.now(); + if (this.models.length > 0) this.lastUpdatedModels = Date.now(); return this.models; } @@ -220,6 +242,70 @@ export class BaseOpenAIProvider return processSSEStream(fetchResponse); } + /** + * Retrieves the list of available speech voices from the provider. + * @returns {Promise} A promise that resolves to an array of voice definitions. + */ + async getVoices(): Promise { + if (isDev) console.debug('v1Voices', this.voices); + + if (this.voices.length > 0 && !this.isExpired(this.lastUpdatedVoices)) { + return this.voices; + } + + let fetchResponse = noResponse; + try { + fetchResponse = await fetch( + normalizeUrl('v1/audio/voices', this.getBaseUrl()), + { + method: 'GET', + headers: this.getHeaders(), + signal: AbortSignal.timeout(1000), + } + ); + } catch { + // Silently ignore network/timeout errors; will be caught in isErrorResponse + } + await this.isErrorResponse(fetchResponse); + const json = await fetchResponse.json(); + this.voices = json.data && Array.isArray(json.data) ? [...json.data] : []; + + if (this.voices.length > 0) this.lastUpdatedVoices = Date.now(); + + return this.voices; + } + + /** + * Generates audio from text using the specified parameters. + * @param {SpeechCreateParams} params - Configuration for speech synthesis (text, voice, speed, etc.). + * @param {AbortSignal} abortSignal - Signal to cancel the request if needed. + * @returns {Promise} The generated audio data as a Blob (e.g., MP3 or WAV). + */ + async postSpeech( + params: SpeechCreateParams, + abortSignal: AbortSignal + ): Promise { + if (isDev) console.debug('v1Speech', params); + + let fetchResponse = noResponse; + try { + fetchResponse = await fetch( + normalizeUrl('v1/audio/speech', this.getBaseUrl()), + { + method: 'POST', + headers: this.getHeaders(), + body: JSON.stringify(params), + signal: abortSignal, + } + ); + } catch { + // Silently ignore network/timeout errors; will be caught in isErrorResponse + } + + await this.isErrorResponse(fetchResponse); + return fetchResponse.blob(); + } + /** * Generates HTTP headers for API requests, including authentication. * @@ -317,8 +403,8 @@ export class BaseOpenAIProvider * * @protected */ - protected isExpired(): boolean { - return Date.now() - this.lastUpdated > 5 * 60 * 1000; + protected isExpired(lastUpdated: number): boolean { + return Date.now() - lastUpdated > 5 * 60 * 1000; } /** @@ -441,7 +527,7 @@ export class SelfHostedOpenAIProvider extends BaseOpenAIProvider { * @inheritdoc */ protected isExpired(): boolean { - return Date.now() - this.lastUpdated > 60 * 1000; + return Date.now() - this.lastUpdatedModels > 60 * 1000; } } @@ -467,7 +553,7 @@ export class CloudOpenAIProvider extends BaseOpenAIProvider { * @inheritdoc */ protected isExpired(): boolean { - return Date.now() - this.lastUpdated > 15 * 60 * 1000; + return Date.now() - this.lastUpdatedModels > 15 * 60 * 1000; } /** @inheritdoc */ diff --git a/src/config/config-default.json b/src/config/config-default.json index 74c4eeb..179187c 100644 --- a/src/config/config-default.json +++ b/src/config/config-default.json @@ -4,6 +4,10 @@ "apiKey": "", "model": "", "systemMessage": "", + "ttsProvider": "", + "ttsBaseUrl": "", + "ttsApiKey": "", + "ttsModel": "", "initials": "You", "showRawUserMessage": false, "showRawAssistantMessage": false, diff --git a/src/types/audio.ts b/src/types/audio.ts new file mode 100644 index 0000000..ce09f7e --- /dev/null +++ b/src/types/audio.ts @@ -0,0 +1,40 @@ +export type SpeechModel = string; +export type SpeechVoice = string; + +export interface SpeechCreateParams { + /** + * The text to generate audio for. The maximum length is 4096 characters. + */ + input: string; + + /** + * One of the available [TTS models](https://platform.openai.com/docs/models#tts). + */ + model: SpeechModel; + + /** + * The voice to use when generating the audio. Previews of the voices are available in the + * [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options). + */ + voice: SpeechVoice; + + /** + * Control the voice of your generated audio with additional instructions. + */ + instructions?: string; + + /** + * The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. + */ + response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm'; + + /** + * The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. + */ + speed?: number; + + /** + * The format to stream the audio in. Supported formats are `sse` and `audio`. + */ + stream_format?: 'sse' | 'audio'; +} diff --git a/src/types/configuration.ts b/src/types/configuration.ts index c3901ca..b83361e 100644 --- a/src/types/configuration.ts +++ b/src/types/configuration.ts @@ -15,12 +15,19 @@ export type ProviderOption = { }; export interface Configuration { + /* text inference */ provider: string; baseUrl: string; apiKey: string; model: string; systemMessage: string; + /* text to speech */ + ttsProvider: string; + ttsBaseUrl: string; + ttsApiKey: string; + ttsModel: string; + /* ui */ initials: string; showRawUserMessage: boolean; diff --git a/src/types/provider.ts b/src/types/provider.ts index f51e5c3..08c4e87 100644 --- a/src/types/provider.ts +++ b/src/types/provider.ts @@ -1,3 +1,4 @@ +import { SpeechCreateParams, SpeechVoice } from './audio'; import { InferenceApiMessage, InferenceApiModel } from './inference'; /** @@ -197,3 +198,45 @@ export interface InferenceProvider extends LLMProvider, ModelProvider, ChatCompletionProvider {} + +/** + * Interface defining the contract for retrieving available text-to-speech voices. + * @interface + * @method getVoices - Fetches the list of available speech voices asynchronously. + * @returns {Promise} A promise resolving to an array of voice definitions. + */ +export interface VoiceProvider { + /** + * Retrieves the list of available speech voices from the provider. + * @returns {Promise} A promise that resolves to an array of voice definitions. + */ + getVoices(): Promise; +} + +/** + * Interface defining the contract for text-to-speech synthesis. + * Combines authentication, voice discovery, and audio generation capabilities. + * @interface + * @extends {LLMProvider} + * @extends {ModelProvider} + * @extends {VoiceProvider} + * @method postSpeech - Generates audio from text using the specified parameters. + * @param {SpeechCreateParams} params - Parameters for speech generation (text, voice, speed, etc.). + * @param {AbortSignal} abortSignal - Signal to cancel the audio generation request. + * @returns {Promise} A promise resolving to the generated audio as a Blob. + */ +export interface TextToSpeechProvider + extends LLMProvider, + ModelProvider, + VoiceProvider { + /** + * Generates audio from text using the specified parameters. + * @param {SpeechCreateParams} params - Configuration for speech synthesis (text, voice, speed, etc.). + * @param {AbortSignal} abortSignal - Signal to cancel the request if needed. + * @returns {Promise} The generated audio data as a Blob (e.g., MP3 or WAV). + */ + postSpeech( + params: SpeechCreateParams, + abortSignal: AbortSignal + ): Promise; +} From c06f26f53deb1d7a45cb4bda3364bb34296c1cec Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Fri, 17 Oct 2025 02:32:26 +0300 Subject: [PATCH 2/2] feat(settings): add voice manager with TTS mode selection and browser support - Introduce new VoiceManager component for text-to-speech configuration - Add ttsMode setting with 'browser' and 'provider' options - Move TTS settings from main settings to new voice manager component - Add test speech functionality with customizable message --- src/config/config-default.json | 11 +- src/i18n/en.json | 24 +- .../Settings/components/VoiceManager.tsx | 230 ++++++++++++++++++ src/pages/Settings/components/index.tsx | 1 + src/pages/Settings/index.tsx | 97 +------- src/types/configuration.ts | 13 +- src/types/settings.ts | 3 +- 7 files changed, 277 insertions(+), 102 deletions(-) create mode 100644 src/pages/Settings/components/VoiceManager.tsx diff --git a/src/config/config-default.json b/src/config/config-default.json index 179187c..09f99c0 100644 --- a/src/config/config-default.json +++ b/src/config/config-default.json @@ -4,10 +4,6 @@ "apiKey": "", "model": "", "systemMessage": "", - "ttsProvider": "", - "ttsBaseUrl": "", - "ttsApiKey": "", - "ttsModel": "", "initials": "You", "showRawUserMessage": false, "showRawAssistantMessage": false, @@ -40,8 +36,13 @@ "dry_penalty_last_n": -1, "custom": "", "pyIntepreterEnabled": false, + "ttsMode": "browser", "ttsVoice": "", "ttsPitch": 1, "ttsRate": 1, - "ttsVolume": 1 + "ttsVolume": 1, + "ttsProvider": "", + "ttsBaseUrl": "", + "ttsApiKey": "", + "ttsModel": "" } diff --git a/src/i18n/en.json b/src/i18n/en.json index f62023f..68347d0 100644 --- a/src/i18n/en.json +++ b/src/i18n/en.json @@ -374,10 +374,26 @@ } }, "textToSpeech": { - "check": { - "label": "Check", - "text": "This is a demo of Web Speech Synthesis." - } + "mode": { + "browser": "Browser", + "provider": "Voice Provider" + }, + "buttons": { + "play": "Play" + }, + "title": { + "play": "Play" + }, + "ariaLabels": { + "play": "Play test message" + }, + "modeLabel": "Mode", + "modeNote": "Choose between browser-based or model-based text-to-speech.", + "testSpeech": "Test Speech", + "browserNotSupported": "Your browser does not support text-to-speech.", + "testText": "This is a demo of Web Speech Synthesis.", + "testTextPlaceholder": "Enter text to test speech...", + "modelTtsNotImplemented": "Model-based TTS is not implemented for this provider. This would require a dedicated TTS service or model capability." }, "presetManager": { "newPreset": "Save the current settings as a preset", diff --git a/src/pages/Settings/components/VoiceManager.tsx b/src/pages/Settings/components/VoiceManager.tsx new file mode 100644 index 0000000..e414370 --- /dev/null +++ b/src/pages/Settings/components/VoiceManager.tsx @@ -0,0 +1,230 @@ +import { useState } from 'react'; +import { Trans, useTranslation } from 'react-i18next'; +import { LuSpeech, LuVolume2 } from 'react-icons/lu'; +import { Button, Dropdown, Icon, Label } from '../../../components'; +import { + IS_SPEECH_SYNTHESIS_SUPPORTED, + getSpeechSynthesisVoiceByName, + getSpeechSynthesisVoices, +} from '../../../hooks/useTextToSpeech'; +import { Configuration, ConfigurationKey } from '../../../types'; +import { SettingInputType } from '../../../types/settings'; +import { + SettingsModalDropdown, + SettingsModalRangeInput, + SettingsSectionLabel, +} from './'; + +interface VoiceManagerProps { + config: Configuration; + handleChange: ( + key: ConfigurationKey + ) => (value: string | number | boolean) => void; +} + +export function VoiceManager({ config, handleChange }: VoiceManagerProps) { + const { t } = useTranslation(); + + const [testMessage, setTestMessage] = useState( + t('settings.textToSpeech.testText') + ); + + const availableVoices = IS_SPEECH_SYNTHESIS_SUPPORTED + ? getSpeechSynthesisVoices().map((voice) => ({ + value: `${voice.name} (${voice.lang})`, + label: `${voice.name} (${voice.lang})`, + })) + : []; + + const handleTestMessageChange = ( + e: React.ChangeEvent + ) => { + setTestMessage(e.target.value); + }; + + const playBrowserTts = () => { + if (!IS_SPEECH_SYNTHESIS_SUPPORTED) return; + + const utterance = new SpeechSynthesisUtterance(testMessage); + const voice = getSpeechSynthesisVoiceByName(config.ttsVoice); + + if (voice) { + utterance.voice = voice; + } + utterance.pitch = config.ttsPitch; + utterance.rate = config.ttsRate; + utterance.volume = config.ttsVolume; + + speechSynthesis.speak(utterance); + }; + + const playModelTts = async () => { + try { + // Check if we have the necessary configuration for provider TTS + if (config.ttsMode !== 'provider') { + console.warn('Provider TTS is not enabled'); + return; + } + + alert(t('settings.textToSpeech.modelTtsNotImplemented')); + } catch (error) { + console.error('Error with provider TTS:', error); + } + }; + + // Determine which play function to use based on provider + const handlePlayTest = () => { + if (config.ttsMode === 'provider') { + playModelTts(); + } else { + playBrowserTts(); + } + }; + + return ( + <> + + + + + + + + {/* TTS Mode Settings */} +
+
+ +
+ +
+ +
+
+ + {/* Browser TTS Settings */} + {config.ttsMode === 'browser' && ( + <> + {IS_SPEECH_SYNTHESIS_SUPPORTED ? ( + <> + + + + + + + + + ) : ( +
+ +
+ )} + + )} + + {/* Provider TTS Settings */} + {config.ttsMode === 'provider' && <>} + + {/* Test Section */} + + + + +