From b6c2251c144f31f5cdfa99e6f407c2b1f73ede12 Mon Sep 17 00:00:00 2001
From: Oleg Shulyakov <olegshulyakov@users.noreply.github.com>
Date: Fri, 17 Oct 2025 01:16:12 +0300
Subject: [PATCH 1/2] feat(audio): add text-to-speech support to OpenAI
 provider

- Implement VoiceProvider and TextToSpeechProvider interfaces
- Add getVoices() method to fetch available speech voices
- Add postSpeech() method to generate audio from text
- Include caching for voice list with 5-minute expiration
- Add TTS configuration options to provider settings
- Create audio types for speech parameters and voice definitions
---
 src/api/providers/BaseOpenAIProvider.ts | 104 ++++++++++++++++++++++--
 src/config/config-default.json          |   4 +
 src/types/audio.ts                      |  40 +++++++++
 src/types/configuration.ts              |   7 ++
 src/types/provider.ts                   |  43 ++++++++++
 5 files changed, 189 insertions(+), 9 deletions(-)
 create mode 100644 src/types/audio.ts

diff --git a/src/api/providers/BaseOpenAIProvider.ts b/src/api/providers/BaseOpenAIProvider.ts
index 07c32ac..41b4a60 100644
--- a/src/api/providers/BaseOpenAIProvider.ts
+++ b/src/api/providers/BaseOpenAIProvider.ts
@@ -6,7 +6,10 @@ import {
   LLMProvider,
   ModelProvider,
   SSEChatCompletionMessage,
+  TextToSpeechProvider,
+  VoiceProvider,
 } from '../../types';
+import { SpeechCreateParams, SpeechVoice } from '../../types/audio';
 import { normalizeUrl } from '../../utils';
 import { noResponse, processSSEStream } from '../utils';
 
@@ -36,7 +39,12 @@ import { noResponse, processSSEStream } from '../utils';
  * ```
  */
 export class BaseOpenAIProvider
-  implements LLMProvider, ModelProvider, ChatCompletionProvider
+  implements
+    LLMProvider,
+    ModelProvider,
+    ChatCompletionProvider,
+    VoiceProvider,
+    TextToSpeechProvider
 {
   /**
    * The base URL of the OpenAI-compatible API endpoint.
@@ -61,7 +69,20 @@ export class BaseOpenAIProvider
    * Used to determine cache expiration (5 minutes).
    * @internal
    */
-  protected lastUpdated: number;
+  protected lastUpdatedModels: number;
+
+  /**
+   * Cached list of available voices fetched from the API.
+   * @internal
+   */
+  private voices: SpeechVoice[] = [];
+
+  /**
+   * Timestamp of the last successful voice list fetch.
+   * Used to determine cache expiration (5 minutes).
+   * @internal
+   */
+  protected lastUpdatedVoices: number;
 
   /**
    * Constructs a new BaseOpenAIProvider instance.
@@ -76,7 +97,8 @@ export class BaseOpenAIProvider
     if (!baseUrl) throw new Error(`Base URL is not specified`);
     this.baseUrl = baseUrl;
     this.apiKey = apiKey;
-    this.lastUpdated = Date.now();
+    this.lastUpdatedModels = 0;
+    this.lastUpdatedVoices = 0;
   }
 
   /**
@@ -113,7 +135,7 @@ export class BaseOpenAIProvider
   async getModels(): Promise<InferenceApiModel[]> {
     if (isDev) console.debug('v1Models', this.models);
 
-    if (this.models.length > 0 && !this.isExpired()) {
+    if (this.models.length > 0 && !this.isExpired(this.lastUpdatedModels)) {
       return this.models;
     }
 
@@ -134,7 +156,7 @@ export class BaseOpenAIProvider
     const json = await fetchResponse.json();
     this.models = this.jsonToModels(json.data);
 
-    if (this.models.length > 0) this.lastUpdated = Date.now();
+    if (this.models.length > 0) this.lastUpdatedModels = Date.now();
 
     return this.models;
   }
@@ -220,6 +242,70 @@ export class BaseOpenAIProvider
     return processSSEStream<SSEChatCompletionMessage>(fetchResponse);
   }
 
+  /**
+   * Retrieves the list of available speech voices from the provider.
+   * @returns {Promise<SpeechVoice[]>} A promise that resolves to an array of voice definitions.
+   */
+  async getVoices(): Promise<SpeechVoice[]> {
+    if (isDev) console.debug('v1Voices', this.voices);
+
+    if (this.voices.length > 0 && !this.isExpired(this.lastUpdatedVoices)) {
+      return this.voices;
+    }
+
+    let fetchResponse = noResponse;
+    try {
+      fetchResponse = await fetch(
+        normalizeUrl('v1/audio/voices', this.getBaseUrl()),
+        {
+          method: 'GET',
+          headers: this.getHeaders(),
+          signal: AbortSignal.timeout(1000),
+        }
+      );
+    } catch {
+      // Network/timeout errors are swallowed here; fetchResponse stays `noResponse` and is passed to isErrorResponse below
+    }
+    await this.isErrorResponse(fetchResponse);
+    const json = await fetchResponse.json();
+    this.voices = json.data && Array.isArray(json.data) ? [...json.data] : [];
+
+    if (this.voices.length > 0) this.lastUpdatedVoices = Date.now();
+
+    return this.voices;
+  }
+
+  /**
+   * Generates audio from text using the specified parameters.
+   * @param {SpeechCreateParams} params - Configuration for speech synthesis (text, voice, speed, etc.).
+   * @param {AbortSignal} abortSignal - Signal to cancel the request if needed.
+   * @returns {Promise<Blob>} The generated audio data as a Blob (e.g., MP3 or WAV).
+   */
+  async postSpeech(
+    params: SpeechCreateParams,
+    abortSignal: AbortSignal
+  ): Promise<Blob> {
+    if (isDev) console.debug('v1Speech', params);
+
+    let fetchResponse = noResponse;
+    try {
+      fetchResponse = await fetch(
+        normalizeUrl('v1/audio/speech', this.getBaseUrl()),
+        {
+          method: 'POST',
+          headers: this.getHeaders(),
+          body: JSON.stringify(params),
+          signal: abortSignal,
+        }
+      );
+    } catch {
+      // Network/abort errors are swallowed here; fetchResponse stays `noResponse` and is passed to isErrorResponse below
+    }
+
+    await this.isErrorResponse(fetchResponse);
+    return fetchResponse.blob();
+  }
+
   /**
    * Generates HTTP headers for API requests, including authentication.
    *
@@ -317,8 +403,8 @@ export class BaseOpenAIProvider
    *
    * @protected
    */
-  protected isExpired(): boolean {
-    return Date.now() - this.lastUpdated > 5 * 60 * 1000;
+  protected isExpired(lastUpdated: number): boolean {
+    return Date.now() - lastUpdated > 5 * 60 * 1000;
   }
 
   /**
@@ -441,7 +527,7 @@ export class SelfHostedOpenAIProvider extends BaseOpenAIProvider {
    * @inheritdoc
    */
   protected isExpired(): boolean {
-    return Date.now() - this.lastUpdated > 60 * 1000;
+    return Date.now() - this.lastUpdatedModels > 60 * 1000;
   }
 }
 
@@ -467,7 +553,7 @@ export class CloudOpenAIProvider extends BaseOpenAIProvider {
    * @inheritdoc
    */
   protected isExpired(): boolean {
-    return Date.now() - this.lastUpdated > 15 * 60 * 1000;
+    return Date.now() - this.lastUpdatedModels > 15 * 60 * 1000;
   }
 
   /** @inheritdoc */
diff --git a/src/config/config-default.json b/src/config/config-default.json
index 74c4eeb..179187c 100644
--- a/src/config/config-default.json
+++ b/src/config/config-default.json
@@ -4,6 +4,10 @@
   "apiKey": "",
   "model": "",
   "systemMessage": "",
+  "ttsProvider": "",
+  "ttsBaseUrl": "",
+  "ttsApiKey": "",
+  "ttsModel": "",
   "initials": "You",
   "showRawUserMessage": false,
   "showRawAssistantMessage": false,
diff --git a/src/types/audio.ts b/src/types/audio.ts
new file mode 100644
index 0000000..ce09f7e
--- /dev/null
+++ b/src/types/audio.ts
@@ -0,0 +1,40 @@
+export type SpeechModel = string;
+export type SpeechVoice = string;
+
+export interface SpeechCreateParams {
+  /**
+   * The text to generate audio for. The maximum length is 4096 characters.
+   */
+  input: string;
+
+  /**
+   * One of the available [TTS models](https://platform.openai.com/docs/models#tts).
+   */
+  model: SpeechModel;
+
+  /**
+   * The voice to use when generating the audio. Previews of the voices are available in the
+   * [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
+   */
+  voice: SpeechVoice;
+
+  /**
+   * Control the voice of your generated audio with additional instructions.
+   */
+  instructions?: string;
+
+  /**
+   * The format to return the audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
+   */
+  response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm';
+
+  /**
+   * The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.
+   */
+  speed?: number;
+
+  /**
+   * The format to stream the audio in. Supported formats are `sse` and `audio`.
+   */
+  stream_format?: 'sse' | 'audio';
+}
diff --git a/src/types/configuration.ts b/src/types/configuration.ts
index c3901ca..b83361e 100644
--- a/src/types/configuration.ts
+++ b/src/types/configuration.ts
@@ -15,12 +15,19 @@ export type ProviderOption = {
 };
 
 export interface Configuration {
+  /* text inference */
   provider: string;
   baseUrl: string;
   apiKey: string;
   model: string;
   systemMessage: string;
 
+  /* text to speech */
+  ttsProvider: string;
+  ttsBaseUrl: string;
+  ttsApiKey: string;
+  ttsModel: string;
+
   /* ui */
   initials: string;
   showRawUserMessage: boolean;
diff --git a/src/types/provider.ts b/src/types/provider.ts
index f51e5c3..08c4e87 100644
--- a/src/types/provider.ts
+++ b/src/types/provider.ts
@@ -1,3 +1,4 @@
+import { SpeechCreateParams, SpeechVoice } from './audio';
 import { InferenceApiMessage, InferenceApiModel } from './inference';
 
 /**
@@ -197,3 +198,45 @@ export interface InferenceProvider
   extends LLMProvider,
     ModelProvider,
     ChatCompletionProvider {}
+
+/**
+ * Interface defining the contract for retrieving available text-to-speech voices.
+ * @interface
+ * @method getVoices - Fetches the list of available speech voices asynchronously.
+ * @returns {Promise<SpeechVoice[]>} A promise resolving to an array of voice definitions.
+ */
+export interface VoiceProvider {
+  /**
+   * Retrieves the list of available speech voices from the provider.
+   * @returns {Promise<SpeechVoice[]>} A promise that resolves to an array of voice definitions.
+   */
+  getVoices(): Promise<SpeechVoice[]>;
+}
+
+/**
+ * Interface defining the contract for text-to-speech synthesis.
+ * Combines authentication, voice discovery, and audio generation capabilities.
+ * @interface
+ * @extends {LLMProvider}
+ * @extends {ModelProvider}
+ * @extends {VoiceProvider}
+ * @method postSpeech - Generates audio from text using the specified parameters.
+ * @param {SpeechCreateParams} params - Parameters for speech generation (text, voice, speed, etc.).
+ * @param {AbortSignal} abortSignal - Signal to cancel the audio generation request.
+ * @returns {Promise<Blob>} A promise resolving to the generated audio as a Blob.
+ */
+export interface TextToSpeechProvider
+  extends LLMProvider,
+    ModelProvider,
+    VoiceProvider {
+  /**
+   * Generates audio from text using the specified parameters.
+   * @param {SpeechCreateParams} params - Configuration for speech synthesis (text, voice, speed, etc.).
+   * @param {AbortSignal} abortSignal - Signal to cancel the request if needed.
+   * @returns {Promise<Blob>} The generated audio data as a Blob (e.g., MP3 or WAV).
+   */
+  postSpeech(
+    params: SpeechCreateParams,
+    abortSignal: AbortSignal
+  ): Promise<Blob>;
+}

From c06f26f53deb1d7a45cb4bda3364bb34296c1cec Mon Sep 17 00:00:00 2001
From: Oleg Shulyakov <olegshulyakov@users.noreply.github.com>
Date: Fri, 17 Oct 2025 02:32:26 +0300
Subject: [PATCH 2/2] feat(settings): add voice manager with TTS mode selection
 and browser support

- Introduce new VoiceManager component for text-to-speech configuration
- Add ttsMode setting with 'browser' and 'provider' options
- Move TTS settings from main settings to new voice manager component
- Add test speech functionality with customizable message
---
 src/config/config-default.json                |  11 +-
 src/i18n/en.json                              |  24 +-
 .../Settings/components/VoiceManager.tsx      | 230 ++++++++++++++++++
 src/pages/Settings/components/index.tsx       |   1 +
 src/pages/Settings/index.tsx                  |  97 +-------
 src/types/configuration.ts                    |  13 +-
 src/types/settings.ts                         |   3 +-
 7 files changed, 277 insertions(+), 102 deletions(-)
 create mode 100644 src/pages/Settings/components/VoiceManager.tsx

diff --git a/src/config/config-default.json b/src/config/config-default.json
index 179187c..09f99c0 100644
--- a/src/config/config-default.json
+++ b/src/config/config-default.json
@@ -4,10 +4,6 @@
   "apiKey": "",
   "model": "",
   "systemMessage": "",
-  "ttsProvider": "",
-  "ttsBaseUrl": "",
-  "ttsApiKey": "",
-  "ttsModel": "",
   "initials": "You",
   "showRawUserMessage": false,
   "showRawAssistantMessage": false,
@@ -40,8 +36,13 @@
   "dry_penalty_last_n": -1,
   "custom": "",
   "pyIntepreterEnabled": false,
+  "ttsMode": "browser",
   "ttsVoice": "",
   "ttsPitch": 1,
   "ttsRate": 1,
-  "ttsVolume": 1
+  "ttsVolume": 1,
+  "ttsProvider": "",
+  "ttsBaseUrl": "",
+  "ttsApiKey": "",
+  "ttsModel": ""
 }
diff --git a/src/i18n/en.json b/src/i18n/en.json
index f62023f..68347d0 100644
--- a/src/i18n/en.json
+++ b/src/i18n/en.json
@@ -374,10 +374,26 @@
       }
     },
     "textToSpeech": {
-      "check": {
-        "label": "Check",
-        "text": "This is a demo of Web Speech Synthesis."
-      }
+      "mode": {
+        "browser": "Browser",
+        "provider": "Voice Provider"
+      },
+      "buttons": {
+        "play": "Play"
+      },
+      "title": {
+        "play": "Play"
+      },
+      "ariaLabels": {
+        "play": "Play test message"
+      },
+      "modeLabel": "Mode",
+      "modeNote": "Choose between browser-based and model-based text-to-speech.",
+      "testSpeech": "Test Speech",
+      "browserNotSupported": "Your browser does not support text-to-speech.",
+      "testText": "This is a demo of Web Speech Synthesis.",
+      "testTextPlaceholder": "Enter text to test speech...",
+      "modelTtsNotImplemented": "Model-based TTS is not implemented for this provider. This would require a dedicated TTS service or model capability."
     },
     "presetManager": {
       "newPreset": "Save the current settings as a preset",
diff --git a/src/pages/Settings/components/VoiceManager.tsx b/src/pages/Settings/components/VoiceManager.tsx
new file mode 100644
index 0000000..e414370
--- /dev/null
+++ b/src/pages/Settings/components/VoiceManager.tsx
@@ -0,0 +1,230 @@
+import { useState } from 'react';
+import { Trans, useTranslation } from 'react-i18next';
+import { LuSpeech, LuVolume2 } from 'react-icons/lu';
+import { Button, Dropdown, Icon, Label } from '../../../components';
+import {
+  IS_SPEECH_SYNTHESIS_SUPPORTED,
+  getSpeechSynthesisVoiceByName,
+  getSpeechSynthesisVoices,
+} from '../../../hooks/useTextToSpeech';
+import { Configuration, ConfigurationKey } from '../../../types';
+import { SettingInputType } from '../../../types/settings';
+import {
+  SettingsModalDropdown,
+  SettingsModalRangeInput,
+  SettingsSectionLabel,
+} from './';
+
+interface VoiceManagerProps {
+  config: Configuration;
+  handleChange: (
+    key: ConfigurationKey
+  ) => (value: string | number | boolean) => void;
+}
+
+export function VoiceManager({ config, handleChange }: VoiceManagerProps) {
+  const { t } = useTranslation();
+
+  const [testMessage, setTestMessage] = useState(
+    t('settings.textToSpeech.testText')
+  );
+
+  const availableVoices = IS_SPEECH_SYNTHESIS_SUPPORTED
+    ? getSpeechSynthesisVoices().map((voice) => ({
+        value: `${voice.name} (${voice.lang})`,
+        label: `${voice.name} (${voice.lang})`,
+      }))
+    : [];
+
+  const handleTestMessageChange = (
+    e: React.ChangeEvent<HTMLTextAreaElement>
+  ) => {
+    setTestMessage(e.target.value);
+  };
+
+  const playBrowserTts = () => {
+    if (!IS_SPEECH_SYNTHESIS_SUPPORTED) return;
+
+    const utterance = new SpeechSynthesisUtterance(testMessage);
+    const voice = getSpeechSynthesisVoiceByName(config.ttsVoice);
+
+    if (voice) {
+      utterance.voice = voice;
+    }
+    utterance.pitch = config.ttsPitch;
+    utterance.rate = config.ttsRate;
+    utterance.volume = config.ttsVolume;
+
+    speechSynthesis.speak(utterance);
+  };
+
+  const playModelTts = async () => {
+    try {
+      // Provider playback only applies when the TTS mode is set to 'provider'
+      if (config.ttsMode !== 'provider') {
+        console.warn('Provider TTS is not enabled');
+        return;
+      }
+
+      alert(t('settings.textToSpeech.modelTtsNotImplemented'));
+    } catch (error) {
+      console.error('Error with provider TTS:', error);
+    }
+  };
+
+  // Determine which play function to use based on the selected TTS mode
+  const handlePlayTest = () => {
+    if (config.ttsMode === 'provider') {
+      playModelTts();
+    } else {
+      playBrowserTts();
+    }
+  };
+
+  return (
+    <>
+      <SettingsSectionLabel>
+        <Icon size="sm" variant="leftside">
+          <LuSpeech />
+        </Icon>
+        <Trans i18nKey="settings.sections.textToSpeech" />
+      </SettingsSectionLabel>
+
+      {/* TTS Mode Settings */}
+      <div className="form-control flex flex-col justify-center mb-3">
+        <div className="font-bold mb-1 md:hidden">
+          <Trans i18nKey="settings.textToSpeech.modeLabel" />
+        </div>
+        <Label variant="input-bordered" className="mb-1">
+          <div className="font-bold hidden md:block">
+            <Trans i18nKey="settings.textToSpeech.modeLabel" />
+          </div>
+
+          <Dropdown
+            className="grow"
+            entity="ttsMode"
+            options={[
+              {
+                value: 'browser',
+                label: t('settings.textToSpeech.mode.browser'),
+              },
+              {
+                value: 'provider',
+                label: t('settings.textToSpeech.mode.provider'),
+              },
+            ]}
+            currentValue={
+              <span>
+                <Trans
+                  i18nKey={`settings.textToSpeech.mode.${config.ttsMode}`}
+                />
+              </span>
+            }
+            renderOption={(option) => <span>{option.label}</span>}
+            isSelected={(option) => config.ttsMode === option.value}
+            onSelect={(option) => handleChange('ttsMode')(option.value)}
+          />
+        </Label>
+        <div className="text-xs opacity-75 max-w-80">
+          <Trans i18nKey="settings.textToSpeech.modeNote" />
+        </div>
+      </div>
+
+      {/* Browser TTS Settings */}
+      {config.ttsMode === 'browser' && (
+        <>
+          {IS_SPEECH_SYNTHESIS_SUPPORTED ? (
+            <>
+              <SettingsModalDropdown
+                field={{
+                  type: SettingInputType.DROPDOWN,
+                  key: 'ttsVoice',
+                  translateKey: 'ttsVoice',
+                }}
+                options={availableVoices}
+                filterable={true}
+                value={config.ttsVoice}
+                onChange={handleChange('ttsVoice')}
+              />
+
+              <SettingsModalRangeInput
+                field={{
+                  type: SettingInputType.RANGE_INPUT,
+                  key: 'ttsPitch',
+                  translateKey: 'ttsPitch',
+                }}
+                value={config.ttsPitch}
+                min={0}
+                max={2}
+                step={0.5}
+                onChange={handleChange('ttsPitch')}
+              />
+
+              <SettingsModalRangeInput
+                field={{
+                  type: SettingInputType.RANGE_INPUT,
+                  key: 'ttsRate',
+                  translateKey: 'ttsRate',
+                }}
+                value={config.ttsRate}
+                min={0.5}
+                max={2}
+                step={0.5}
+                onChange={handleChange('ttsRate')}
+              />
+
+              <SettingsModalRangeInput
+                field={{
+                  type: SettingInputType.RANGE_INPUT,
+                  key: 'ttsVolume',
+                  translateKey: 'ttsVolume',
+                }}
+                value={config.ttsVolume}
+                min={0}
+                max={1}
+                step={0.25}
+                onChange={handleChange('ttsVolume')}
+              />
+            </>
+          ) : (
+            <div className="text-sm text-error mb-3">
+              <Trans i18nKey="settings.textToSpeech.browserNotSupported" />
+            </div>
+          )}
+        </>
+      )}
+
+      {/* Provider TTS Settings */}
+      {config.ttsMode === 'provider' && <></>}
+
+      {/* Test Section */}
+      <SettingsSectionLabel>
+        <Trans i18nKey="settings.textToSpeech.testSpeech" />
+      </SettingsSectionLabel>
+
+      <Label variant="form-control" className="max-w-80 mb-3">
+        <textarea
+          value={testMessage}
+          onChange={handleTestMessageChange}
+          className="textarea textarea-bordered w-full max-w-80 h-24"
+          placeholder={t('settings.textToSpeech.testTextPlaceholder')}
+        />
+      </Label>
+
+      <Button
+        onClick={handlePlayTest}
+        disabled={
+          (config.ttsMode === 'browser' && !IS_SPEECH_SYNTHESIS_SUPPORTED) ||
+          (config.ttsMode === 'provider' && false)
+        }
+        title={t('settings.textToSpeech.title.play')}
+        aria-label={t('settings.textToSpeech.ariaLabels.play')}
+      >
+        <Icon size="sm" variant="leftside">
+          <LuVolume2 />
+        </Icon>
+        <Trans i18nKey="settings.textToSpeech.buttons.play" />
+      </Button>
+    </>
+  );
+}
diff --git a/src/pages/Settings/components/index.tsx b/src/pages/Settings/components/index.tsx
index 9dd43f2..9b3fe6b 100644
--- a/src/pages/Settings/components/index.tsx
+++ b/src/pages/Settings/components/index.tsx
@@ -2,3 +2,4 @@ export * from './common';
 export * from './ImportExportComponent';
 export * from './PresetManager';
 export * from './ThemeController';
+export * from './VoiceManager';
diff --git a/src/pages/Settings/index.tsx b/src/pages/Settings/index.tsx
index ae2eb73..80a75f4 100644
--- a/src/pages/Settings/index.tsx
+++ b/src/pages/Settings/index.tsx
@@ -23,19 +23,11 @@ import {
   LuRefreshCw,
   LuRocket,
   LuSettings,
-  LuSpeech,
-  LuVolume2,
-  LuVolumeX,
 } from 'react-icons/lu';
 import { useNavigate } from 'react-router';
 import { Button, Dropdown, Icon } from '../../components';
 import { CONFIG_DEFAULT, INFERENCE_PROVIDERS } from '../../config';
 import { useDebouncedCallback } from '../../hooks/useDebouncedCallback';
-import TextToSpeech, {
-  getSpeechSynthesisVoiceByName,
-  getSpeechSynthesisVoices,
-  IS_SPEECH_SYNTHESIS_SUPPORTED,
-} from '../../hooks/useTextToSpeech';
 import { SUPPORTED_LANGUAGES } from '../../i18n';
 import { useAppContext } from '../../store/app';
 import { useChatContext } from '../../store/chat';
@@ -71,6 +63,7 @@ import {
   SettingsModalShortInput,
   SettingsSectionLabel,
   ThemeController,
+  VoiceManager,
 } from './components';
 
 // --- Constants ---
@@ -224,86 +217,10 @@ function getSettingTabsConfiguration(
         </>
       ),
       fields: [
-        /* Text to Speech */
-        toSection(
-          t('settings.sections.textToSpeech'),
-          <Icon size="sm" variant="leftside">
-            <LuSpeech />
-          </Icon>
-        ),
-        toDropdown(
-          'ttsVoice',
-          !IS_SPEECH_SYNTHESIS_SUPPORTED
-            ? []
-            : getSpeechSynthesisVoices().map((voice) => ({
-                value: `${voice.name} (${voice.lang})`,
-                label: `${voice.name} (${voice.lang})`,
-              })),
-          true
-        ),
-        toInput(
-          SettingInputType.RANGE_INPUT,
-          'ttsPitch',
-          !IS_SPEECH_SYNTHESIS_SUPPORTED,
-          {
-            min: 0,
-            max: 2,
-            step: 0.5,
-          }
-        ),
-        toInput(
-          SettingInputType.RANGE_INPUT,
-          'ttsRate',
-          !IS_SPEECH_SYNTHESIS_SUPPORTED,
-          {
-            min: 0.5,
-            max: 2,
-            step: 0.5,
-          }
-        ),
-        toInput(
-          SettingInputType.RANGE_INPUT,
-          'ttsVolume',
-          !IS_SPEECH_SYNTHESIS_SUPPORTED,
-          {
-            min: 0,
-            max: 1,
-            step: 0.25,
-          }
-        ),
         {
           type: SettingInputType.CUSTOM,
-          key: 'custom', // dummy key, won't be used
-          component: () => (
-            <TextToSpeech
-              text={t('settings.textToSpeech.check.text')}
-              voice={getSpeechSynthesisVoiceByName(config.ttsVoice)}
-              pitch={config.ttsPitch}
-              rate={config.ttsRate}
-              volume={config.ttsVolume}
-            >
-              {({ isPlaying, play, stop }) => (
-                <Button
-                  onClick={() => (!isPlaying ? play() : stop())}
-                  disabled={!IS_SPEECH_SYNTHESIS_SUPPORTED}
-                  title="Play test message"
-                  aria-label="Play test message"
-                >
-                  {!isPlaying && (
-                    <Icon size="sm" variant="leftside">
-                      <LuVolume2 />
-                    </Icon>
-                  )}
-                  {isPlaying && (
-                    <Icon size="sm" variant="leftside">
-                      <LuVolumeX />
-                    </Icon>
-                  )}
-                  {t('settings.textToSpeech.check.label')}
-                </Button>
-              )}
-            </TextToSpeech>
-          ),
+          key: 'voice-manager',
+          component: () => null,
         },
       ],
     },
@@ -721,6 +638,14 @@ export default function Settings() {
             );
           case 'theme-manager':
             return <ThemeController key={key} />;
+          case 'voice-manager':
+            return (
+              <VoiceManager
+                key={key}
+                config={localConfig}
+                handleChange={onChange}
+              />
+            );
           case 'fetch-models':
             return (
               <Button
diff --git a/src/types/configuration.ts b/src/types/configuration.ts
index b83361e..d5029f9 100644
--- a/src/types/configuration.ts
+++ b/src/types/configuration.ts
@@ -14,6 +14,8 @@ export type ProviderOption = {
   allowCustomBaseUrl: boolean;
 };
 
+export type TtsMode = 'browser' | 'provider';
+
 export interface Configuration {
   /* text inference */
   provider: string;
@@ -22,12 +24,6 @@ export interface Configuration {
   model: string;
   systemMessage: string;
 
-  /* text to speech */
-  ttsProvider: string;
-  ttsBaseUrl: string;
-  ttsApiKey: string;
-  ttsModel: string;
-
   /* ui */
   initials: string;
   showRawUserMessage: boolean;
@@ -79,6 +75,11 @@ export interface Configuration {
   ttsPitch: number;
   ttsRate: number;
   ttsVolume: number;
+  ttsMode: string;
+  ttsProvider: string /* TODO placeholder */;
+  ttsBaseUrl: string;
+  ttsApiKey: string;
+  ttsModel: string;
 }
 export type ConfigurationKey = keyof Configuration;
 
diff --git a/src/types/settings.ts b/src/types/settings.ts
index 9cf6047..27d140f 100644
--- a/src/types/settings.ts
+++ b/src/types/settings.ts
@@ -36,7 +36,8 @@ export interface SettingFieldCustom {
     | 'import-export'
     | 'preset-manager'
     | 'fetch-models'
-    | 'theme-manager';
+    | 'theme-manager'
+    | 'voice-manager';
   component:
     | string
     | React.FC<{