olegshulyakov · mpetruc · Oct 7, 2025
diff --git a/public/kokoro_openapi.json b/public/kokoro_openapi.json
diff --git a/src/components/ChatMessage.tsx b/src/components/ChatMessage.tsx
@@ -36,10 +36,12 @@ import ChatInputExtraContextItem from './ChatInputExtraContextItem';
 import { IntlIconButton } from './common';
 import { DropzoneArea } from './DropzoneArea';
 import MarkdownDisplay from './MarkdownDisplay';
-import TextToSpeech, {
-  getSpeechSynthesisVoiceByName,
-  IS_SPEECH_SYNTHESIS_SUPPORTED,
-} from './TextToSpeech';
+import TextToSpeech from '../components/TextToSpeech';
+import {
+  UnifiedVoice,
+  useAvailableVoices,
+} from '../components/useAvailableVoices';
+const IS_SPEECH_SYNTHESIS_SUPPORTED = 'speechSynthesis' in window;
 
 interface SplitMessage {
   content: PendingMessage['content'];
@@ -356,7 +358,6 @@ export default memo(function ChatMessage({
           {/* play message */}
           <PlayButton
             className="btn btn-ghost w-8 h-8 p-0"
-            disabled={!IS_SPEECH_SYNTHESIS_SUPPORTED || !content}
             text={content ?? ''}
           />
 
@@ -538,33 +539,57 @@ const ThinkingSection = memo(function ThinkingSection({
 
 interface PlayButtonProps {
   className?: string;
-  disabled?: boolean;
   text: string;
 }
 const PlayButton = memo(function PlayButton({
   className,
-  disabled,
   text,
 }: PlayButtonProps) {
   const { t } = useTranslation();
   const {
-    config: { ttsVoice, ttsPitch, ttsRate, ttsVolume },
+    config: {
+      ttsVoice,
+      ttsPitch,
+      ttsRate,
+      ttsVolume,
+      ttsServerIp,
+      ttsServerPort,
+    },
   } = useAppContext();
+  const { voices } = useAvailableVoices(ttsServerIp, ttsServerPort);
+  const selectedVoice = useMemo(
+    () => voices.find((v: UnifiedVoice) => v.id === ttsVoice) || null,
+    [voices, ttsVoice]
+  );
+
+  // Without text or a resolved voice play() is a no-op, so grey the button
+  // out; Kokoro voices additionally need a configured server address.
+  const isDisabled = useMemo(() => {
+    if (!text || !selectedVoice) return true;
+    if (selectedVoice.type === 'kokoro') return !ttsServerIp || !ttsServerPort;
+    return !IS_SPEECH_SYNTHESIS_SUPPORTED;
+  }, [text, selectedVoice, ttsServerIp, ttsServerPort]);
+
   return (
     <TextToSpeech
       text={text}
-      voice={getSpeechSynthesisVoiceByName(ttsVoice)}
+      selectedVoice={selectedVoice}
       pitch={ttsPitch}
       rate={ttsRate}
       volume={ttsVolume}
+      serverConfig={
+        ttsServerIp && ttsServerPort
+          ? { serverIp: ttsServerIp, serverPort: ttsServerPort }
+          : undefined
+      }
     >
       {({ isPlaying, play, stop }) => (
         <Fragment>
           {!isPlaying && (
             <IntlIconButton
               className={className}
               onClick={play}
-              disabled={disabled}
+              disabled={isDisabled}
               t={t}
               titleKey="chatScreen.titles.play"
               ariaLabelKey="chatScreen.ariaLabels.playMessage"
@@ -575,7 +600,7 @@ const PlayButton = memo(function PlayButton({
             <IntlIconButton
               className={className}
               onClick={stop}
-              disabled={disabled}
+              disabled={isDisabled}
               t={t}
               titleKey="chatScreen.titles.stop"
               ariaLabelKey="chatScreen.ariaLabels.stopMessage"

diff --git a/src/components/TextToSpeech.tsx b/src/components/TextToSpeech.tsx
@@ -1,63 +1,24 @@
+// TextToSpeech.tsx
+
 import {
   forwardRef,
   Fragment,
   ReactNode,
   useCallback,
   useEffect,
   useImperativeHandle,
   useRef,
   useState,
 } from 'react';
-
-// Define language popularity order (you can customize this)
-const popularLanguages = [
-  'en',
-  'zh',
-  'hi',
-  'es',
-  'fr',
-  'ru',
-  'pt',
-  'de',
-  'ja',
-  'ko',
-  'it',
-  'ar',
-];
-
-export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!window.speechSynthesis;
-export const getSpeechSynthesisVoices = () =>
-  speechSynthesis
-    ?.getVoices()
-    .filter((voice) => voice.localService)
-    .sort((a, b) => {
-      // Default voice first
-      if (a.default !== b.default) return a.default ? -1 : 1;
-
-      // Popular languages on top
-      const aRank = popularLanguages.indexOf(a.lang.substring(0, 2));
-      const bRank = popularLanguages.indexOf(b.lang.substring(0, 2));
-      if (aRank !== bRank) {
-        const aEffectiveRank = aRank === -1 ? Infinity : aRank;
-        const bEffectiveRank = bRank === -1 ? Infinity : bRank;
-        return aEffectiveRank - bEffectiveRank;
-      }
-
-      // Sort by language and name (alphabetically)
-      return a.lang.localeCompare(b.lang) || a.name.localeCompare(b.name);
-    }) || [];
-export function getSpeechSynthesisVoiceByName(name: string) {
-  return getSpeechSynthesisVoices().find(
-    (voice) => `${voice.name} (${voice.lang})` === name
-  );
-}
+import { UnifiedVoice } from './useAvailableVoices';
 
 interface TextToSpeechProps {
   text: string;
-  voice?: SpeechSynthesisVoice;
+  selectedVoice: UnifiedVoice | null;
   pitch?: number;
   rate?: number;
   volume?: number;
+  serverConfig?: TTSServerConfig;
 }
 
 interface TextToSpeechState {
@@ -66,117 +27,191 @@ interface TextToSpeechState {
   stop: () => void;
 }
 
+interface TTSServerConfig {
+  serverIp: string;
+  serverPort: string;
+}
+
+/**
+ * Speaks `text` with `selectedVoice`: Kokoro voices are synthesized by the
+ * configured server and played through an audio element, browser voices go
+ * through the Web Speech API. `isPlaying` flips on once playback has started.
+ */
 const useTextToSpeech = ({
   text,
-  voice = getSpeechSynthesisVoices()[0],
+  selectedVoice,
   pitch = 1,
   rate = 1,
   volume = 1,
-}: TextToSpeechProps) => {
+  serverConfig,
+}: TextToSpeechProps): TextToSpeechState => {
   const [isPlaying, setIsPlaying] = useState(false);
+  // Live audio element and the object URL backing it (Kokoro voices).
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const audioUrlRef = useRef<string | null>(null);
+  // Utterance this hook handed to the browser speech queue, if any.
   const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);
+  // One controller per play() call: aborting it cancels the Kokoro request
+  // and turns callbacks of a stopped or superseded playback into no-ops.
+  const sessionRef = useRef<AbortController | null>(null);
 
-  useEffect(() => {
-    if (!IS_SPEECH_SYNTHESIS_SUPPORTED) {
-      console.warn('Speech synthesis not supported');
-      return;
-    }
-    if (!text) {
-      console.warn('No text provided');
-      return;
-    }
-
-    // Clean up previous utterance
-    if (utteranceRef.current) {
-      utteranceRef.current.onstart = null;
-      utteranceRef.current.onend = null;
-      utteranceRef.current.onerror = null;
-    }
-
-    const utterance = new window.SpeechSynthesisUtterance(text);
-
-    utterance.voice = voice;
-    utterance.pitch = pitch;
-    utterance.rate = rate;
-    utterance.volume = volume;
-
-    // Event handlers
-    utterance.onstart = () => {
-      setIsPlaying(true);
-    };
-
-    utterance.onend = () => {
-      setIsPlaying(false);
-    };
-
-    utterance.onerror = (event) => {
-      console.error('Speech synthesis error: ', event.error);
-      setIsPlaying(false);
-    };
-
-    utteranceRef.current = utterance;
-
-    return () => {
-      speechSynthesis.cancel();
-      if (utteranceRef.current === utterance) {
-        utteranceRef.current.onstart = null;
-        utteranceRef.current.onend = null;
-        utteranceRef.current.onerror = null;
-        utteranceRef.current = null;
-      }
-    };
-  }, [pitch, rate, text, voice, volume]);
-
-  const play = useCallback(() => {
-    if (!IS_SPEECH_SYNTHESIS_SUPPORTED) {
-      console.warn('Speech synthesis not supported');
-      return;
-    }
-    speechSynthesis.cancel();
-
-    if (utteranceRef.current) {
-      try {
-        speechSynthesis.speak(utteranceRef.current);
-      } catch (error) {
-        console.error('Speech synthesis error:', error);
-        setIsPlaying(false);
-      }
-    }
-  }, []);
-
-  const stop = useCallback(() => {
-    speechSynthesis.cancel();
-    setIsPlaying(false);
-  }, []);
-
-  return {
-    isPlaying,
-    play,
-    stop,
-  };
+  /**
+   * Halts whatever this hook started: the pending Kokoro request, the audio
+   * element (releasing its object URL) and the browser utterance.
+   */
+  const stop = useCallback(() => {
+    // Invalidate the session first so late callbacks become no-ops.
+    sessionRef.current?.abort();
+    sessionRef.current = null;
+
+    // Only flush the shared speech queue if this hook put something on it.
+    if (utteranceRef.current) {
+      utteranceRef.current = null;
+      speechSynthesis.cancel();
+    }
+
+    if (audioRef.current) {
+      audioRef.current.pause();
+      audioRef.current = null;
+    }
+    if (audioUrlRef.current) {
+      URL.revokeObjectURL(audioUrlRef.current);
+      audioUrlRef.current = null;
+    }
+
+    setIsPlaying(false);
+  }, []);
+
+  // Do not keep talking after the owning component has unmounted.
+  useEffect(() => stop, [stop]);
+
+  /** Starts speaking `text`, replacing any playback this hook already owns. */
+  const play = useCallback(async () => {
+    if (!selectedVoice) return;
+
+    stop();
+    const session = new AbortController();
+    sessionRef.current = session;
+    const isCurrent = () => !session.signal.aborted;
+
+    if (selectedVoice.type === 'kokoro') {
+      if (!serverConfig?.serverIp || !serverConfig?.serverPort) {
+        console.error('TTS server configuration is missing');
+        return;
+      }
+
+      try {
+        const response = await fetch(
+          `http://${serverConfig.serverIp}:${serverConfig.serverPort}/v1/audio/speech`,
+          {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+              model: 'kokoro',
+              input: text,
+              voice: selectedVoice.raw,
+              response_format: 'mp3',
+              speed: rate,
+            }),
+            signal: session.signal,
+          }
+        );
+        // An error payload is not audio; fail here rather than at decode time.
+        if (!response.ok) {
+          throw new Error(`Kokoro TTS request failed: HTTP ${response.status}`);
+        }
+
+        const blob = await response.blob();
+        // stop() may have run while the response was downloading.
+        if (!isCurrent()) return;
+
+        const url = URL.createObjectURL(blob);
+        audioUrlRef.current = url;
+
+        const audio = new Audio(url);
+        // HTMLMediaElement.volume throws for values outside [0, 1].
+        audio.volume = Math.min(1, Math.max(0, volume));
+        audioRef.current = audio;
+
+        // Ending and failing both release the element and its object URL.
+        const release = () => {
+          if (isCurrent()) stop();
+        };
+        audio.onplay = () => {
+          if (isCurrent()) setIsPlaying(true);
+        };
+        audio.onended = release;
+        audio.onerror = release;
+        await audio.play();
+      } catch (err) {
+        // A rejection caused by stop() is expected, not a failure.
+        if (!isCurrent()) return;
+        console.error('Kokoro TTS failed:', err);
+        stop();
+      }
+    } else if (selectedVoice.type === 'browser') {
+      if (!('speechSynthesis' in window)) {
+        console.warn('Speech synthesis not supported');
+        return;
+      }
+      // Flush utterances queued elsewhere so this one starts immediately.
+      speechSynthesis.cancel();
+
+      const utterance = new SpeechSynthesisUtterance(text);
+      utterance.voice = selectedVoice.raw;
+      utterance.pitch = pitch;
+      utterance.rate = rate;
+      utterance.volume = volume;
+
+      const finish = () => {
+        if (!isCurrent()) return;
+        utteranceRef.current = null;
+        setIsPlaying(false);
+      };
+      utterance.onstart = () => {
+        if (isCurrent()) setIsPlaying(true);
+      };
+      utterance.onend = finish;
+      utterance.onerror = (event) => {
+        // 'canceled' / 'interrupted' only mean another playback took over.
+        if (event.error !== 'canceled' && event.error !== 'interrupted') {
+          console.error('Speech synthesis error: ', event.error);
+        }
+        finish();
+      };
+
+      utteranceRef.current = utterance;
+      speechSynthesis.speak(utterance);
+    }
+  }, [
+    text,
+    selectedVoice,
+    pitch,
+    rate,
+    volume,
+    serverConfig?.serverIp,
+    serverConfig?.serverPort,
+    stop,
+  ]);
+
+  return { isPlaying, play, stop };
 };
 
+/**
+ * Render-prop wrapper around `useTextToSpeech`; the same playback controls
+ * are also exposed to parents through the forwarded ref.
+ */
 const TextToSpeech = forwardRef<
   TextToSpeechState,
   TextToSpeechProps & { children: (props: TextToSpeechState) => ReactNode }
->(({ children, text, voice, pitch, rate, volume }, ref) => {
-  const { isPlaying, play, stop } = useTextToSpeech({
-    text,
-    voice,
-    pitch,
-    rate,
-    volume,
-  });
-
-  useImperativeHandle(
-    ref,
-    () => ({
-      isPlaying,
-      play,
-      stop,
-    }),
-    [isPlaying, play, stop]
-  );
+>(({ children, ...props }, ref) => {
+  const { isPlaying, play, stop } = useTextToSpeech(props);
+
+  useImperativeHandle(ref, () => ({ isPlaying, play, stop }), [
+    isPlaying,
+    play,
+    stop,
+  ]);
 
   return <Fragment>{children({ isPlaying, play, stop })}</Fragment>;
 });