Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
838 changes: 838 additions & 0 deletions public/kokoro_openapi.json

Large diffs are not rendered by default.

47 changes: 36 additions & 11 deletions src/components/ChatMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ import ChatInputExtraContextItem from './ChatInputExtraContextItem';
import { IntlIconButton } from './common';
import { DropzoneArea } from './DropzoneArea';
import MarkdownDisplay from './MarkdownDisplay';
import TextToSpeech, {
getSpeechSynthesisVoiceByName,
IS_SPEECH_SYNTHESIS_SUPPORTED,
} from './TextToSpeech';
import TextToSpeech from '../components/TextToSpeech';
import {
UnifiedVoice,
useAvailableVoices,
} from '../components/useAvailableVoices';
const IS_SPEECH_SYNTHESIS_SUPPORTED = 'speechSynthesis' in window;

interface SplitMessage {
content: PendingMessage['content'];
Expand Down Expand Up @@ -356,7 +358,6 @@ export default memo(function ChatMessage({
{/* play message */}
<PlayButton
className="btn btn-ghost w-8 h-8 p-0"
disabled={!IS_SPEECH_SYNTHESIS_SUPPORTED || !content}
text={content ?? ''}
/>

Expand Down Expand Up @@ -538,33 +539,57 @@ const ThinkingSection = memo(function ThinkingSection({

interface PlayButtonProps {
className?: string;
disabled?: boolean;
text: string;
}
const PlayButton = memo(function PlayButton({
className,
disabled,
text,
}: PlayButtonProps) {
const { t } = useTranslation();
const {
config: { ttsVoice, ttsPitch, ttsRate, ttsVolume },
config: {
ttsVoice,
ttsPitch,
ttsRate,
ttsVolume,
ttsServerIp,
ttsServerPort,
},
} = useAppContext();
const { voices } = useAvailableVoices(ttsServerIp, ttsServerPort);
const selectedVoice = useMemo(
() => voices.find((v: UnifiedVoice) => v.id === ttsVoice) || null,
[voices, ttsVoice]
);

const isDisabled = useMemo(() => {
if (!text) return true;
if (selectedVoice?.type === 'kokoro') {
return !ttsServerIp || !ttsServerPort;
}
return !IS_SPEECH_SYNTHESIS_SUPPORTED;
}, [text, selectedVoice?.type, ttsServerIp, ttsServerPort]);

return (
<TextToSpeech
text={text}
voice={getSpeechSynthesisVoiceByName(ttsVoice)}
selectedVoice={selectedVoice}
pitch={ttsPitch}
rate={ttsRate}
volume={ttsVolume}
serverConfig={
ttsServerIp && ttsServerPort
? { serverIp: ttsServerIp, serverPort: ttsServerPort }
: undefined
}
>
{({ isPlaying, play, stop }) => (
<Fragment>
{!isPlaying && (
<IntlIconButton
className={className}
onClick={play}
disabled={disabled}
disabled={isDisabled}
t={t}
titleKey="chatScreen.titles.play"
ariaLabelKey="chatScreen.ariaLabels.playMessage"
Expand All @@ -575,7 +600,7 @@ const PlayButton = memo(function PlayButton({
<IntlIconButton
className={className}
onClick={stop}
disabled={disabled}
disabled={isDisabled}
t={t}
titleKey="chatScreen.titles.stop"
ariaLabelKey="chatScreen.ariaLabels.stopMessage"
Expand Down
231 changes: 97 additions & 134 deletions src/components/TextToSpeech.tsx
Original file line number Diff line number Diff line change
@@ -1,63 +1,24 @@
// TextToSpeech.tsx

import {
forwardRef,
Fragment,
ReactNode,
useCallback,
useEffect,
// useEffect,
useImperativeHandle,
useRef,
useState,
} from 'react';

// Define language popularity order (you can customize this)
const popularLanguages = [
'en',
'zh',
'hi',
'es',
'fr',
'ru',
'pt',
'de',
'ja',
'ko',
'it',
'ar',
];

export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!window.speechSynthesis;
export const getSpeechSynthesisVoices = () =>
speechSynthesis
?.getVoices()
.filter((voice) => voice.localService)
.sort((a, b) => {
// Default voice first
if (a.default !== b.default) return a.default ? -1 : 1;

// Popular languages on top
const aRank = popularLanguages.indexOf(a.lang.substring(0, 2));
const bRank = popularLanguages.indexOf(b.lang.substring(0, 2));
if (aRank !== bRank) {
const aEffectiveRank = aRank === -1 ? Infinity : aRank;
const bEffectiveRank = bRank === -1 ? Infinity : bRank;
return aEffectiveRank - bEffectiveRank;
}

// Sort by language and name (alphabetically)
return a.lang.localeCompare(b.lang) || a.name.localeCompare(b.name);
}) || [];
export function getSpeechSynthesisVoiceByName(name: string) {
return getSpeechSynthesisVoices().find(
(voice) => `${voice.name} (${voice.lang})` === name
);
}
import { UnifiedVoice } from './useAvailableVoices';

interface TextToSpeechProps {
text: string;
voice?: SpeechSynthesisVoice;
selectedVoice: UnifiedVoice | null;
pitch?: number;
rate?: number;
volume?: number;
serverConfig?: TTSServerConfig;
}

interface TextToSpeechState {
Expand All @@ -66,117 +27,119 @@ interface TextToSpeechState {
stop: () => void;
}

interface TTSServerConfig {
serverIp: string;
serverPort: string;
}

const useTextToSpeech = ({
text,
voice = getSpeechSynthesisVoices()[0],
selectedVoice,
pitch = 1,
rate = 1,
volume = 1,
}: TextToSpeechProps) => {
serverConfig,
}: TextToSpeechProps & {
serverConfig?: TTSServerConfig;
}): TextToSpeechState => {
const [isPlaying, setIsPlaying] = useState(false);
const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);

useEffect(() => {
if (!IS_SPEECH_SYNTHESIS_SUPPORTED) {
console.warn('Speech synthesis not supported');
return;
}
if (!text) {
console.warn('No text provided');
return;
}
const audioRef = useRef<HTMLAudioElement | null>(null);

// Clean up previous utterance
if (utteranceRef.current) {
utteranceRef.current.onstart = null;
utteranceRef.current.onend = null;
utteranceRef.current.onerror = null;
}

const utterance = new window.SpeechSynthesisUtterance(text);

utterance.voice = voice;
utterance.pitch = pitch;
utterance.rate = rate;
utterance.volume = volume;

// Event handlers
utterance.onstart = () => {
setIsPlaying(true);
};

utterance.onend = () => {
setIsPlaying(false);
};
const play = useCallback(async () => {
if (!selectedVoice) return;

utterance.onerror = (event) => {
console.error('Speech synthesis error: ', event.error);
setIsPlaying(false);
};

utteranceRef.current = utterance;

return () => {
speechSynthesis.cancel();
if (utteranceRef.current === utterance) {
utteranceRef.current.onstart = null;
utteranceRef.current.onend = null;
utteranceRef.current.onerror = null;
utteranceRef.current = null;
if (selectedVoice.type === 'kokoro') {
if (!serverConfig?.serverIp || !serverConfig?.serverPort) {
console.error('TTS server configuration is missing');
return;
}
};
}, [pitch, rate, text, voice, volume]);

const play = useCallback(() => {
if (!IS_SPEECH_SYNTHESIS_SUPPORTED) {
console.warn('Speech synthesis not supported');
return;
}
speechSynthesis.cancel();

if (utteranceRef.current) {
try {
speechSynthesis.speak(utteranceRef.current);
} catch (error) {
console.error('Speech synthesis error:', error);
setIsPlaying(false);
const response = await fetch(
`http://${serverConfig.serverIp}:${serverConfig.serverPort}/v1/audio/speech`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: 'kokoro',
input: text,
voice: selectedVoice.raw, // ✅ string
response_format: 'mp3',
speed: rate,
}),
}
);

const blob = await response.blob();
const url = URL.createObjectURL(blob);

const audio = new Audio(url);
audio.volume = volume;
audioRef.current = audio;

audio.onplay = () => setIsPlaying(true);
audio.onended = () => {
setIsPlaying(false);
URL.revokeObjectURL(url);
};
await audio.play();
} catch (err) {
console.error('Kokoro TTS failed:', err);
}
} else if (selectedVoice.type === 'browser') {
const utterance = new SpeechSynthesisUtterance(text);
utterance.voice = selectedVoice.raw; // ✅ SpeechSynthesisVoice
utterance.pitch = pitch;
utterance.rate = rate;
utterance.volume = volume;

utterance.onstart = () => setIsPlaying(true);
utterance.onend = () => setIsPlaying(false);
utterance.onerror = () => setIsPlaying(false);

speechSynthesis.speak(utterance);
}
}, []);
}, [
text,
selectedVoice,
pitch,
rate,
volume,
serverConfig?.serverIp,
serverConfig?.serverPort,
]);

const stop = useCallback(() => {
speechSynthesis.cancel();
setIsPlaying(false);
}, []);

return {
isPlaying,
play,
stop,
};
// Stop browser speech synthesis if active
if (selectedVoice?.type === 'browser') {
speechSynthesis.cancel();
}

// Stop and cleanup Kokoro audio if active
if (audioRef.current) {
audioRef.current.pause();
audioRef.current.currentTime = 0;
audioRef.current.src = '';
audioRef.current = null;
}
}, [selectedVoice?.type]);

return { isPlaying, play, stop };
};

const TextToSpeech = forwardRef<
TextToSpeechState,
TextToSpeechProps & { children: (props: TextToSpeechState) => ReactNode }
>(({ children, text, voice, pitch, rate, volume }, ref) => {
const { isPlaying, play, stop } = useTextToSpeech({
text,
voice,
pitch,
rate,
volume,
});

useImperativeHandle(
ref,
() => ({
isPlaying,
play,
stop,
}),
[isPlaying, play, stop]
);
>(({ children, ...props }, ref) => {
const { isPlaying, play, stop } = useTextToSpeech(props);

useImperativeHandle(ref, () => ({ isPlaying, play, stop }), [
isPlaying,
play,
stop,
]);

return <Fragment>{children({ isPlaying, play, stop })}</Fragment>;
});
Expand Down
Loading