From 68a023cf3c83da972445c77de06f9cb34a6c96cc Mon Sep 17 00:00:00 2001 From: Su Wei Date: Fri, 7 Jul 2023 08:42:54 +0800 Subject: [PATCH 1/2] add AWS Polly,Transcribe --- package.json | 6 + src/config.ts | 10 ++ src/constant.ts | 2 + src/hooks/useGlobalSetting.ts | 7 +- src/hooks/useSpeechService.ts | 176 +++++++++++++++++++- src/pages/Home/components/Content.vue | 117 ++++++++----- src/pages/Setting/components/TTSSetting.vue | 37 +++- 7 files changed, 296 insertions(+), 59 deletions(-) diff --git a/package.json b/package.json index 5763612..1a9a433 100644 --- a/package.json +++ b/package.json @@ -34,13 +34,19 @@ "release:version": "npx standard-version && git push origin --follow-tags" }, "dependencies": { + "@aws-sdk/client-cognito-identity": "^3.363.0", + "@aws-sdk/client-polly": "^3.363.0", + "@aws-sdk/client-transcribe-streaming": "^3.363.0", + "@aws-sdk/credential-provider-cognito-identity": "^3.363.0", "@iconify-json/svg-spinners": "^1.1.1", "@vueuse/core": "^9.13.0", "api2d": "^0.1.18", + "aws-sdk": "^2.1409.0", "dexie": "^3.2.3", "electron-updater": "^5.3.0", "element-plus": "^2.3.3", "eventsource-parser": "^0.1.0", + "microphone-stream": "^6.0.1", "microsoft-cognitiveservices-speech-sdk": "^1.26.0", "pinia": "^2.0.33", "pinia-plugin-persistedstate": "^3.1.0", diff --git a/src/config.ts b/src/config.ts index 0a9483d..6fd9aa6 100644 --- a/src/config.ts +++ b/src/config.ts @@ -190,6 +190,16 @@ export const supportLanguageMap = { 'zh-TW': '中文(台湾普通话)', } as Record +export const awsRegions = [ + 'us-east-1', + 'us-east-2', + 'us-west-1', + 'us-west-2', + 'ap-east-1', + 'ap-southeast-1', + 'eu-central-1', +] + export const azureRegions = [ 'australiaeast', 'australiasoutheast', diff --git a/src/constant.ts b/src/constant.ts index 952630a..d29e8f7 100644 --- a/src/constant.ts +++ b/src/constant.ts @@ -2,6 +2,8 @@ export const OPEN_KEY = 'openKey' export const OPEN_PROXY = 'openProxy' export const AZURE_REGION = 'azureRegion' export const AZURE_KEY = 'azureKey' 
+export const AWS_REGION = 'awsRegion' +export const AWS_COGNITO_IDENTITY_POOL_ID = 'awsCognitoIdentityPoolId' export const AZURE_TRANSLATE_KEY = 'azureTranslateKey' export const VOICE_API_NAME = 'voiceApiName' export const IS_ALWAYS_RECOGNITION = 'isAlwaysRecognition' diff --git a/src/hooks/useGlobalSetting.ts b/src/hooks/useGlobalSetting.ts index e78b687..375772a 100644 --- a/src/hooks/useGlobalSetting.ts +++ b/src/hooks/useGlobalSetting.ts @@ -1,4 +1,4 @@ -import { AUTO_PLAY, AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant' +import { AUTO_PLAY, AWS_COGNITO_IDENTITY_POOL_ID, AWS_REGION, AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant' import { getAvatarUrl } from '@/utils' @@ -7,6 +7,9 @@ export const useGlobalSetting = () => { const openProxy = useLocalStorage(OPEN_PROXY, '') const azureRegion = useLocalStorage(AZURE_REGION, 'eastasia') const azureKey = useLocalStorage(AZURE_KEY, '') + const awsRegion = useLocalStorage(AWS_REGION, 'us-east-1') + const awsCognitoIdentityId = useLocalStorage(AWS_COGNITO_IDENTITY_POOL_ID, '') // holds the Cognito identity *pool* ID (stored under AWS_COGNITO_IDENTITY_POOL_ID) + const openModel = useLocalStorage(OPEN_MODEL, 'gpt-3.5-turbo') const selfAvatar = useLocalStorage(SELF_AVATAR_URL, getAvatarUrl('self.png')) const chatApiName = useLocalStorage(CHAT_API_NAME, 'openAI') @@ -22,6 +25,8 @@ export const useGlobalSetting = () => { openKey, openProxy, openModel, + awsRegion, + awsCognitoIdentityId, azureRegion, azureKey, selfAvatar, diff --git a/src/hooks/useSpeechService.ts b/src/hooks/useSpeechService.ts index 1af5a26..f1a36c5 100644 --- a/src/hooks/useSpeechService.ts +++ b/src/hooks/useSpeechService.ts @@ -8,6 +8,16 @@ import { SpeechSynthesizer, } 
from 'microsoft-cognitiveservices-speech-sdk' +import MicrophoneStream from 'microphone-stream'; +import { CognitoIdentityClient } from "@aws-sdk/client-cognito-identity"; +import {fromCognitoIdentityPool} from "@aws-sdk/credential-provider-cognito-identity"; +import { Polly,SynthesizeSpeechInput,DescribeVoicesCommand } from "@aws-sdk/client-polly"; +import { + TranscribeStreamingClient, + StartStreamTranscriptionCommand, +} from '@aws-sdk/client-transcribe-streaming'; + + const defaultAzureRegion = import.meta.env.VITE_REGION const defaultAzureKey = import.meta.env.VITE_SCRIPTION_KEY const accessPassword = import.meta.env.VITE_TTS_ACCESS_PASSWORD @@ -17,8 +27,13 @@ interface Config { isFetchAllVoice?: boolean } export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'zh-CN', 'zh-HK', 'ko-KR', 'de-DE'], isFetchAllVoice = true }: Config = {}) => { - const { azureKey, azureRegion, ttsPassword } = useGlobalSetting() + const { azureKey, azureRegion, ttsPassword,voiceApiName } = useGlobalSetting() + const { awsCognitoIdentityId, awsRegion, } = useGlobalSetting() + + if(voiceApiName.value==="AWS"){ + isFetchAllVoice=false; + } const resultAzureKey = computed(() => { if (!azureKey.value) { if (accessPassword !== ttsPassword.value) @@ -58,6 +73,7 @@ export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'z const audioBlob = ref(new Blob()) const allVoices = ref([]) + const allAWSVoices = ref([]) const recognizer = ref(new SpeechRecognizer(speechConfig.value)) const synthesizer = ref(new SpeechSynthesizer(speechConfig.value)) @@ -74,8 +90,28 @@ export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'z immediate: true, }) - // 语音识别 + // Initialize the AWS Polly and Transcribe SDK clients + const audioAWS = new Audio(); + let micStream: MicrophoneStream | undefined = undefined + const polly = new Polly({ + region: awsRegion.value ?? 
"us-east-1", + credentials: fromCognitoIdentityPool({ + client: new CognitoIdentityClient({ region: awsRegion.value ?? "us-east-1" }), + identityPoolId: awsCognitoIdentityId.value + }), + }); + + const transcribe = new TranscribeStreamingClient({ + region: awsRegion.value ?? "us-east-1", + credentials: fromCognitoIdentityPool({ + client: new CognitoIdentityClient({ region: awsRegion.value ?? "us-east-1" }), + identityPoolId: awsCognitoIdentityId.value + }), + }); + + + // Azure speech recognition const audioRecorder = async () => { // 暂时通过 mediaRecorder 方式实现录音保存,后续可能会改为直接通过 SpeechRecognizer 实现保存 @@ -250,16 +286,41 @@ export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'z catch (error) { allVoices.value = [] } + const res = await synthesizer.value.getVoicesAsync() + if (res.errorDetails) { + console.error(`获取语音列表失败:${res.errorDetails}, 请检查语音配置`) + return [] + } + return res.voices + }else{ + return [] } - const res = await synthesizer.value.getVoicesAsync() - if (res.errorDetails) { - console.error(`获取语音列表失败:${res.errorDetails}, 请检查语音配置`) - return [] + + } + + // Fetch the list of AWS Polly voices + async function getAWSVoices() { + const params = { + LanguageCode: "en-US" + }; + + try { + const data = await polly.describeVoices(params) + if(data.Voices){ + allAWSVoices.value=data.Voices.map((item)=>{ + return {"id":item.Id,"gender":item.Gender} + }) + } + return data.Voices??[]; + } catch (error) { + console.error("Error retrieving AWS voices:", error); + return []; } - return res.voices + } + function applySynthesizerConfiguration() { // 通过playback结束事件来判断播放结束 player.value = new SpeakerAudioDestination() @@ -279,6 +340,100 @@ export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'z synthesizer.value = new SpeechSynthesizer(speechConfig.value, speakConfig) } + /* AWS voice service */ + const startAWSRecognizeSpeech = async (cb?: (text: string) => void) => { + + micStream = new MicrophoneStream(); + // this part should be put into an async function + + 
+ micStream.setStream( + await window.navigator.mediaDevices.getUserMedia({ + video: false, + audio: true, + }) + ); + + + // Build the audio event stream that is sent to Transcribe + isRecognizing.value = true + const MAX_AUDIO_CHUNK_SIZE = 48000 + + const audioStream = async function* () { + for await (const chunk of micStream as unknown as Iterable) { + if (chunk.length <= MAX_AUDIO_CHUNK_SIZE) { + yield { + AudioEvent: { + AudioChunk: pcmEncodeChunk(chunk), + }, + } + } + } + }; + + // Encode the raw float samples as 16-bit little-endian PCM + const pcmEncodeChunk = (chunk: any) => { + const input = MicrophoneStream.toRaw(chunk); + let offset = 0; + const buffer = new ArrayBuffer(input.length * 2); + const view = new DataView(buffer); + for (let i = 0; i < input.length; i++, offset += 2) { + const s = Math.max(-1, Math.min(1, input[i])); + view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return Buffer.from(buffer); + }; + // Initialize the Transcribe streaming command + const command = new StartStreamTranscriptionCommand({ + LanguageCode: language.value, + MediaEncoding: "pcm", + MediaSampleRateHertz: 44100, + AudioStream: audioStream(), + }); + + const response = await transcribe.send(command); + let resultText = "" + if (response.TranscriptResultStream) { + for await (const event of response.TranscriptResultStream) { + if (event.TranscriptEvent) { + const results = event.TranscriptEvent?.Transcript?.Results; + results?.forEach((result: any) => { + (result.Alternatives || []).forEach((alternative: any) => { + const transcript = (alternative.Items || []).map((item: any) => item.Content).join(" "); + resultText = transcript; + cb && cb(transcript) + }); + }); + } + } + } + isRecognizing.value = false // reset even when no result stream was returned + return resultText + + } + + const stopAWSRecognizeSpeech = () => { + micStream?.stop() + } + + + // Speech synthesis via Polly; `voice` selects the Polly voice id + const awsTextToSpeak = async (text: string, voice?: string) => { + const params: SynthesizeSpeechInput = { + Text: text, + OutputFormat: 'mp3', + VoiceId: voice ?? 'Joanna', // falls back to Joanna when the caller passes no voice 
+ }; + + const response = await polly.synthesizeSpeech(params); + + if (response.AudioStream) { + const buffer = await response.AudioStream.transformToByteArray(); + audioAWS.src = URL.createObjectURL(new Blob([buffer], { type: 'audio/mpeg' })); + audioAWS.play(); + } + } + + return { languages, language, @@ -289,16 +444,23 @@ export const useSpeechService = ({ langs = ['fr-FR', 'ja-JP', 'en-US', 'z isRecognizReadying, startRecognizeSpeech, stopRecognizeSpeech, + startAWSRecognizeSpeech, + stopAWSRecognizeSpeech, recognizeSpeech, textToSpeak, + awsTextToSpeak, ssmlToSpeak, stopTextToSpeak, getVoices, + getAWSVoices, allVoices, + allAWSVoices, isSynthesizing, rate, style, audioBlob, player, + audioAWS, + } } diff --git a/src/pages/Home/components/Content.vue b/src/pages/Home/components/Content.vue index 8f05d5f..da6b48b 100644 --- a/src/pages/Home/components/Content.vue +++ b/src/pages/Home/components/Content.vue @@ -14,7 +14,7 @@ interface Translates { // hooks const store = useConversationStore() const { el, scrollToBottom } = useScroll() -const { selfAvatar, openKey, chatRememberCount, autoPlay } = useGlobalSetting() +const { selfAvatar, voiceApiName, openKey, chatRememberCount, autoPlay } = useGlobalSetting() const { language, @@ -26,12 +26,19 @@ const { startRecognizeSpeech, isRecognizReadying, stopRecognizeSpeech, + startAWSRecognizeSpeech, + stopAWSRecognizeSpeech, ssmlToSpeak, + awsTextToSpeak, isSynthesizing, audioBlob, player, + audioAWS, } = useSpeechService({ langs: store.allLanguage as any, isFetchAllVoice: false }) + + + // states const message = ref('') // input message const text = ref('') // current select message @@ -127,7 +134,13 @@ const fetchResponse = async (prompt: ChatMessage[] | string, type: FetchType = F } async function onSubmit(fromRecognize = false) { - if (!verifyOpenKey(openKey.value)) return alert('请输入正确的API-KEY') + // Support the OpenAI proxy key format OPENAPI_KEY:PROXY_KEY — only the part before ":" is verified + if (openKey.value.indexOf(":") > -1) { + if 
(!verifyOpenKey(openKey.value.split(":")[0])) return alert('请输入正确的API-KEY') + } else { + if (!verifyOpenKey(openKey.value)) return alert('请输入正确的API-KEY') + } + if (!message.value) return store.changeConversations([ @@ -210,24 +223,54 @@ function speakByAI(content: string, index: number) { } speakIndex.value = index text.value = content - ssmlToSpeak(content) + + if (voiceApiName.value === "AWS") { + awsTextToSpeak(content) + } else { + ssmlToSpeak(content) + } + } + + const recognize = async () => { try { console.log('isRecognizing', isRecognizing.value) - if (isRecognizing.value) { - await stopRecognizeSpeech() - onSubmit(true) - console.log('submit', message.value) - return + if (voiceApiName.value === "AWS") { + if (isRecognizing.value){ + // Delay closing the AWS Transcribe stream client so the tail of the audio is still uploaded + setTimeout(()=>{ + stopAWSRecognizeSpeech() + },2000) + isRecognizing.value=false + }else{ + audioAWS.pause() + message.value = '' + + + stopAllSpeaker() // stop all speech playback when recognition starts + startAWSRecognizeSpeech((textSlice:string) => { + message.value =textSlice || '' + }); + + } + + } else { + if (isRecognizing.value) { + await stopRecognizeSpeech() + onSubmit(true) + console.log('submit', message.value) + return + } + message.value = '' + + stopAllSpeaker() // stop all speech playback when recognition starts + startRecognizeSpeech((textSlice) => { + message.value += textSlice || '' + }) } - message.value = '' - stopAllSpeaker() // 开启语音识别时停止所有语音播放 - startRecognizeSpeech((textSlice) => { - message.value += textSlice || '' - }) } catch (error) { alert(error) @@ -277,12 +320,8 @@ async function grammarAnalysis(text: string, i: number) { 
- + @@ -338,7 +378,8 @@ async function grammarAnalysis(text: string, i: number) { - + @@ -356,11 +397,8 @@ async function grammarAnalysis(text: string, i: number) {
- @@ -373,30 +411,17 @@ async function grammarAnalysis(text: string, i: number) { 录音设备准备中
- +
AI Is Thinking
- -
diff --git a/src/pages/Setting/components/TTSSetting.vue b/src/pages/Setting/components/TTSSetting.vue index fe776ac..38a3bff 100644 --- a/src/pages/Setting/components/TTSSetting.vue +++ b/src/pages/Setting/components/TTSSetting.vue @@ -1,6 +1,6 @@