Feature/Add Groq Whisper support (#3706)

ghondar · HenryHengZJ · web-flow · commit 4c29b2390c1d · 2024-12-17T23:11:07.000Z
* feat: Add Groq Whisper support to SpeechToText component

- Introduced a new speech-to-text provider, Groq Whisper, in both the backend and UI components.
- Updated SpeechToTextType to include GROQ_WHISPER.
- Implemented Groq client integration for audio transcription with customizable model, language, and temperature options.
- Added UI elements for Groq Whisper configuration, including input fields for model, language, and temperature settings.

* Set the speech-to-text 'none' status to false when another provider is selected

---------

Co-authored-by: Henry <hzj94@hotmail.com>
diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
@@ -3,11 +3,13 @@ import { getCredentialData } from './utils'
 import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
+import Groq from 'groq-sdk'
 
 const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    GROQ_WHISPER: 'groqWhisper'
 }
 
 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
@@ -70,6 +72,23 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                 }
                 break
             }
+            case SpeechToTextType.GROQ_WHISPER: {
+                const groqClient = new Groq({
+                    apiKey: credentialData.groqApiKey
+                })
+                const file = await toFile(audio_file, upload.name)
+                const groqTranscription = await groqClient.audio.transcriptions.create({
+                    file,
+                    model: speechToTextConfig?.model || 'whisper-large-v3',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    response_format: 'verbose_json'
+                })
+                if (groqTranscription?.text) {
+                    return groqTranscription.text
+                }
+                break
+            }
         }
     } else {
         throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
diff --git a/packages/ui/src/assets/images/groq.png b/packages/ui/src/assets/images/groq.png
diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
 import localAiPng from '@/assets/images/localai.png'
+import groqPng from '@/assets/images/groq.png'
 
 // store
 import useNotifier from '@/utils/useNotifier'
@@ -29,7 +30,8 @@ import chatflowsApi from '@/api/chatflows'
 const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    GROQ_WHISPER: 'groqWhisper'
 }
 
 // Weird quirk - the key must match the name property value.
@@ -139,6 +141,46 @@ const speechToTextProviders = {
                 optional: true
             }
         ]
+    },
+    [SpeechToTextType.GROQ_WHISPER]: {
+        label: 'Groq Whisper',
+        name: SpeechToTextType.GROQ_WHISPER,
+        icon: groqPng,
+        url: 'https://console.groq.com/',
+        inputs: [
+            {
+                label: 'Model',
+                name: 'model',
+                type: 'string',
+                description: `The STT model to load. Defaults to whisper-large-v3 if left blank.`,
+                placeholder: 'whisper-large-v3',
+                optional: true
+            },
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['groqApi']
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description:
+                    'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
+                placeholder: 'en',
+                optional: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                description:
+                    'The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.',
+                optional: true
+            }
+        ]
     }
 }
 
@@ -210,6 +252,9 @@ const SpeechToText = ({ dialogProps }) => {
                     newVal[provider.name] = { ...speechToText[provider.name], status: false }
                 }
             })
+            if (providerName !== 'none') {
+                newVal['none'].status = false
+            }
         }
         setSpeechToText(newVal)
         return newVal