From b64f41440c72b993c6c1c721d448630ab9030ea8 Mon Sep 17 00:00:00 2001 From: Anthony Bryan Gavilan Vinces Date: Mon, 16 Dec 2024 17:24:09 -0500 Subject: [PATCH 1/3] feat: Add Azure Cognitive Services integration for speech-to-text functionality - Introduced a new credential class for Azure Cognitive Services. - Updated speech-to-text processing to support Azure Cognitive Services as a provider. - Enhanced UI components to include Azure Cognitive Services options and inputs for configuration. - Added necessary imports and error handling for Azure API requests. --- .../AzureCognitiveServices.credential.ts | 39 +++++++++++++ packages/components/src/speechToText.ts | 42 +++++++++++++- .../ui-component/extended/SpeechToText.jsx | 57 ++++++++++++++++++- 3 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 packages/components/credentials/AzureCognitiveServices.credential.ts diff --git a/packages/components/credentials/AzureCognitiveServices.credential.ts b/packages/components/credentials/AzureCognitiveServices.credential.ts new file mode 100644 index 00000000000..66323e088a9 --- /dev/null +++ b/packages/components/credentials/AzureCognitiveServices.credential.ts @@ -0,0 +1,39 @@ +import { INodeParams, INodeCredential } from '../src/Interface' + +class AzureCognitiveServices implements INodeCredential { + label: string + name: string + version: number + inputs: INodeParams[] + + constructor() { + this.label = 'Azure Cognitive Services' + this.name = 'azureCognitiveServices' + this.version = 1.0 + this.inputs = [ + { + label: 'Azure Subscription Key', + name: 'azureSubscriptionKey', + type: 'password', + description: 'Your Azure Cognitive Services subscription key' + }, + { + label: 'Service Region', + name: 'serviceRegion', + type: 'string', + description: 'The Azure service region (e.g., "westus", "eastus")', + placeholder: 'westus' + }, + { + label: 'API Version', + name: 'apiVersion', + type: 'string', + description: 'The API version to use (e.g., "2024-05-15-preview")', + placeholder: '2024-05-15-preview', + default: '2024-05-15-preview' + } + ] + } +} + +module.exports = { credClass: AzureCognitiveServices } diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts index 821f0221630..9ba35930674 100644 --- a/packages/components/src/speechToText.ts +++ b/packages/components/src/speechToText.ts @@ -3,11 +3,14 @@ import { getCredentialData } from './utils' import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai' import { AssemblyAI } from 'assemblyai' import { getFileFromStorage } from './storageUtils' +import axios from 'axios' +import FormData from 'form-data' const SpeechToTextType = { OPENAI_WHISPER: 'openAIWhisper', ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe', - LOCALAI_STT: 'localAISTT' + LOCALAI_STT: 'localAISTT', + AZURE_COGNITIVE: 'azureCognitive' } export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => { @@ -70,6 +73,43 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi } break } + case SpeechToTextType.AZURE_COGNITIVE: { + try { + const baseUrl = `https://${credentialData.serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe` + const apiVersion = credentialData.apiVersion || '2024-05-15-preview' + + const formData = new FormData() + formData.append('audio', audio_file, { + filename: upload.name, + contentType: upload.type + }) + + const channelsStr = speechToTextConfig.channels || '0,1' + const channels = channelsStr.split(',').map(Number) + + const definition = { + locales: [speechToTextConfig.language || 'en-US'], + profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked', + channels + } + formData.append('definition', JSON.stringify(definition)) + + const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, { + headers: { + 'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey, + Accept: 'application/json', + ...formData.getHeaders() + } + }) + + if (response.data && response.data.combinedPhrases.length > 0) { + return response.data.combinedPhrases[0]?.text || '' + } + return '' + } catch (error) { + throw error.response?.data || error + } + } } } else { throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx index 59f9964b918..e72faf717d0 100644 --- a/packages/ui/src/ui-component/extended/SpeechToText.jsx +++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx @@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown' import openAISVG from '@/assets/images/openai.svg' import assemblyAIPng from '@/assets/images/assemblyai.png' import localAiPng from '@/assets/images/localai.png' +import azureSvg from '@/assets/images/azure_openai.svg' // store import useNotifier from '@/utils/useNotifier' @@ -29,7 +30,8 @@ import chatflowsApi from '@/api/chatflows' const SpeechToTextType = { OPENAI_WHISPER: 'openAIWhisper', ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe', - LOCALAI_STT: 'localAISTT' + LOCALAI_STT: 'localAISTT', + AZURE_COGNITIVE: 'azureCognitive' } // Weird quirk - the key must match the name property value. @@ -139,6 +141,59 @@ const speechToTextProviders = { optional: true } ] + }, + [SpeechToTextType.AZURE_COGNITIVE]: { + label: 'Azure Cognitive Services', + name: SpeechToTextType.AZURE_COGNITIVE, + icon: azureSvg, + url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services', + inputs: [ + { + label: 'Connect Credential', + name: 'credential', + type: 'credential', + credentialNames: ['azureCognitiveServices'] + }, + { + label: 'Language', + name: 'language', + type: 'string', + description: 'The recognition language (e.g., "en-US", "es-ES")', + placeholder: 'en-US', + optional: true + }, + { + label: 'Profanity Filter Mode', + name: 'profanityFilterMode', + type: 'options', + description: 'How to handle profanity in the transcription', + options: [ + { + label: 'None', + name: 'None' + }, + { + label: 'Masked', + name: 'Masked' + }, + { + label: 'Removed', + name: 'Removed' + } + ], + default: 'Masked', + optional: true + }, + { + label: 'Audio Channels', + name: 'channels', + type: 'string', + description: 'Comma-separated list of audio channels to process (e.g., "0,1")', + placeholder: '0,1', + default: '0,1', + optional: true + } + ] } } From e557e88470c51c55066e93f8923b0bb2a508b153 Mon Sep 17 00:00:00 2001 From: Henry Heng Date: Wed, 18 Dec 2024 00:16:39 +0000 Subject: [PATCH 2/3] Update SpeechToText.jsx linting --- packages/ui/src/ui-component/extended/SpeechToText.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx index 08ab5cbbc03..145f7baa8e5 100644 --- a/packages/ui/src/ui-component/extended/SpeechToText.jsx +++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx @@ -192,7 +192,7 @@ const speechToTextProviders = { type: 'string', description: 'Comma-separated list of audio channels to process (e.g., "0,1")', placeholder: '0,1', - default: '0,1', + default: '0,1' } ] }, From cb0b81e31624c7bd9c6846ac083a14eb9ee982e4 Mon Sep 17 00:00:00 2001 From: Anthony Bryan Gavilan Vinces Date: Tue, 17 Dec 2024 20:14:02 -0500 Subject: [PATCH 3/3] refactor: Update audio file handling in SpeechToText component - Removed the dependency on 'form-data' and replaced it with a Blob for audio file uploads. - Simplified the audio file appending process to the form data. - Cleaned up the headers in the Axios request by removing unnecessary form data headers. This change enhances the efficiency of audio file processing in the speech-to-text functionality. --- packages/components/src/speechToText.ts | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts index eb18af225ee..fbb659d54e3 100644 --- a/packages/components/src/speechToText.ts +++ b/packages/components/src/speechToText.ts @@ -4,7 +4,6 @@ import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai' import { AssemblyAI } from 'assemblyai' import { getFileFromStorage } from './storageUtils' import axios from 'axios' -import FormData from 'form-data' import Groq from 'groq-sdk' const SpeechToTextType = { @@ -81,10 +80,8 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi const apiVersion = credentialData.apiVersion || '2024-05-15-preview' const formData = new FormData() - formData.append('audio', audio_file, { - filename: upload.name, - contentType: upload.type - }) + const audioBlob = new Blob([audio_file], { type: upload.type }) + formData.append('audio', audioBlob, upload.name) const channelsStr = speechToTextConfig.channels || '0,1' const channels = channelsStr.split(',').map(Number) @@ -99,8 +96,7 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, { headers: { 'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey, - Accept: 'application/json', - ...formData.getHeaders() + Accept: 'application/json' } })