From b64f41440c72b993c6c1c721d448630ab9030ea8 Mon Sep 17 00:00:00 2001
From: Anthony Bryan Gavilan Vinces <abgv9221@gmail.com>
Date: Mon, 16 Dec 2024 17:24:09 -0500
Subject: [PATCH 1/3] feat: Add Azure Cognitive Services integration for
 speech-to-text functionality

- Introduced a new credential class for Azure Cognitive Services.
- Updated speech-to-text processing to support Azure Cognitive Services as a provider.
- Enhanced UI components to include Azure Cognitive Services options and inputs for configuration.
- Added necessary imports and error handling for Azure API requests.
---
 .../AzureCognitiveServices.credential.ts      | 39 +++++++++++++
 packages/components/src/speechToText.ts       | 42 +++++++++++++-
 .../ui-component/extended/SpeechToText.jsx    | 57 ++++++++++++++++++-
 3 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 packages/components/credentials/AzureCognitiveServices.credential.ts

diff --git a/packages/components/credentials/AzureCognitiveServices.credential.ts b/packages/components/credentials/AzureCognitiveServices.credential.ts
new file mode 100644
index 00000000000..66323e088a9
--- /dev/null
+++ b/packages/components/credentials/AzureCognitiveServices.credential.ts
@@ -0,0 +1,39 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+/** Credential definition for Azure Cognitive Services: a subscription key, a service region and an API version. */
+class AzureCognitiveServices implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    inputs: INodeParams[]
+    // Sets the display label, the internal credential name, the version and the three input fields.
+    constructor() {
+        this.label = 'Azure Cognitive Services'
+        this.name = 'azureCognitiveServices' // must match the `credentialNames` entry of the Azure speech-to-text provider
+        this.version = 1.0
+        this.inputs = [
+            {
+                label: 'Azure Subscription Key',
+                name: 'azureSubscriptionKey', // sent as the `Ocp-Apim-Subscription-Key` request header
+                type: 'password',
+                description: 'Your Azure Cognitive Services subscription key'
+            },
+            {
+                label: 'Service Region',
+                name: 'serviceRegion', // interpolated into the `<region>.cognitiveservices.azure.com` endpoint host
+                type: 'string',
+                description: 'The Azure service region (e.g., "westus", "eastus")',
+                placeholder: 'westus'
+            },
+            {
+                label: 'API Version',
+                name: 'apiVersion', // sent as the `api-version` query parameter of the transcription request
+                type: 'string',
+                description: 'The API version to use (e.g., "2024-05-15-preview")',
+                placeholder: '2024-05-15-preview',
+                default: '2024-05-15-preview' // same value the transcription request falls back to when this field is empty
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: AzureCognitiveServices }
diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index 821f0221630..9ba35930674 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -3,11 +3,14 @@ import { getCredentialData } from './utils'
 import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
+import axios from 'axios'
+import FormData from 'form-data'
 
 const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    AZURE_COGNITIVE: 'azureCognitive'
 }
 
 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
@@ -70,6 +73,43 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                 }
                 break
             }
+            case SpeechToTextType.AZURE_COGNITIVE: {
+                try {
+                    const baseUrl = `https://${credentialData.serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe`
+                    const apiVersion = credentialData.apiVersion || '2024-05-15-preview'
+
+                    const formData = new FormData()
+                    formData.append('audio', audio_file, {
+                        filename: upload.name,
+                        contentType: upload.type
+                    })
+
+                    const channelsStr = speechToTextConfig.channels || '0,1'
+                    const channels = channelsStr.split(',').map(Number)
+
+                    const definition = {
+                        locales: [speechToTextConfig.language || 'en-US'],
+                        profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked',
+                        channels
+                    }
+                    formData.append('definition', JSON.stringify(definition))
+
+                    const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, {
+                        headers: {
+                            'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
+                            Accept: 'application/json',
+                            ...formData.getHeaders()
+                        }
+                    })
+
+                    if (response.data && response.data.combinedPhrases.length > 0) {
+                        return response.data.combinedPhrases[0]?.text || ''
+                    }
+                    return ''
+                } catch (error) {
+                    throw error.response?.data || error
+                }
+            }
         }
     } else {
         throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
index 59f9964b918..e72faf717d0 100644
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
 import localAiPng from '@/assets/images/localai.png'
+import azureSvg from '@/assets/images/azure_openai.svg'
 
 // store
 import useNotifier from '@/utils/useNotifier'
@@ -29,7 +30,8 @@ import chatflowsApi from '@/api/chatflows'
 const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    AZURE_COGNITIVE: 'azureCognitive'
 }
 
 // Weird quirk - the key must match the name property value.
@@ -139,6 +141,59 @@ const speechToTextProviders = {
                 optional: true
             }
         ]
+    },
+    [SpeechToTextType.AZURE_COGNITIVE]: {
+        label: 'Azure Cognitive Services',
+        name: SpeechToTextType.AZURE_COGNITIVE,
+        icon: azureSvg,
+        url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
+        inputs: [
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['azureCognitiveServices']
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description: 'The recognition language (e.g., "en-US", "es-ES")',
+                placeholder: 'en-US',
+                optional: true
+            },
+            {
+                label: 'Profanity Filter Mode',
+                name: 'profanityFilterMode',
+                type: 'options',
+                description: 'How to handle profanity in the transcription',
+                options: [
+                    {
+                        label: 'None',
+                        name: 'None'
+                    },
+                    {
+                        label: 'Masked',
+                        name: 'Masked'
+                    },
+                    {
+                        label: 'Removed',
+                        name: 'Removed'
+                    }
+                ],
+                default: 'Masked',
+                optional: true
+            },
+            {
+                label: 'Audio Channels',
+                name: 'channels',
+                type: 'string',
+                description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
+                placeholder: '0,1',
+                default: '0,1',
+                optional: true
+            }
+        ]
     }
 }
 

From e557e88470c51c55066e93f8923b0bb2a508b153 Mon Sep 17 00:00:00 2001
From: Henry Heng <henryheng@flowiseai.com>
Date: Wed, 18 Dec 2024 00:16:39 +0000
Subject: [PATCH 2/3] Update SpeechToText.jsx linting

---
 packages/ui/src/ui-component/extended/SpeechToText.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
index 08ab5cbbc03..145f7baa8e5 100644
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -192,7 +192,7 @@ const speechToTextProviders = {
                 type: 'string',
                 description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
                 placeholder: '0,1',
-                default: '0,1',
+                default: '0,1'
             }
         ]
     },

From cb0b81e31624c7bd9c6846ac083a14eb9ee982e4 Mon Sep 17 00:00:00 2001
From: Anthony Bryan Gavilan Vinces <abgv9221@gmail.com>
Date: Tue, 17 Dec 2024 20:14:02 -0500
Subject: [PATCH 3/3] refactor: Update audio file handling in SpeechToText
 component

- Removed the 'form-data' import; the request now uses the global FormData and wraps the audio data in a Blob for upload.
- Simplified the audio file appending process to the form data.
- Cleaned up the headers in the Axios request by removing unnecessary form data headers.

This change simplifies the Azure upload path in the speech-to-text functionality and removes one third-party import from the module.
---
 packages/components/src/speechToText.ts | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index eb18af225ee..fbb659d54e3 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -4,7 +4,6 @@ import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
 import axios from 'axios'
-import FormData from 'form-data'
 import Groq from 'groq-sdk'
 
 const SpeechToTextType = {
@@ -81,10 +80,8 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                     const apiVersion = credentialData.apiVersion || '2024-05-15-preview'
 
                     const formData = new FormData()
-                    formData.append('audio', audio_file, {
-                        filename: upload.name,
-                        contentType: upload.type
-                    })
+                    const audioBlob = new Blob([audio_file], { type: upload.type })
+                    formData.append('audio', audioBlob, upload.name)
 
                     const channelsStr = speechToTextConfig.channels || '0,1'
                     const channels = channelsStr.split(',').map(Number)
@@ -99,8 +96,7 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                     const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, {
                         headers: {
                             'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
-                            Accept: 'application/json',
-                            ...formData.getHeaders()
+                            Accept: 'application/json'
                         }
                     })