Skip to content

Commit 2360f5f

Browse files
ghondar and HenryHengZJ authored
Feature/Add Azure Cognitive speech-to-text functionality (#3718)
* feat: Add Azure Cognitive Services integration for speech-to-text functionality
  - Introduced a new credential class for Azure Cognitive Services.
  - Updated speech-to-text processing to support Azure Cognitive Services as a provider.
  - Enhanced UI components to include Azure Cognitive Services options and inputs for configuration.
  - Added necessary imports and error handling for Azure API requests.
* Update SpeechToText.jsx linting
* refactor: Update audio file handling in SpeechToText component
  - Removed the dependency on 'form-data' and replaced it with a Blob for audio file uploads.
  - Simplified the audio file appending process to the form data.
  - Cleaned up the headers in the Axios request by removing unnecessary form data headers.
  This change enhances the efficiency of audio file processing in the speech-to-text functionality.
---------
Co-authored-by: Henry Heng <[email protected]>
Co-authored-by: Henry <[email protected]>
1 parent fff6319 commit 2360f5f

File tree

3 files changed

+129
-0
lines changed

3 files changed

+129
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { INodeParams, INodeCredential } from '../src/Interface'
2+
3+
/**
 * Credential definition for Azure Cognitive Services.
 *
 * Declares the three fields collected for this credential: a subscription
 * key, a service region, and an API version.
 */
class AzureCognitiveServices implements INodeCredential {
    /** Human-readable credential name. */
    label: string
    /** Identifier of this credential type. */
    name: string
    /** Version number of this credential definition. */
    version: number
    /** Input fields that make up the credential. */
    inputs: INodeParams[]

    constructor() {
        this.label = 'Azure Cognitive Services'
        this.name = 'azureCognitiveServices'
        this.version = 1.0
        this.inputs = [
            {
                // Declared as a password-type input.
                label: 'Azure Subscription Key',
                name: 'azureSubscriptionKey',
                type: 'password',
                description: 'Your Azure Cognitive Services subscription key'
            },
            {
                label: 'Service Region',
                name: 'serviceRegion',
                type: 'string',
                description: 'The Azure service region (e.g., "westus", "eastus")',
                placeholder: 'westus'
            },
            {
                // Ships with a default so the field is pre-populated.
                label: 'API Version',
                name: 'apiVersion',
                type: 'string',
                description: 'The API version to use (e.g., "2024-05-15-preview")',
                placeholder: '2024-05-15-preview',
                default: '2024-05-15-preview'
            }
        ]
    }
}
38+
39+
// Export the credential class under the `credClass` key.
// NOTE(review): presumably this is the key the credential loader looks up — confirm against the loader.
module.exports = { credClass: AzureCognitiveServices }

packages/components/src/speechToText.ts

+36
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@ import { getCredentialData } from './utils'
33
import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
44
import { AssemblyAI } from 'assemblyai'
55
import { getFileFromStorage } from './storageUtils'
6+
import axios from 'axios'
67
import Groq from 'groq-sdk'
78

89
/**
 * Identifiers of the speech-to-text providers handled in this module.
 * The values are the strings used to select a provider branch.
 */
const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT',
    AZURE_COGNITIVE: 'azureCognitive',
    GROQ_WHISPER: 'groqWhisper'
}
1416

@@ -72,6 +74,40 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
7274
}
7375
break
7476
}
77+
case SpeechToTextType.AZURE_COGNITIVE: {
    // Transcribe the uploaded audio through the Azure Cognitive Services
    // `transcriptions:transcribe` endpoint and return the text of the first
    // combined phrase, or '' when the response contains none.
    try {
        // Fail early with a readable message instead of posting to
        // "https://undefined.cognitiveservices.azure.com/...".
        const serviceRegion = String(credentialData.serviceRegion ?? '').trim()
        if (!serviceRegion) {
            throw new Error('Azure Cognitive Services: service region is missing from the credential')
        }
        const baseUrl = `https://${serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe`
        const apiVersion = credentialData.apiVersion || '2024-05-15-preview'

        // The audio is sent as one multipart part named "audio".
        const formData = new FormData()
        const audioBlob = new Blob([audio_file], { type: upload.type })
        formData.append('audio', audioBlob, upload.name)

        // Parse the comma-separated channel list. Blank segments and anything
        // that is not a non-negative integer are dropped, so input such as
        // "0," or "a,1" cannot produce a spurious 0 or a NaN (which would be
        // serialized as null in the JSON definition).
        const parsedChannels = String(speechToTextConfig.channels || '0,1')
            .split(',')
            .map((segment) => segment.trim())
            .filter((segment) => segment !== '')
            .map(Number)
            .filter((channel) => Number.isInteger(channel) && channel >= 0)
        // Fall back to the documented default when nothing usable was supplied.
        const channels = parsedChannels.length > 0 ? parsedChannels : [0, 1]

        // The transcription settings travel as a JSON string in the "definition" part.
        const definition = {
            locales: [speechToTextConfig.language || 'en-US'],
            profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked',
            channels
        }
        formData.append('definition', JSON.stringify(definition))

        // Encode the version so an unexpected character cannot corrupt the query string.
        const response = await axios.post(`${baseUrl}?api-version=${encodeURIComponent(apiVersion)}`, formData, {
            headers: {
                'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
                Accept: 'application/json'
            }
        })

        // Guard against a response body that has no `combinedPhrases` array;
        // reading `.length` on it directly would throw a TypeError.
        const combinedPhrases = response.data?.combinedPhrases
        if (Array.isArray(combinedPhrases) && combinedPhrases.length > 0) {
            return combinedPhrases[0]?.text || ''
        }
        return ''
    } catch (error) {
        // Rethrow the response body when the failure carries one, otherwise
        // the original error.
        throw error.response?.data || error
    }
}
75111
case SpeechToTextType.GROQ_WHISPER: {
76112
const groqClient = new Groq({
77113
apiKey: credentialData.groqApiKey

packages/ui/src/ui-component/extended/SpeechToText.jsx

+54
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
1717
import openAISVG from '@/assets/images/openai.svg'
1818
import assemblyAIPng from '@/assets/images/assemblyai.png'
1919
import localAiPng from '@/assets/images/localai.png'
20+
import azureSvg from '@/assets/images/azure_openai.svg'
2021
import groqPng from '@/assets/images/groq.png'
2122

2223
// store
@@ -31,6 +32,7 @@ const SpeechToTextType = {
3132
OPENAI_WHISPER: 'openAIWhisper',
3233
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
3334
LOCALAI_STT: 'localAISTT',
35+
AZURE_COGNITIVE: 'azureCognitive',
3436
GROQ_WHISPER: 'groqWhisper'
3537
}
3638

@@ -142,6 +144,58 @@ const speechToTextProviders = {
142144
}
143145
]
144146
},
147+
[SpeechToTextType.AZURE_COGNITIVE]: {
148+
label: 'Azure Cognitive Services',
149+
name: SpeechToTextType.AZURE_COGNITIVE,
150+
icon: azureSvg,
151+
url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
152+
inputs: [
153+
{
154+
label: 'Connect Credential',
155+
name: 'credential',
156+
type: 'credential',
157+
credentialNames: ['azureCognitiveServices']
158+
},
159+
{
160+
label: 'Language',
161+
name: 'language',
162+
type: 'string',
163+
description: 'The recognition language (e.g., "en-US", "es-ES")',
164+
placeholder: 'en-US',
165+
optional: true
166+
},
167+
{
168+
label: 'Profanity Filter Mode',
169+
name: 'profanityFilterMode',
170+
type: 'options',
171+
description: 'How to handle profanity in the transcription',
172+
options: [
173+
{
174+
label: 'None',
175+
name: 'None'
176+
},
177+
{
178+
label: 'Masked',
179+
name: 'Masked'
180+
},
181+
{
182+
label: 'Removed',
183+
name: 'Removed'
184+
}
185+
],
186+
default: 'Masked',
187+
optional: true
188+
},
189+
{
190+
label: 'Audio Channels',
191+
name: 'channels',
192+
type: 'string',
193+
description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
194+
placeholder: '0,1',
195+
default: '0,1'
196+
}
197+
]
198+
},
145199
[SpeechToTextType.GROQ_WHISPER]: {
146200
label: 'Groq Whisper',
147201
name: SpeechToTextType.GROQ_WHISPER,

0 commit comments

Comments
 (0)