-
-
Notifications
You must be signed in to change notification settings - Fork 530
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add AudioInput #4048
Comments
+1. Working with speech to text - also in pyodide via transformers.js.py becomes more and more realistic. For example also implicitly requested in #7021. |
If we want to make the AudioInput or Audio pane more engaging we can use https://github.com/katspaugh/wavesurfer.js. This is what the Gradio Audio component is built on top of. For recording they use the |
Here is a very rough and basic implementation. script.js const startRecording = `Start Recording`
// Label shown on the toggle button while a recording is in progress.
const stopRecording = 'Stop Recording';
/**
 * Records microphone audio with the browser MediaRecorder API and ships the
 * finished recording to Python by writing a base64 data URL onto the bound
 * model's `_data_url` parameter.
 */
class AudioStreamWidget {
  /**
   * @param {object} model - Panel JSComponent model; only `_data_url` is written.
   */
  constructor(model) {
    this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
    this.stream = null;
    this.source = null;
    this.mediaRecorder = null;
    this.chunks = [];
    this.model = model;
  }

  /** Request microphone access and begin recording. No-op if already recording. */
  async start() {
    if (this.stream) {
      return; // fix: a second click-before-stop used to open another mic stream
    }
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.source = this.audioContext.createMediaStreamSource(this.stream);
      this.mediaRecorder = new MediaRecorder(this.stream);
      this.mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          this.chunks.push(event.data);
        }
      };
      this.mediaRecorder.onstop = this.onStopRecording.bind(this);
      this.mediaRecorder.start();
      console.log('Audio stream started and recording');
    } catch (err) {
      console.error('Error accessing audio stream', err);
    }
  }

  /** Stop the recorder (triggering `onStopRecording`) and release the microphone. */
  stop() {
    if (this.mediaRecorder && this.mediaRecorder.state !== 'inactive') {
      this.mediaRecorder.stop();
    }
    if (this.source) {
      this.source.disconnect(); // fix: node was never detached from the context
      this.source = null;
    }
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
      console.log('Audio stream stopped');
    }
  }

  /** MediaRecorder `onstop` handler: bundle chunks, reset the buffer, send. */
  onStopRecording() {
    const blob = new Blob(this.chunks, { type: 'audio/webm' });
    this.chunks = [];
    this.blobToBase64(blob)
      .then((base64) => {
        this.sendToBackend(base64);
        console.log('Recording sent to server');
      })
      .catch((err) => {
        // fix: the rejection from blobToBase64 was previously unhandled
        console.error('Failed to encode recording', err);
      });
  }

  /**
   * Encode a Blob as a data URL.
   * @param {Blob} blob
   * @returns {Promise<string>} e.g. `data:audio/webm;base64,...`
   */
  blobToBase64(blob) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.readAsDataURL(blob);
      reader.onloadend = () => {
        resolve(reader.result);
      };
      reader.onerror = (error) => reject(error);
    });
  }

  /** Sync the encoded recording to the Python side via the model parameter. */
  sendToBackend(base64) {
    this.model._data_url = base64;
  }
}
export function render({ model }) {
let audio = new AudioStreamWidget(model);
let state = "start"
let btn = document.createElement("button");
btn.innerHTML = startRecording;
btn.addEventListener("click", () => {
console.log(btn.innerHTML)
if (state == "start") {
audio.start();
btn.innerHTML = stopRecording;
state = "stop"
} else {
audio.stop();
btn.innerHTML = startRecording;
state = "start"
}
});
return btn
} script.py import panel as pn
import param
from base64 import b64decode
import numpy as np
from panel.custom import JSComponent
import tempfile
pn.extension()
class AudioInput(JSComponent):
    """Records audio in the browser and exposes the recording as raw bytes.

    The JS side (``script.js``) writes a base64 data URL to ``_data_url``;
    ``_update_value`` decodes it into ``value``.
    """

    # Raw recorded audio bytes (webm container), decoded from `_data_url`.
    value = param.Parameter()

    format = param.Selector(default='webm', objects=['webm'], doc="The name of the audio format to provide the value in.")

    # Data URL ("data:audio/webm;base64,...") synced from the browser.
    _data_url = param.Parameter()

    _esm = 'script.js'

    @param.depends("_data_url", watch=True)
    def _update_value(self):
        """Decode the incoming data URL into raw bytes on ``value``."""
        data_url = self._data_url
        if not data_url:
            # fix: the parameter may be None (initial/reset); previously this
            # raised AttributeError on `.split`.
            return
        self.value = b64decode(data_url.split(",")[1])
AudioInput = AudioInput()
def download_webm_file(value):
if not value:
return "No audio available"
return f'<a id="download-link" href="{value}" download="sound.webm">Download File</a>'
def audio_value(value):
if not value:
return None
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_file:
temp_file.write(value)
temp_file_name = temp_file.name
return temp_file_name
audio = pn.pane.Audio(pn.bind(audio_value, AudioInput.param.value), loop=True, width=300, height=50)
pn.Column(
AudioInput,
pn.bind(download_webm_file, AudioInput.param._data_url),
audio,
).servable() Notes
|
Gradio has a webm to wav function here https://github.com/gradio-app/gradio/blob/main/js/audio/shared/audioBufferToWav.ts. |
Just to clarify, this is separate from SpeechToText, i.e. AudioInput is agnostic to speech, and it records any sound? |
Yes. Should be able to record Audio in some Audio format like webm, wav, mp3, numpy, torch tensor or similar. Use Python to analyse or transform. Use Panel to display the transformed result whether its Audio or something Else. |
Whether it should record the full file, stream intermediate chunks, or be able to do both is not clear to me. |
We need similar functionality for Video. The current VideoStream takes pictures which is something Else. |
I believe we will see more and more Audio and Speech use cases. For example OpenAI recently released https://openai.com/index/introducing-the-realtime-api/. |
Request
Add
AudioInput
widget for working with streaming audioMotivation
Looking at the awesome results of the VideoStream example PR it is clear that Panel can do something truly amazing for streaming data sources.
Looking at the reference gallery I notice that we do not offer users a way to work with an audio stream.
Adding this would provide something unique to our users.
You can see how Gradio supports it here https://gradio.app/real_time_speech_recognition/ for inspiration.
The text was updated successfully, but these errors were encountered: