-
Notifications
You must be signed in to change notification settings - Fork 424
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add VAD examples using ALSA for recording (#739)
- Loading branch information
1 parent
a5f8fbc
commit 6fb8ced
Showing
17 changed files
with
601 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This script works only on Linux. It uses ALSA for recording. | ||
""" | ||
|
||
import argparse | ||
from pathlib import Path | ||
|
||
import sherpa_onnx | ||
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of the ALSA VAD example.

    Args:
      argv: Optional list of argument strings to parse. When ``None`` (the
        default) argparse reads the process command line, which is what
        the script does when run normally.

    Returns:
      An ``argparse.Namespace`` with two string attributes:
      ``silero_vad_model`` and ``device_name``. Both options are required,
      so argparse exits with a usage error if either is missing.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help="""
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and the device 0 on that card, please use:
plughw:3,0
as the device_name.
""",
    )

    return parser.parse_args(argv)
|
||
|
||
def main():
    """Record from an ALSA device, run the Silero VAD and save each segment.

    Audio is read in 100 ms chunks and fed to a
    ``sherpa_onnx.VoiceActivityDetector``. Every segment the detector
    queues is written to ``seg-<index>-<duration>-seconds.wav`` in the
    current directory. The loop runs until the user presses Ctrl+C.

    Raises:
      RuntimeError: If the path given via ``--silero-vad-model`` is not an
        existing file.
    """
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    # NOTE(review): the ALSA reader is assumed to deliver 16 kHz samples,
    # matching the rate configured for the VAD below -- confirm.
    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    print("Started! Please speak. Press Ctrl C to exit")

    # True while the "Detected speech" banner has already been shown for
    # the current stretch of speech, so it is printed once per utterance.
    printed = False
    k = 0  # running index used in the output file names
    try:
        while True:
            samples = alsa.read(samples_per_read)  # a blocking read

            vad.accept_waveform(samples)

            # Query the detector once per chunk; the banner is printed only
            # on the silence -> speech transition.
            speech_detected = vad.is_speech_detected()
            if speech_detected and not printed:
                print("Detected speech")
            printed = speech_detected

            # Drain every segment the detector has queued so far.
            while not vad.empty():
                segment = vad.front.samples
                duration = len(segment) / sample_rate
                filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                k += 1
                sherpa_onnx.write_wave(filename, segment, sample_rate)
                print(f"Duration: {duration:.3f} seconds")
                print(f"Saved to {filename}")
                print("----------")

                vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
import os | ||
import sys | ||
from pathlib import Path | ||
|
||
try: | ||
import sounddevice as sd | ||
except ImportError: | ||
print("Please install sounddevice first. You can use") | ||
print() | ||
print(" pip install sounddevice") | ||
print() | ||
print("to install it") | ||
sys.exit(-1) | ||
|
||
import sherpa_onnx | ||
|
||
|
||
def get_args(argv=None):
    """Parse the command-line options of the microphone VAD example.

    Args:
      argv: Optional list of argument strings to parse. When ``None`` (the
        default) argparse reads the process command line, which is what
        the script does when run normally.

    Returns:
      An ``argparse.Namespace`` whose ``silero_vad_model`` attribute holds
      the (required) path passed via ``--silero-vad-model``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    return parser.parse_args(argv)
|
||
|
||
def main():
    """Record from a microphone, run the Silero VAD and save each segment.

    Audio is captured with ``sounddevice`` in 100 ms chunks, resampled to
    16 kHz when the microphone runs at a different rate, and fed to a
    ``sherpa_onnx.VoiceActivityDetector``. Every segment the detector
    queues is written to ``seg-<index>-<duration>-seconds.wav`` in the
    current directory. The loop runs until the user presses Ctrl+C.

    Environment variables:
      SHERPA_ONNX_MIC_SAMPLE_RATE: Integer capture rate for the microphone
        (default 16000). A value other than 16000 requires ``librosa``.
      SHERPA_ONNX_MIC_DEVICE: Integer index of the input device to use
        instead of the sounddevice default.

    Raises:
      RuntimeError: If the path given via ``--silero-vad-model`` is not an
        existing file.
    """
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    mic_sample_rate = 16000
    mic_rate_override = os.environ.get("SHERPA_ONNX_MIC_SAMPLE_RATE")
    if mic_rate_override is not None:
        mic_sample_rate = int(mic_rate_override)
        print(f"Change microphone sample rate to {mic_sample_rate}")

    sample_rate = 16000  # rate the VAD is configured with
    # The read happens at the microphone's rate, so the chunk length must
    # be derived from that rate for it to really span 100 ms.
    samples_per_read = int(0.1 * mic_sample_rate)  # 0.1 second = 100 ms

    # Import the resampler once, up front, and only when it is needed;
    # a missing librosa then fails before recording starts instead of on
    # the first chunk.
    needs_resample = mic_sample_rate != sample_rate
    if needs_resample:
        import librosa

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # python3 -m sounddevice
    # can also be used to list all devices

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        print(
            "If you are using Linux and you are sure there is a microphone "
            "on your system, please use "
            "./vad-alsa.py"
        )
        sys.exit(0)

    print(devices)

    device_override = os.environ.get("SHERPA_ONNX_MIC_DEVICE")
    if device_override is not None:
        input_device_idx = int(device_override)
        sd.default.device[0] = input_device_idx
        print(f'Use selected device: {devices[input_device_idx]["name"]}')
    else:
        input_device_idx = sd.default.device[0]
        print(f'Use default device: {devices[input_device_idx]["name"]}')

    print("Started! Please speak. Press Ctrl C to exit")

    # True while the "Detected speech" banner has already been shown for
    # the current stretch of speech, so it is printed once per utterance.
    printed = False
    k = 0  # running index used in the output file names
    try:
        with sd.InputStream(
            channels=1, dtype="float32", samplerate=mic_sample_rate
        ) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)  # flatten to 1-D

                if needs_resample:
                    samples = librosa.resample(
                        samples, orig_sr=mic_sample_rate, target_sr=sample_rate
                    )

                vad.accept_waveform(samples)

                # Query the detector once per chunk; the banner is printed
                # only on the silence -> speech transition.
                speech_detected = vad.is_speech_detected()
                if speech_detected and not printed:
                    print("Detected speech")
                printed = speech_detected

                # Drain every segment the detector has queued so far.
                while not vad.empty():
                    segment = vad.front.samples
                    duration = len(segment) / sample_rate
                    filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                    k += 1
                    sherpa_onnx.write_wave(filename, segment, sample_rate)
                    print(f"Duration: {duration:.3f} seconds")
                    print(f"Saved to {filename}")
                    print("----------")

                    vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
Oops, something went wrong.