-
Notifications
You must be signed in to change notification settings - Fork 424
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ccb2d43
commit 69c7880
Showing
28 changed files
with
674 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,3 +86,4 @@ vits-piper-* | |
vits-coqui-* | ||
vits-mms-* | ||
*.tar.bz2 | ||
sherpa-onnx-paraformer-trilingual-zh-cantonese-en |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module vad-asr-paraformer | ||
|
||
go 1.12 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"github.com/gordonklaus/portaudio" | ||
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" | ||
"log" | ||
"strings" | ||
) | ||
|
||
func main() { | ||
log.SetFlags(log.LstdFlags | log.Lmicroseconds) | ||
|
||
// 1. Create VAD | ||
config := sherpa.VadModelConfig{} | ||
|
||
// Please download silero_vad.onnx from | ||
// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
|
||
config.SileroVad.Model = "./silero_vad.onnx" | ||
config.SileroVad.Threshold = 0.5 | ||
config.SileroVad.MinSilenceDuration = 0.5 | ||
config.SileroVad.MinSpeechDuration = 0.25 | ||
config.SileroVad.WindowSize = 512 | ||
config.SampleRate = 16000 | ||
config.NumThreads = 1 | ||
config.Provider = "cpu" | ||
config.Debug = 1 | ||
|
||
var bufferSizeInSeconds float32 = 20 | ||
|
||
vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds) | ||
defer sherpa.DeleteVoiceActivityDetector(vad) | ||
|
||
// 2. Create ASR recognizer | ||
|
||
c := sherpa.OfflineRecognizerConfig{} | ||
c.FeatConfig.SampleRate = 16000 | ||
c.FeatConfig.FeatureDim = 80 | ||
|
||
// Please download the model from | ||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2 | ||
c.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx" | ||
c.ModelConfig.Tokens = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt" | ||
c.ModelConfig.NumThreads = 2 | ||
c.ModelConfig.Debug = 1 | ||
c.ModelConfig.Provider = "cpu" | ||
|
||
recognizer := sherpa.NewOfflineRecognizer(&c) | ||
defer sherpa.DeleteOfflineRecognizer(recognizer) | ||
|
||
err := portaudio.Initialize() | ||
if err != nil { | ||
log.Fatalf("Unable to initialize portaudio: %v\n", err) | ||
} | ||
defer portaudio.Terminate() | ||
|
||
default_device, err := portaudio.DefaultInputDevice() | ||
if err != nil { | ||
log.Fatal("Failed to get default input device: %v\n", err) | ||
} | ||
log.Printf("Selected default input device: %s\n", default_device.Name) | ||
param := portaudio.StreamParameters{} | ||
param.Input.Device = default_device | ||
param.Input.Channels = 1 | ||
param.Input.Latency = default_device.DefaultHighInputLatency | ||
|
||
param.SampleRate = float64(config.SampleRate) | ||
param.FramesPerBuffer = 0 | ||
param.Flags = portaudio.ClipOff | ||
|
||
// you can choose another value for 0.1 if you want | ||
samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second | ||
samples := make([]float32, samplesPerCall) | ||
|
||
s, err := portaudio.OpenStream(param, samples) | ||
if err != nil { | ||
log.Fatalf("Failed to open the stream") | ||
} | ||
|
||
defer s.Close() | ||
chk(s.Start()) | ||
|
||
log.Print("Started! Please speak") | ||
printed := false | ||
|
||
k := 0 | ||
for { | ||
chk(s.Read()) | ||
vad.AcceptWaveform(samples) | ||
|
||
if vad.IsSpeech() && !printed { | ||
printed = true | ||
log.Print("Detected speech\n") | ||
} | ||
|
||
if !vad.IsSpeech() { | ||
printed = false | ||
} | ||
|
||
for !vad.IsEmpty() { | ||
speechSegment := vad.Front() | ||
vad.Pop() | ||
|
||
duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate) | ||
|
||
audio := &sherpa.GeneratedAudio{} | ||
audio.Samples = speechSegment.Samples | ||
audio.SampleRate = config.SampleRate | ||
|
||
// Now decode it | ||
go decode(recognizer, audio, k) | ||
|
||
k += 1 | ||
|
||
log.Printf("Duration: %.2f seconds\n", duration) | ||
} | ||
} | ||
|
||
chk(s.Stop()) | ||
} | ||
|
||
func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) { | ||
stream := sherpa.NewOfflineStream(recognizer) | ||
defer sherpa.DeleteOfflineStream(stream) | ||
stream.AcceptWaveform(audio.SampleRate, audio.Samples) | ||
recognizer.Decode(stream) | ||
result := stream.GetResult() | ||
text := strings.ToLower(result.Text) | ||
text = strings.Trim(text, " ") | ||
log.Println(text) | ||
|
||
duration := float32(len(audio.Samples)) / float32(audio.SampleRate) | ||
|
||
filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text) | ||
ok := audio.Save(filename) | ||
if ok { | ||
log.Printf("Saved to %s", filename) | ||
} | ||
log.Print("----------\n") | ||
} | ||
|
||
// chk panics with err when err is non-nil and does nothing otherwise.
func chk(err error) {
	if err == nil {
		return
	}
	panic(err)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env bash
#
# Downloads the VAD and Paraformer models if they are missing, then builds
# and runs the vad-asr-paraformer example.

# Stop at the first failing command so a failed download or build never falls
# through to running a missing or stale binary.
set -e

if [ ! -f ./silero_vad.onnx ]; then
  # Fetch the model file itself; a github.com/.../blob/... URL would save the
  # HTML page that wraps the file. -f makes curl fail on HTTP errors instead
  # of saving the error page.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  tar xvf sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  rm sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
fi

go mod tidy
go build
./vad-asr-paraformer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module vad-asr-whisper | ||
|
||
go 1.12 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"github.com/gordonklaus/portaudio" | ||
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" | ||
"log" | ||
"strings" | ||
) | ||
|
||
func main() { | ||
log.SetFlags(log.LstdFlags | log.Lmicroseconds) | ||
|
||
// 1. Create VAD | ||
config := sherpa.VadModelConfig{} | ||
|
||
// Please download silero_vad.onnx from | ||
// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
|
||
config.SileroVad.Model = "./silero_vad.onnx" | ||
config.SileroVad.Threshold = 0.5 | ||
config.SileroVad.MinSilenceDuration = 0.5 | ||
config.SileroVad.MinSpeechDuration = 0.25 | ||
config.SileroVad.WindowSize = 512 | ||
config.SampleRate = 16000 | ||
config.NumThreads = 1 | ||
config.Provider = "cpu" | ||
config.Debug = 1 | ||
|
||
var bufferSizeInSeconds float32 = 20 | ||
|
||
vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds) | ||
defer sherpa.DeleteVoiceActivityDetector(vad) | ||
|
||
// 2. Create ASR recognizer | ||
|
||
c := sherpa.OfflineRecognizerConfig{} | ||
c.FeatConfig.SampleRate = 16000 | ||
c.FeatConfig.FeatureDim = 80 | ||
c.ModelConfig.Whisper.Encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx" | ||
c.ModelConfig.Whisper.Decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx" | ||
c.ModelConfig.Tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt" | ||
c.ModelConfig.NumThreads = 2 | ||
c.ModelConfig.Debug = 1 | ||
c.ModelConfig.Provider = "cpu" | ||
|
||
recognizer := sherpa.NewOfflineRecognizer(&c) | ||
defer sherpa.DeleteOfflineRecognizer(recognizer) | ||
|
||
err := portaudio.Initialize() | ||
if err != nil { | ||
log.Fatalf("Unable to initialize portaudio: %v\n", err) | ||
} | ||
defer portaudio.Terminate() | ||
|
||
default_device, err := portaudio.DefaultInputDevice() | ||
if err != nil { | ||
log.Fatal("Failed to get default input device: %v\n", err) | ||
} | ||
log.Printf("Selected default input device: %s\n", default_device.Name) | ||
param := portaudio.StreamParameters{} | ||
param.Input.Device = default_device | ||
param.Input.Channels = 1 | ||
param.Input.Latency = default_device.DefaultHighInputLatency | ||
|
||
param.SampleRate = float64(config.SampleRate) | ||
param.FramesPerBuffer = 0 | ||
param.Flags = portaudio.ClipOff | ||
|
||
// you can choose another value for 0.1 if you want | ||
samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second | ||
samples := make([]float32, samplesPerCall) | ||
|
||
s, err := portaudio.OpenStream(param, samples) | ||
if err != nil { | ||
log.Fatalf("Failed to open the stream") | ||
} | ||
|
||
defer s.Close() | ||
chk(s.Start()) | ||
|
||
log.Print("Started! Please speak") | ||
printed := false | ||
|
||
k := 0 | ||
for { | ||
chk(s.Read()) | ||
vad.AcceptWaveform(samples) | ||
|
||
if vad.IsSpeech() && !printed { | ||
printed = true | ||
log.Print("Detected speech\n") | ||
} | ||
|
||
if !vad.IsSpeech() { | ||
printed = false | ||
} | ||
|
||
for !vad.IsEmpty() { | ||
speechSegment := vad.Front() | ||
vad.Pop() | ||
|
||
duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate) | ||
|
||
audio := &sherpa.GeneratedAudio{} | ||
audio.Samples = speechSegment.Samples | ||
audio.SampleRate = config.SampleRate | ||
|
||
// Now decode it | ||
go decode(recognizer, audio, k) | ||
|
||
k += 1 | ||
|
||
log.Printf("Duration: %.2f seconds\n", duration) | ||
} | ||
} | ||
|
||
chk(s.Stop()) | ||
} | ||
|
||
func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) { | ||
stream := sherpa.NewOfflineStream(recognizer) | ||
defer sherpa.DeleteOfflineStream(stream) | ||
stream.AcceptWaveform(audio.SampleRate, audio.Samples) | ||
recognizer.Decode(stream) | ||
result := stream.GetResult() | ||
text := strings.ToLower(result.Text) | ||
text = strings.Trim(text, " ") | ||
log.Println(text) | ||
|
||
duration := float32(len(audio.Samples)) / float32(audio.SampleRate) | ||
|
||
filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text) | ||
ok := audio.Save(filename) | ||
if ok { | ||
log.Printf("Saved to %s", filename) | ||
} | ||
log.Print("----------\n") | ||
} | ||
|
||
// chk panics with err when err is non-nil and does nothing otherwise.
func chk(err error) {
	if err == nil {
		return
	}
	panic(err)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env bash
#
# Downloads the VAD and Whisper tiny.en models if they are missing, then
# builds and runs the vad-asr-whisper example.

# Stop at the first failing command so a failed download or build never falls
# through to running a missing or stale binary.
set -e

if [ ! -f ./silero_vad.onnx ]; then
  # Fetch the model file itself; a github.com/.../blob/... URL would save the
  # HTML page that wraps the file. -f makes curl fail on HTTP errors instead
  # of saving the error page.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

go mod tidy
go build
./vad-asr-whisper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module vad | ||
|
||
go 1.12 |
Oops, something went wrong.