From e7ffcbd677ce4b09d5ec33fac70a051d69af02ce Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 14 Sep 2024 12:30:13 +0800 Subject: [PATCH] Add APIs about max speech duration in VAD for various programming languages (#1349) --- .github/workflows/dot-net.yaml | 2 ++ .../vad-with-non-streaming-asr/bin/paraformer.dart | 1 + .../bin/sense-voice-2.dart | 1 + .../vad-with-non-streaming-asr/bin/sense-voice.dart | 1 + .../bin/telespeech-ctc.dart | 1 + .../vad-with-non-streaming-asr/bin/whisper.dart | 1 + .../bin/zipformer-transducer.dart | 1 + .../sherpa_onnx/lib/src/sherpa_onnx_bindings.dart | 3 +++ flutter/sherpa_onnx/lib/src/vad.dart | 7 +++++-- go-api-examples/vad-asr-paraformer/main.go | 1 + go-api-examples/vad-asr-whisper/main.go | 1 + java-api-examples/VadNonStreamingParaformer.java | 1 + java-api-examples/VadNonStreamingSenseVoice.java | 1 + java-api-examples/VadRemoveSilence.java | 1 + lazarus-examples/generate_subtitles/my_init.pas | 3 ++- .../test_vad_with_non_streaming_asr_whisper.js | 1 + .../test-vad-with-non-streaming-asr-whisper.js | 1 + .../vad-remove-non-speech-segments-from-file.py | 9 +++++++++ scripts/dotnet/SileroVadModelConfig.cs | 3 +++ scripts/go/sherpa_onnx.go | 2 ++ scripts/node-addon-api/lib/vad.js | 3 +++ scripts/node-addon-api/src/vad.cc | 1 + sherpa-onnx/c-api/c-api.cc | 3 +++ sherpa-onnx/c-api/c-api.h | 5 +++++ .../com/k2fsa/sherpa/onnx/SileroVadModelConfig.java | 12 ++++++++++++ sherpa-onnx/jni/voice-activity-detector.cc | 4 ++++ sherpa-onnx/kotlin-api/Vad.kt | 1 + sherpa-onnx/pascal-api/sherpa_onnx.pas | 9 +++++++-- swift-api-examples/SherpaOnnx.swift | 6 ++++-- wasm/vad/sherpa-onnx-vad.js | 7 ++++++- wasm/vad/sherpa-onnx-wasm-main-vad.cc | 4 +++- 31 files changed, 88 insertions(+), 9 deletions(-) diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml index 5299b19ce..36637a9e2 100644 --- a/.github/workflows/dot-net.yaml +++ b/.github/workflows/dot-net.yaml @@ -93,6 +93,8 @@ jobs: git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface cd huggingface + git fetch + git pull mkdir -p windows-for-dotnet cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart index b607c57a1..8d00a671f 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart @@ -32,6 +32,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart index 59493961c..09a7f43b1 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart @@ -38,6 +38,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart index 5f989db60..6489153f6 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart @@ -37,6 +37,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart index acb707beb..b0a09a542 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart @@ -33,6 +33,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart index 6a5ed8f77..be66ee9d4 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart @@ -34,6 +34,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart index f1d3df31c..5ccc7e431 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart @@ -37,6 +37,7 @@ void main(List arguments) async { model: sileroVad, minSilenceDuration: 0.25, minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, ); final vadConfig = sherpa_onnx.VadModelConfig( diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index abc5e1f09..207160087 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct { @Int32() external int windowSize; + + @Float() + external double maxSpeechDuration; } final class SherpaOnnxVadModelConfig extends Struct { diff --git a/flutter/sherpa_onnx/lib/src/vad.dart b/flutter/sherpa_onnx/lib/src/vad.dart index bcab3fd30..10fac5a45 100644 --- a/flutter/sherpa_onnx/lib/src/vad.dart +++ b/flutter/sherpa_onnx/lib/src/vad.dart @@ -11,11 +11,12 @@ class SileroVadModelConfig { this.threshold = 0.5, this.minSilenceDuration = 0.5, this.minSpeechDuration = 0.25, - this.windowSize = 512}); + this.windowSize = 512, + this.maxSpeechDuration = 5.0}); @override String toString() { - return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)'; + return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)'; } final String model; @@ -23,6 +24,7 @@ class SileroVadModelConfig { final double minSilenceDuration; final double minSpeechDuration; final int windowSize; + final double maxSpeechDuration; } class VadModelConfig { @@ -127,6 +129,7 @@ class VoiceActivityDetector { c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration; c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration; c.ref.sileroVad.windowSize = config.sileroVad.windowSize; + c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration; c.ref.sampleRate = config.sampleRate; c.ref.numThreads = config.numThreads; diff --git a/go-api-examples/vad-asr-paraformer/main.go b/go-api-examples/vad-asr-paraformer/main.go index 7beca779b..a5789142e 100644 --- a/go-api-examples/vad-asr-paraformer/main.go +++ b/go-api-examples/vad-asr-paraformer/main.go @@ -22,6 +22,7 @@ func main() { config.SileroVad.MinSilenceDuration = 0.5 config.SileroVad.MinSpeechDuration = 0.25 config.SileroVad.WindowSize = 512 + config.SileroVad.MaxSpeechDuration = 5.0 config.SampleRate = 16000 config.NumThreads = 1 config.Provider = "cpu" diff --git a/go-api-examples/vad-asr-whisper/main.go b/go-api-examples/vad-asr-whisper/main.go index 08a8aef8e..39027517d 100644 --- a/go-api-examples/vad-asr-whisper/main.go +++ b/go-api-examples/vad-asr-whisper/main.go @@ -22,6 +22,7 @@ func main() { config.SileroVad.MinSilenceDuration = 0.5 config.SileroVad.MinSpeechDuration = 0.25 config.SileroVad.WindowSize = 512 + config.SileroVad.MaxSpeechDuration = 5.0 config.SampleRate = 16000 config.NumThreads = 1 config.Provider = "cpu" diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java index eb57f4c14..1757a4b60 100644 --- a/java-api-examples/VadNonStreamingParaformer.java +++ b/java-api-examples/VadNonStreamingParaformer.java @@ -18,6 +18,7 @@ public static Vad createVad() { .setMinSilenceDuration(0.25f) .setMinSpeechDuration(0.5f) .setWindowSize(512) + .setMaxSpeechDuration(5.0f) .build(); VadModelConfig config = diff --git a/java-api-examples/VadNonStreamingSenseVoice.java b/java-api-examples/VadNonStreamingSenseVoice.java index d6d27d447..cbfac6b31 100644 --- a/java-api-examples/VadNonStreamingSenseVoice.java +++ b/java-api-examples/VadNonStreamingSenseVoice.java @@ -18,6 +18,7 @@ public static Vad createVad() { .setMinSilenceDuration(0.25f) .setMinSpeechDuration(0.5f) .setWindowSize(512) + .setMaxSpeechDuration(5.0f) .build(); VadModelConfig config = diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java index 3af1caa7f..511a508e4 100644 --- a/java-api-examples/VadRemoveSilence.java +++ b/java-api-examples/VadRemoveSilence.java @@ -19,6 +19,7 @@ public static void main(String[] args) { .setMinSilenceDuration(0.25f) .setMinSpeechDuration(0.5f) .setWindowSize(512) + .setMaxSpeechDuration(5.0f) .build(); VadModelConfig config = diff --git a/lazarus-examples/generate_subtitles/my_init.pas b/lazarus-examples/generate_subtitles/my_init.pas index 55df79f15..d57448b6d 100644 --- a/lazarus-examples/generate_subtitles/my_init.pas +++ b/lazarus-examples/generate_subtitles/my_init.pas @@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector; WindowSize := 512; {Please don't change it unless you know the details} Config.SileroVad.Model := VadFilename; - Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSpeechDuration := 0.25; Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.MaxSpeechDuration := 5.0; Config.SileroVad.Threshold := 0.5; Config.SileroVad.WindowSize := WindowSize; Config.NumThreads:= 2; diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js index 6f3783e7c..2b672b019 100644 --- a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js +++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js @@ -34,6 +34,7 @@ function createVad() { threshold: 0.5, minSpeechDuration: 0.25, minSilenceDuration: 0.5, + maxSpeechDuration: 5, windowSize: 512, }, sampleRate: 16000, diff --git a/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js index e84c3ab11..7e4e23c5a 100644 --- a/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js +++ b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js @@ -29,6 +29,7 @@ function createVad() { threshold: 0.5, minSpeechDuration: 0.25, minSilenceDuration: 0.5, + maxSpeechDuration: 5, windowSize: 512, }, sampleRate: 16000, diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py index f559e6519..ad4814487 100755 --- a/python-api-examples/vad-remove-non-speech-segments-from-file.py +++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py @@ -90,6 +90,15 @@ def main(): config = sherpa_onnx.VadModelConfig() config.silero_vad.model = args.silero_vad_model + config.silero_vad.threshold = 0.5 + config.silero_vad.min_silence_duration = 0.25 # seconds + config.silero_vad.min_speech_duration = 0.25 # seconds + + # If the current segment is larger than this value, then it increases + # the threshold to 0.9 internally. After detecting this segment, + # it resets the threshold to its original value. + config.silero_vad.max_speech_duration = 5 # seconds + config.sample_rate = sample_rate window_size = config.silero_vad.window_size diff --git a/scripts/dotnet/SileroVadModelConfig.cs b/scripts/dotnet/SileroVadModelConfig.cs index 8bf81ea87..6a80f4dc6 100644 --- a/scripts/dotnet/SileroVadModelConfig.cs +++ b/scripts/dotnet/SileroVadModelConfig.cs @@ -14,6 +14,7 @@ public SileroVadModelConfig() MinSilenceDuration = 0.5F; MinSpeechDuration = 0.25F; WindowSize = 512; + MaxSpeechDuration = 5.0F; } [MarshalAs(UnmanagedType.LPStr)] @@ -26,5 +27,7 @@ public SileroVadModelConfig() public float MinSpeechDuration; public int WindowSize; + + public float MaxSpeechDuration; } } diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index aeee609ca..ad5060c2c 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -771,6 +771,7 @@ type SileroVadModelConfig struct { MinSilenceDuration float32 MinSpeechDuration float32 WindowSize int + MaxSpeechDuration float32 } type VadModelConfig struct { @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3 c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration) c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration) c.silero_vad.window_size = C.int(config.SileroVad.WindowSize) + c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration) c.sample_rate = C.int(config.SampleRate) c.num_threads = C.int(config.NumThreads) diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js index 3ef7b6cad..a71dfdb66 100644 --- a/scripts/node-addon-api/lib/vad.js +++ b/scripts/node-addon-api/lib/vad.js @@ -39,6 +39,9 @@ config = { sileroVad: { model: "./silero_vad.onnx", threshold: 0.5, + minSilenceDuration: 0.5, + minSpeechDuration: 0.25, + maxSpeechDuration: 5, } } */ diff --git a/scripts/node-addon-api/src/vad.cc b/scripts/node-addon-api/src/vad.cc index de92337db..eaed2aeea 100644 --- a/scripts/node-addon-api/src/vad.cc +++ b/scripts/node-addon-api/src/vad.cc @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration); SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration); SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration); return c; } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 0a4c683ee..6b5d6f73a 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( vad_config.silero_vad.window_size = SHERPA_ONNX_OR(config->silero_vad.window_size, 512); + vad_config.silero_vad.max_speech_duration = + SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20); + vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 67746e587..3be5a19cd 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { float min_speech_duration; int window_size; + + // If a speech segment is longer than this value, then we increase + // the threshold to 0.9. After finishing detecting the segment, + // the threshold value is reset to its original value. + float max_speech_duration; } SherpaOnnxSileroVadModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java index 1cf019c0d..37b3d14ea 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java @@ -8,6 +8,7 @@ public class SileroVadModelConfig { private final float minSilenceDuration; private final float minSpeechDuration; private final int windowSize; + private final float maxSpeechDuration; private SileroVadModelConfig(Builder builder) { this.model = builder.model; @@ -15,6 +16,7 @@ private SileroVadModelConfig(Builder builder) { this.minSilenceDuration = builder.minSilenceDuration; this.minSpeechDuration = builder.minSpeechDuration; this.windowSize = builder.windowSize; + this.maxSpeechDuration = builder.maxSpeechDuration; } public static Builder builder() { @@ -41,12 +43,17 @@ public int getWindowSize() { return windowSize; } + public float getMaxSpeechDuration() { + return maxSpeechDuration; + } + public static class Builder { private String model = ""; private float threshold = 0.5f; private float minSilenceDuration = 0.25f; private float minSpeechDuration = 0.5f; private int windowSize = 512; + private float maxSpeechDuration = 5.0f; public SileroVadModelConfig build() { return new SileroVadModelConfig(this); @@ -77,5 +84,10 @@ public Builder setWindowSize(int windowSize) { this.windowSize = windowSize; return this; } + + public Builder setMaxSpeechDuration(float maxSpeechDuration) { + this.maxSpeechDuration = maxSpeechDuration; + return this; + } } } diff --git a/sherpa-onnx/jni/voice-activity-detector.cc b/sherpa-onnx/jni/voice-activity-detector.cc index 1f59ae62b..319edd09b 100644 --- a/sherpa-onnx/jni/voice-activity-detector.cc +++ b/sherpa-onnx/jni/voice-activity-detector.cc @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) { fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I"); ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid); + fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F"); + ans.silero_vad.max_speech_duration = + env->GetFloatField(silero_vad_config, fid); + fid = env->GetFieldID(cls, "sampleRate", "I"); ans.sample_rate = env->GetIntField(config, fid); diff --git a/sherpa-onnx/kotlin-api/Vad.kt b/sherpa-onnx/kotlin-api/Vad.kt index 182a23d5f..08a458505 100644 --- a/sherpa-onnx/kotlin-api/Vad.kt +++ b/sherpa-onnx/kotlin-api/Vad.kt @@ -9,6 +9,7 @@ data class SileroVadModelConfig( var minSilenceDuration: Float = 0.25F, var minSpeechDuration: Float = 0.25F, var windowSize: Int = 512, + var maxSpeechDuration: Float = 5.0F, ) data class VadModelConfig( diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 987b31f14..7f05793e1 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -341,6 +341,7 @@ TSherpaOnnxSileroVadModelConfig = record MinSilenceDuration: Single; MinSpeechDuration: Single; WindowSize: Integer; + MaxSpeechDuration: Single; function ToString: AnsiString; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); end; @@ -594,6 +595,7 @@ SherpaOnnxSileroVadModelConfig = record MinSilenceDuration: cfloat; MinSpeechDuration: cfloat; WindowSize: cint32; + MaxSpeechDuration: cfloat; end; SherpaOnnxVadModelConfig = record SileroVad: SherpaOnnxSileroVadModelConfig; @@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString; 'Threshold := %.2f, ' + 'MinSilenceDuration := %.2f, ' + 'MinSpeechDuration := %.2f, ' + - 'WindowSize := %d' + + 'WindowSize := %d, ' + + 'MaxSpeechDuration := %.2f' + ')', [Self.Model, Self.Threshold, Self.MinSilenceDuration, - Self.MinSpeechDuration, Self.WindowSize + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration ]); end; @@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString; Dest.MinSilenceDuration := 0.5; Dest.MinSpeechDuration := 0.25; Dest.WindowSize := 512; + Dest.MaxSpeechDuration := 5.0; end; function TSherpaOnnxVadModelConfig.ToString: AnsiString; @@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration; C.SileroVad.WindowSize := Config.SileroVad.WindowSize; + C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; C.SampleRate := Config.SampleRate; C.NumThreads := Config.NumThreads; diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index af78b8014..e24819306 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig( threshold: Float = 0.5, minSilenceDuration: Float = 0.25, minSpeechDuration: Float = 0.5, - windowSize: Int = 512 + windowSize: Int = 512, + maxSpeechDuration: Float = 5.0 ) -> SherpaOnnxSileroVadModelConfig { return SherpaOnnxSileroVadModelConfig( model: toCPointer(model), threshold: threshold, min_silence_duration: minSilenceDuration, min_speech_duration: minSpeechDuration, - window_size: Int32(windowSize) + window_size: Int32(windowSize), + max_speech_duration: maxSpeechDuration ) } diff --git a/wasm/vad/sherpa-onnx-vad.js b/wasm/vad/sherpa-onnx-vad.js index 8be629c78..0dc87906a 100644 --- a/wasm/vad/sherpa-onnx-vad.js +++ b/wasm/vad/sherpa-onnx-vad.js @@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { const buffer = Module._malloc(n); - const len = 5 * 4; + const len = 6 * 4; const ptr = Module._malloc(len); Module.stringToUTF8(config.model || '', buffer, modelLen); @@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { Module.setValue(ptr + offset, config.windowSize || 512, 'i32'); offset += 4; + Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float'); + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, } @@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) { minSilenceDuration: 0.50, minSpeechDuration: 0.25, windowSize: 512, + maxSpeechDuration: 20, }; } @@ -93,6 +97,7 @@ function createVad(Module, myConfig) { threshold: 0.50, minSilenceDuration: 0.50, minSpeechDuration: 0.25, + maxSpeechDuration: 20, windowSize: 512, }; diff --git a/wasm/vad/sherpa-onnx-wasm-main-vad.cc b/wasm/vad/sherpa-onnx-wasm-main-vad.cc index 3c1600ba1..574ca6690 100644 --- a/wasm/vad/sherpa-onnx-wasm-main-vad.cc +++ b/wasm/vad/sherpa-onnx-wasm-main-vad.cc @@ -13,7 +13,7 @@ extern "C" { -static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, ""); +static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, ""); static_assert(sizeof(SherpaOnnxVadModelConfig) == sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4, @@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) { fprintf(stdout, "min_speech_duration: %.3f\n", silero_vad->min_speech_duration); fprintf(stdout, "window_size: %d\n", silero_vad->window_size); + fprintf(stdout, "max_speech_duration: %.3f\n", + silero_vad->max_speech_duration); fprintf(stdout, "----------config----------\n");