From e7ffcbd677ce4b09d5ec33fac70a051d69af02ce Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Sat, 14 Sep 2024 12:30:13 +0800
Subject: [PATCH] Add APIs about max speech duration in VAD for various
 programming languages (#1349)

---
 .github/workflows/dot-net.yaml                       |  2 ++
 .../vad-with-non-streaming-asr/bin/paraformer.dart   |  1 +
 .../bin/sense-voice-2.dart                           |  1 +
 .../vad-with-non-streaming-asr/bin/sense-voice.dart  |  1 +
 .../bin/telespeech-ctc.dart                          |  1 +
 .../vad-with-non-streaming-asr/bin/whisper.dart      |  1 +
 .../bin/zipformer-transducer.dart                    |  1 +
 .../sherpa_onnx/lib/src/sherpa_onnx_bindings.dart    |  3 +++
 flutter/sherpa_onnx/lib/src/vad.dart                 |  7 +++++--
 go-api-examples/vad-asr-paraformer/main.go           |  1 +
 go-api-examples/vad-asr-whisper/main.go              |  1 +
 java-api-examples/VadNonStreamingParaformer.java     |  1 +
 java-api-examples/VadNonStreamingSenseVoice.java     |  1 +
 java-api-examples/VadRemoveSilence.java              |  1 +
 lazarus-examples/generate_subtitles/my_init.pas      |  3 ++-
 .../test_vad_with_non_streaming_asr_whisper.js       |  1 +
 .../test-vad-with-non-streaming-asr-whisper.js       |  1 +
 .../vad-remove-non-speech-segments-from-file.py      |  9 +++++++++
 scripts/dotnet/SileroVadModelConfig.cs               |  3 +++
 scripts/go/sherpa_onnx.go                            |  2 ++
 scripts/node-addon-api/lib/vad.js                    |  3 +++
 scripts/node-addon-api/src/vad.cc                    |  1 +
 sherpa-onnx/c-api/c-api.cc                           |  3 +++
 sherpa-onnx/c-api/c-api.h                            |  5 +++++
 .../com/k2fsa/sherpa/onnx/SileroVadModelConfig.java  | 12 ++++++++++++
 sherpa-onnx/jni/voice-activity-detector.cc           |  4 ++++
 sherpa-onnx/kotlin-api/Vad.kt                        |  1 +
 sherpa-onnx/pascal-api/sherpa_onnx.pas               |  9 +++++++--
 swift-api-examples/SherpaOnnx.swift                  |  6 ++++--
 wasm/vad/sherpa-onnx-vad.js                          |  7 ++++++-
 wasm/vad/sherpa-onnx-wasm-main-vad.cc                |  4 +++-
 31 files changed, 88 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml
index 5299b19ce..36637a9e2 100644
--- a/.github/workflows/dot-net.yaml
+++ b/.github/workflows/dot-net.yaml
@@ -93,6 +93,8 @@ jobs:
             git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface
+            git fetch
+            git pull
             mkdir -p windows-for-dotnet
 
             cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart
index b607c57a1..8d00a671f 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart
@@ -32,6 +32,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart
index 59493961c..09a7f43b1 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart
@@ -38,6 +38,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart
index 5f989db60..6489153f6 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart
index acb707beb..b0a09a542 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart
@@ -33,6 +33,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart
index 6a5ed8f77..be66ee9d4 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart
@@ -34,6 +34,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart
index f1d3df31c..5ccc7e431 100644
--- a/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart
+++ b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
     model: sileroVad,
     minSilenceDuration: 0.25,
     minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
   );
 
   final vadConfig = sherpa_onnx.VadModelConfig(
diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
index abc5e1f09..207160087 100644
--- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
+++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {
 
   @Int32()
   external int windowSize;
+
+  @Float()
+  external double maxSpeechDuration;
 }
 
 final class SherpaOnnxVadModelConfig extends Struct {
diff --git a/flutter/sherpa_onnx/lib/src/vad.dart b/flutter/sherpa_onnx/lib/src/vad.dart
index bcab3fd30..10fac5a45 100644
--- a/flutter/sherpa_onnx/lib/src/vad.dart
+++ b/flutter/sherpa_onnx/lib/src/vad.dart
@@ -11,11 +11,12 @@ class SileroVadModelConfig {
       this.threshold = 0.5,
       this.minSilenceDuration = 0.5,
       this.minSpeechDuration = 0.25,
-      this.windowSize = 512});
+      this.windowSize = 512,
+      this.maxSpeechDuration = 5.0});
 
   @override
   String toString() {
-    return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)';
+    return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
   }
 
   final String model;
@@ -23,6 +24,7 @@ class SileroVadModelConfig {
   final double minSilenceDuration;
   final double minSpeechDuration;
   final int windowSize;
+  final double maxSpeechDuration;
 }
 
 class VadModelConfig {
@@ -127,6 +129,7 @@ class VoiceActivityDetector {
     c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
     c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
     c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
+    c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;
 
     c.ref.sampleRate = config.sampleRate;
     c.ref.numThreads = config.numThreads;
diff --git a/go-api-examples/vad-asr-paraformer/main.go b/go-api-examples/vad-asr-paraformer/main.go
index 7beca779b..a5789142e 100644
--- a/go-api-examples/vad-asr-paraformer/main.go
+++ b/go-api-examples/vad-asr-paraformer/main.go
@@ -22,6 +22,7 @@ func main() {
 	config.SileroVad.MinSilenceDuration = 0.5
 	config.SileroVad.MinSpeechDuration = 0.25
 	config.SileroVad.WindowSize = 512
+	config.SileroVad.MaxSpeechDuration = 5.0
 	config.SampleRate = 16000
 	config.NumThreads = 1
 	config.Provider = "cpu"
diff --git a/go-api-examples/vad-asr-whisper/main.go b/go-api-examples/vad-asr-whisper/main.go
index 08a8aef8e..39027517d 100644
--- a/go-api-examples/vad-asr-whisper/main.go
+++ b/go-api-examples/vad-asr-whisper/main.go
@@ -22,6 +22,7 @@ func main() {
 	config.SileroVad.MinSilenceDuration = 0.5
 	config.SileroVad.MinSpeechDuration = 0.25
 	config.SileroVad.WindowSize = 512
+	config.SileroVad.MaxSpeechDuration = 5.0
 	config.SampleRate = 16000
 	config.NumThreads = 1
 	config.Provider = "cpu"
diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java
index eb57f4c14..1757a4b60 100644
--- a/java-api-examples/VadNonStreamingParaformer.java
+++ b/java-api-examples/VadNonStreamingParaformer.java
@@ -18,6 +18,7 @@ public static Vad createVad() {
             .setMinSilenceDuration(0.25f)
             .setMinSpeechDuration(0.5f)
             .setWindowSize(512)
+            .setMaxSpeechDuration(5.0f)
             .build();
 
     VadModelConfig config =
diff --git a/java-api-examples/VadNonStreamingSenseVoice.java b/java-api-examples/VadNonStreamingSenseVoice.java
index d6d27d447..cbfac6b31 100644
--- a/java-api-examples/VadNonStreamingSenseVoice.java
+++ b/java-api-examples/VadNonStreamingSenseVoice.java
@@ -18,6 +18,7 @@ public static Vad createVad() {
             .setMinSilenceDuration(0.25f)
             .setMinSpeechDuration(0.5f)
             .setWindowSize(512)
+            .setMaxSpeechDuration(5.0f)
             .build();
 
     VadModelConfig config =
diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java
index 3af1caa7f..511a508e4 100644
--- a/java-api-examples/VadRemoveSilence.java
+++ b/java-api-examples/VadRemoveSilence.java
@@ -19,6 +19,7 @@ public static void main(String[] args) {
             .setMinSilenceDuration(0.25f)
             .setMinSpeechDuration(0.5f)
             .setWindowSize(512)
+            .setMaxSpeechDuration(5.0f)
             .build();
 
     VadModelConfig config =
diff --git a/lazarus-examples/generate_subtitles/my_init.pas b/lazarus-examples/generate_subtitles/my_init.pas
index 55df79f15..d57448b6d 100644
--- a/lazarus-examples/generate_subtitles/my_init.pas
+++ b/lazarus-examples/generate_subtitles/my_init.pas
@@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector;
   WindowSize := 512; {Please don't change it unless you know the details}
 
   Config.SileroVad.Model := VadFilename;
-  Config.SileroVad.MinSpeechDuration := 0.5;
+  Config.SileroVad.MinSpeechDuration := 0.25;
   Config.SileroVad.MinSilenceDuration := 0.5;
+  Config.SileroVad.MaxSpeechDuration := 5.0;
   Config.SileroVad.Threshold := 0.5;
   Config.SileroVad.WindowSize := WindowSize;
   Config.NumThreads:= 2;
diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
index 6f3783e7c..2b672b019 100644
--- a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
+++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
@@ -34,6 +34,7 @@ function createVad() {
       threshold: 0.5,
       minSpeechDuration: 0.25,
       minSilenceDuration: 0.5,
+      maxSpeechDuration: 5,
       windowSize: 512,
     },
     sampleRate: 16000,
diff --git a/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
index e84c3ab11..7e4e23c5a 100644
--- a/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
+++ b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
@@ -29,6 +29,7 @@ function createVad() {
       threshold: 0.5,
       minSpeechDuration: 0.25,
       minSilenceDuration: 0.5,
+      maxSpeechDuration: 5,
       windowSize: 512,
     },
     sampleRate: 16000,
diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py
index f559e6519..ad4814487 100755
--- a/python-api-examples/vad-remove-non-speech-segments-from-file.py
+++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -90,6 +90,15 @@ def main():
 
     config = sherpa_onnx.VadModelConfig()
     config.silero_vad.model = args.silero_vad_model
+    config.silero_vad.threshold = 0.5
+    config.silero_vad.min_silence_duration = 0.25  # seconds
+    config.silero_vad.min_speech_duration = 0.25  # seconds
+
+    # If the current segment is longer than this value, then it increases
+    # the threshold to 0.9 internally. After detecting this segment,
+    # it resets the threshold to its original value.
+    config.silero_vad.max_speech_duration = 5  # seconds
+
     config.sample_rate = sample_rate
 
     window_size = config.silero_vad.window_size
diff --git a/scripts/dotnet/SileroVadModelConfig.cs b/scripts/dotnet/SileroVadModelConfig.cs
index 8bf81ea87..6a80f4dc6 100644
--- a/scripts/dotnet/SileroVadModelConfig.cs
+++ b/scripts/dotnet/SileroVadModelConfig.cs
@@ -14,6 +14,7 @@ public SileroVadModelConfig()
             MinSilenceDuration = 0.5F;
             MinSpeechDuration = 0.25F;
             WindowSize = 512;
+            MaxSpeechDuration = 5.0F;
         }
 
         [MarshalAs(UnmanagedType.LPStr)]
@@ -26,5 +27,7 @@ public SileroVadModelConfig()
         public float MinSpeechDuration;
 
         public int WindowSize;
+
+        public float MaxSpeechDuration;
     }
 }
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
index aeee609ca..ad5060c2c 100644
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
 	MinSilenceDuration float32
 	MinSpeechDuration  float32
 	WindowSize         int
+	MaxSpeechDuration  float32
 }
 
 type VadModelConfig struct {
@@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
 	c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
 	c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
 	c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
+	c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
 
 	c.sample_rate = C.int(config.SampleRate)
 	c.num_threads = C.int(config.NumThreads)
diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js
index 3ef7b6cad..a71dfdb66 100644
--- a/scripts/node-addon-api/lib/vad.js
+++ b/scripts/node-addon-api/lib/vad.js
@@ -39,6 +39,9 @@ config = {
   sileroVad: {
     model: "./silero_vad.onnx",
     threshold: 0.5,
+    minSilenceDuration: 0.5,
+    minSpeechDuration: 0.25,
+    maxSpeechDuration: 5,
   }
 }
    */
diff --git a/scripts/node-addon-api/src/vad.cc b/scripts/node-addon-api/src/vad.cc
index de92337db..eaed2aeea 100644
--- a/scripts/node-addon-api/src/vad.cc
+++ b/scripts/node-addon-api/src/vad.cc
@@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
   SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
   SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
 
   return c;
 }
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 0a4c683ee..6b5d6f73a 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
   vad_config.silero_vad.window_size =
       SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
 
+  vad_config.silero_vad.max_speech_duration =
+      SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
+
   vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
   vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
   vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 67746e587..3be5a19cd 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
   float min_speech_duration;
 
   int window_size;
+
+  // If a speech segment is longer than this value (in seconds), the
+  // threshold is increased to 0.9. After the segment has been detected,
+  // the threshold is reset to its original value.
+  float max_speech_duration;
 } SherpaOnnxSileroVadModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java
index 1cf019c0d..37b3d14ea 100644
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java
@@ -8,6 +8,7 @@ public class SileroVadModelConfig {
     private final float minSilenceDuration;
     private final float minSpeechDuration;
     private final int windowSize;
+    private final float maxSpeechDuration;
 
     private SileroVadModelConfig(Builder builder) {
         this.model = builder.model;
@@ -15,6 +16,7 @@ private SileroVadModelConfig(Builder builder) {
         this.minSilenceDuration = builder.minSilenceDuration;
         this.minSpeechDuration = builder.minSpeechDuration;
         this.windowSize = builder.windowSize;
+        this.maxSpeechDuration = builder.maxSpeechDuration;
     }
 
     public static Builder builder() {
@@ -41,12 +43,17 @@ public int getWindowSize() {
         return windowSize;
     }
 
+    public float getMaxSpeechDuration() {
+        return maxSpeechDuration;
+    }
+
     public static class Builder {
         private String model = "";
         private float threshold = 0.5f;
         private float minSilenceDuration = 0.25f;
         private float minSpeechDuration = 0.5f;
         private int windowSize = 512;
+        private float maxSpeechDuration = 5.0f;
 
         public SileroVadModelConfig build() {
             return new SileroVadModelConfig(this);
@@ -77,5 +84,10 @@ public Builder setWindowSize(int windowSize) {
             this.windowSize = windowSize;
             return this;
         }
+
+        public Builder setMaxSpeechDuration(float maxSpeechDuration) {
+            this.maxSpeechDuration = maxSpeechDuration;
+            return this;
+        }
     }
 }
diff --git a/sherpa-onnx/jni/voice-activity-detector.cc b/sherpa-onnx/jni/voice-activity-detector.cc
index 1f59ae62b..319edd09b 100644
--- a/sherpa-onnx/jni/voice-activity-detector.cc
+++ b/sherpa-onnx/jni/voice-activity-detector.cc
@@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
   fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
   ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
 
+  fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
+  ans.silero_vad.max_speech_duration =
+      env->GetFloatField(silero_vad_config, fid);
+
   fid = env->GetFieldID(cls, "sampleRate", "I");
   ans.sample_rate = env->GetIntField(config, fid);
 
diff --git a/sherpa-onnx/kotlin-api/Vad.kt b/sherpa-onnx/kotlin-api/Vad.kt
index 182a23d5f..08a458505 100644
--- a/sherpa-onnx/kotlin-api/Vad.kt
+++ b/sherpa-onnx/kotlin-api/Vad.kt
@@ -9,6 +9,7 @@ data class SileroVadModelConfig(
     var minSilenceDuration: Float = 0.25F,
     var minSpeechDuration: Float = 0.25F,
     var windowSize: Int = 512,
+    var maxSpeechDuration: Float = 5.0F,
 )
 
 data class VadModelConfig(
diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas
index 987b31f14..7f05793e1 100644
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -341,6 +341,7 @@   TSherpaOnnxSileroVadModelConfig = record
     MinSilenceDuration: Single;
     MinSpeechDuration: Single;
     WindowSize: Integer;
+    MaxSpeechDuration: Single;
     function ToString: AnsiString;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
   end;
@@ -594,6 +595,7 @@   SherpaOnnxSileroVadModelConfig = record
     MinSilenceDuration: cfloat;
     MinSpeechDuration: cfloat;
     WindowSize: cint32;
+    MaxSpeechDuration: cfloat;
   end;
   SherpaOnnxVadModelConfig = record
     SileroVad: SherpaOnnxSileroVadModelConfig;
@@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
     'Threshold := %.2f, ' +
     'MinSilenceDuration := %.2f, ' +
     'MinSpeechDuration := %.2f, ' +
-    'WindowSize := %d' +
+    'WindowSize := %d, ' +
+    'MaxSpeechDuration := %.2f' +
     ')',
     [Self.Model, Self.Threshold, Self.MinSilenceDuration,
-     Self.MinSpeechDuration, Self.WindowSize
+     Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
     ]);
 end;
 
@@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
   Dest.MinSilenceDuration := 0.5;
   Dest.MinSpeechDuration := 0.25;
   Dest.WindowSize := 512;
+  Dest.MaxSpeechDuration := 5.0;
 end;
 
 function TSherpaOnnxVadModelConfig.ToString: AnsiString;
@@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC
   C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
   C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
   C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
+  C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
 
   C.SampleRate := Config.SampleRate;
   C.NumThreads := Config.NumThreads;
diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift
index af78b8014..e24819306 100644
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
   threshold: Float = 0.5,
   minSilenceDuration: Float = 0.25,
   minSpeechDuration: Float = 0.5,
-  windowSize: Int = 512
+  windowSize: Int = 512,
+  maxSpeechDuration: Float = 5.0
 ) -> SherpaOnnxSileroVadModelConfig {
   return SherpaOnnxSileroVadModelConfig(
     model: toCPointer(model),
     threshold: threshold,
     min_silence_duration: minSilenceDuration,
     min_speech_duration: minSpeechDuration,
-    window_size: Int32(windowSize)
+    window_size: Int32(windowSize),
+    max_speech_duration: maxSpeechDuration
   )
 }
 
diff --git a/wasm/vad/sherpa-onnx-vad.js b/wasm/vad/sherpa-onnx-vad.js
index 8be629c78..0dc87906a 100644
--- a/wasm/vad/sherpa-onnx-vad.js
+++ b/wasm/vad/sherpa-onnx-vad.js
@@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
 
   const buffer = Module._malloc(n);
 
-  const len = 5 * 4;
+  const len = 6 * 4;
   const ptr = Module._malloc(len);
 
   Module.stringToUTF8(config.model || '', buffer, modelLen);
@@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
   Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
   offset += 4;
 
+  Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float');
+  offset += 4;
+
   return {
     buffer: buffer, ptr: ptr, len: len,
   }
@@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) {
       minSilenceDuration: 0.50,
       minSpeechDuration: 0.25,
       windowSize: 512,
+      maxSpeechDuration: 20,
     };
   }
 
@@ -93,6 +97,7 @@ function createVad(Module, myConfig) {
     threshold: 0.50,
     minSilenceDuration: 0.50,
     minSpeechDuration: 0.25,
+    maxSpeechDuration: 20,
     windowSize: 512,
   };
 
diff --git a/wasm/vad/sherpa-onnx-wasm-main-vad.cc b/wasm/vad/sherpa-onnx-wasm-main-vad.cc
index 3c1600ba1..574ca6690 100644
--- a/wasm/vad/sherpa-onnx-wasm-main-vad.cc
+++ b/wasm/vad/sherpa-onnx-wasm-main-vad.cc
@@ -13,7 +13,7 @@
 
 extern "C" {
 
-static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, "");
+static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, "");
 
 static_assert(sizeof(SherpaOnnxVadModelConfig) ==
                   sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
@@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) {
   fprintf(stdout, "min_speech_duration: %.3f\n",
           silero_vad->min_speech_duration);
   fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
+  fprintf(stdout, "max_speech_duration: %.3f\n",
+          silero_vad->max_speech_duration);
 
   fprintf(stdout, "----------config----------\n");