tetherto · Zbig9000 · Mar 15, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
@@ -224,6 +224,16 @@ public void suppressNonSpeechTokens(boolean enable) {
     /** No speech threshold. */
     public float no_speech_thold;
 
+    /**
+     * RNG seed for reproducible sampling (when temperature &gt; 0).
+     * Each decoder uses {@code seed + decoder_index} so concurrent decoders get
+     * unique seeds. Maps to the {@code seed} field added at
+     * {@code include/whisper.h:553}; without this field declared here the
+     * subsequent {@code greedy} / {@code beam_search} struct offsets shift by
+     * 4 bytes and JNA reads garbage from the C-side defaults.
+     */
+    public int seed;
+
     /** Greedy decoding parameters. */
     public GreedyParams greedy;
 
@@ -331,6 +341,21 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
     public long i_start_rule;
     public float grammar_penalty;
 
+    // Voice Activity Detection (VAD) params -- added by upstream after v1.8.4.
+    // Without these three fields declared here the C struct's tail is missing
+    // from the JNA layout, which is fine for read-only callers but corrupts
+    // the trailing memory whenever a Java caller passes WhisperFullParams
+    // back into the C ABI (e.g. whisper_full).
+
+    /** Enable VAD pre-filtering inside whisper_full. (default = false) */
+    public CBool vad;
+
+    /** Path to the Silero VAD model (only used when {@link #vad} is true). */
+    public String vad_model_path;
+
+    /** VAD tuning knobs, mirrors {@code whisper_vad_params}. */
+    public WhisperVadParams vad_params;
+
     @Override
     protected List<String> getFieldOrder() {
         return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
@@ -343,13 +368,15 @@ protected List<String> getFieldOrder() {
                 "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                 "suppress_blank", "suppress_nst", "temperature",
                 "max_initial_ts", "length_penalty", "temperature_inc",
-                "entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
-                "beam_search", "new_segment_callback", "new_segment_callback_user_data",
-                "progress_callback", "progress_callback_user_data",
-                "encoder_begin_callback", "encoder_begin_callback_user_data",
-                "abort_callback", "abort_callback_user_data",
-                "logits_filter_callback", "logits_filter_callback_user_data",
-                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
+                "entropy_thold", "logprob_thold", "no_speech_thold", "seed",
+                "greedy", "beam_search", "new_segment_callback",
+                "new_segment_callback_user_data", "progress_callback",
+                "progress_callback_user_data", "encoder_begin_callback",
+                "encoder_begin_callback_user_data", "abort_callback",
+                "abort_callback_user_data", "logits_filter_callback",
+                "logits_filter_callback_user_data", "grammar_rules",
+                "n_grammar_rules", "i_start_rule", "grammar_penalty",
+                "vad", "vad_model_path", "vad_params");
     }
 
     public static class ByValue extends WhisperFullParams implements Structure.ByValue {

@@ -0,0 +1,61 @@
+package io.github.ggerganov.whispercpp.params;
+
+import com.sun.jna.Pointer;
+import com.sun.jna.Structure;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Voice Activity Detection (VAD) parameters.
+ * Mirrors {@code struct whisper_vad_params} in include/whisper.h.
+ */
+public class WhisperVadParams extends Structure {
+
+    public WhisperVadParams() {
+        super();
+    }
+
+    public WhisperVadParams(Pointer p) {
+        super(p);
+    }
+
+    /** Probability threshold to consider as speech. */
+    public float threshold;
+
+    /** Min duration for a valid speech segment. */
+    public int min_speech_duration_ms;
+
+    /** Min silence duration to consider speech as ended. */
+    public int min_silence_duration_ms;
+
+    /** Max duration of a speech segment before forcing a new segment. */
+    public float max_speech_duration_s;
+
+    /** Padding added before and after speech segments. */
+    public int speech_pad_ms;
+
+    /** Overlap in seconds when copying audio samples from speech segment. */
+    public float samples_overlap;
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList(
+                "threshold",
+                "min_speech_duration_ms",
+                "min_silence_duration_ms",
+                "max_speech_duration_s",
+                "speech_pad_ms",
+                "samples_overlap");
+    }
+
+    public static class ByValue extends WhisperVadParams implements Structure.ByValue {
+        public ByValue() {
+            super();
+        }
+
+        public ByValue(Pointer p) {
+            super(p);
+        }
+    }
+}
@@ -202,6 +202,8 @@ whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
 
 Note that transcription occasionally might be low accuracy when it works in parallel.
 
+If n_processors is greater than 1, you cannot set any callbacks including new_segment_callback, progress_callback, encoder_begin_callback, abort_callback, and log_callback set by Whisper.log_set.
+
 ### Segments ###
 
 Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:

@@ -112,6 +112,10 @@ ruby_whisper_log_callback(enum ggml_log_level level, const char * buffer, void *
     return;
   }
   VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
+  if (NIL_P(log_callback)) {
+    return;
+  }
+
   VALUE udata = rb_iv_get(mWhisper, "user_data");
   rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
 }
@@ -129,10 +133,16 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
   rb_iv_set(self, "log_callback", log_callback);
   rb_iv_set(self, "user_data", user_data);
 
-  VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
-  rb_define_finalizer(log_callback, finalize_log_callback);
+  if (!NIL_P(log_callback)) {
+    VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
+    rb_define_finalizer(log_callback, finalize_log_callback);
+  }
 
-  whisper_log_set(ruby_whisper_log_callback, NULL);
+  if (NIL_P(log_callback)) {
+    whisper_log_set(NULL, NULL);
+  } else {
+    whisper_log_set(ruby_whisper_log_callback, NULL);
+  }
 
   return Qnil;
 }

@@ -2,6 +2,7 @@
 #define RUBY_WHISPER_H
 
 #include <ruby.h>
+#include <ruby/util.h>
 #include <ruby/memory_view.h>
 #include "whisper.h"
 

@@ -22,7 +22,7 @@ extern const rb_data_type_t ruby_whisper_context_params_type;
 extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
 extern VALUE rb_whisper_model_s_new(VALUE context);
 extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
-extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
+extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors);
 
 ID transcribe_option_names[1];
 
@@ -436,7 +436,7 @@ full_body(VALUE rb_args)
   GetContext(*args->context, rw);
   TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
 
-  prepare_transcription(rwp, args->context);
+  prepare_transcription(rwp, args->context, 1);
   int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples);
 
   return INT2NUM(result);
@@ -487,7 +487,7 @@ full_parallel_body(VALUE rb_args)
   GetContext(*args->context, rw);
   TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
 
-  prepare_transcription(rwp, args->context);
+  prepare_transcription(rwp, args->context, args->n_processors);
   int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors);
 
   return INT2NUM(result);

@@ -29,6 +29,7 @@
 
 extern VALUE cParams;
 extern VALUE cVADParams;
+extern VALUE mWhisper;
 
 extern ID id_call;
 
@@ -186,6 +187,35 @@ static bool abort_callback(void * user_data) {
   return false;
 }
 
+static void
+check_thread_safety(ruby_whisper_params *rwp, VALUE *context, int n_processors)
+{
+  if (n_processors == 1) {
+    return;
+  }
+
+  if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
+    rb_raise(rb_eRuntimeError, "new segment callback not supported on parallel transcription");
+  }
+
+  if (!NIL_P(rwp->progress_callback_container->callback) || 0 != RARRAY_LEN(rwp->progress_callback_container->callbacks)) {
+    rb_raise(rb_eRuntimeError, "progress callback not supported on parallel transcription");
+  }
+
+  if (!NIL_P(rwp->encoder_begin_callback_container->callback) || 0 != RARRAY_LEN(rwp->encoder_begin_callback_container->callbacks)) {
+    rb_raise(rb_eRuntimeError, "encoder begin callback not supported on parallel transcription");
+  }
+
+  if (!NIL_P(rwp->abort_callback_container->callback) || 0 != RARRAY_LEN(rwp->abort_callback_container->callbacks)) {
+    rb_raise(rb_eRuntimeError, "abort callback not supported on parallel transcription");
+  }
+
+  VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
+  if (!NIL_P(log_callback)) {
+    rb_raise(rb_eRuntimeError, "log callback not supported for parallel transcription");
+  }
+}
+
 static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
   if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
     rwp->new_segment_callback_container->context = context;
@@ -219,9 +249,13 @@ static void set_vad_params(ruby_whisper_params *rwp)
   rwp->params.vad_params = rwvp->params;
 }
 
+/*
+  TODO: Set abort callback to trap SIGINT and SIGTERM
+*/
 void
-prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
+prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors)
 {
+  check_thread_safety(rwp, context, n_processors);
   register_callbacks(rwp, context);
   set_vad_params(rwp);
 }
@@ -240,6 +274,20 @@ rb_whisper_params_mark(void *p)
 void
 ruby_whisper_params_free(ruby_whisper_params *rwp)
 {
+  if (rwp->params.language) {
+    ruby_xfree((void *)rwp->params.language);
+  }
+  if (rwp->params.initial_prompt) {
+    ruby_xfree((void *)rwp->params.initial_prompt);
+  }
+  if (rwp->params.vad_model_path) {
+    ruby_xfree((void *)rwp->params.vad_model_path);
+  }
+
+  xfree(rwp->new_segment_callback_container);
+  xfree(rwp->progress_callback_container);
+  xfree(rwp->encoder_begin_callback_container);
+  xfree(rwp->abort_callback_container);
 }
 
 void
@@ -248,7 +296,7 @@ rb_whisper_params_free(void *p)
   ruby_whisper_params *rwp = (ruby_whisper_params *)p;
   // How to free user_data and callback only when not referred to by others?
   ruby_whisper_params_free(rwp);
-  free(rwp);
+  xfree(rwp);
 }
 
 static size_t
@@ -276,6 +324,15 @@ ruby_whisper_params_allocate(VALUE klass)
   ruby_whisper_params *rwp;
   VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
   rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+  if (rwp->params.language != NULL) {
+    rwp->params.language = ruby_strdup(rwp->params.language);
+  }
+  if (rwp->params.initial_prompt != NULL) {
+    rwp->params.initial_prompt = ruby_strdup(rwp->params.initial_prompt);
+  }
+  if (rwp->params.vad_model_path != NULL) {
+    rwp->params.vad_model_path = ruby_strdup(rwp->params.vad_model_path);
+  }
   rwp->diarize = false;
   rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
   rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
@@ -296,10 +353,12 @@ ruby_whisper_params_set_language(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
   TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  ruby_xfree((void *)rwp->params.language);
+  rwp->params.language = NULL;
   if (value == Qfalse || value == Qnil) {
-    rwp->params.language = "auto";
+    rwp->params.language = ruby_strdup("auto");
   } else {
-    rwp->params.language = StringValueCStr(value);
+    rwp->params.language = ruby_strdup(StringValueCStr(value));
   }
   return value;
 }
@@ -608,7 +667,13 @@ ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
   TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-  rwp->params.initial_prompt = StringValueCStr(value);
+  ruby_xfree((void *)rwp->params.initial_prompt);
+  rwp->params.initial_prompt = NULL;
+  if (NIL_P(value)) {
+    rwp->params.initial_prompt = NULL;
+  } else {
+    rwp->params.initial_prompt = ruby_strdup(StringValueCStr(value));
+  }
   return value;
 }
 /*
@@ -1103,12 +1168,14 @@ ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
   TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  ruby_xfree((void *)rwp->params.vad_model_path);
+  rwp->params.vad_model_path = NULL;
   if (NIL_P(value)) {
     rwp->params.vad_model_path = NULL;
     return value;
   }
   VALUE path = ruby_whisper_normalize_model_path(value);
-  rwp->params.vad_model_path = StringValueCStr(path);
+  rwp->params.vad_model_path = ruby_strdup(StringValueCStr(path));
   return value;
 }
 

@@ -16,7 +16,7 @@ extern ID id_to_path;
 extern ID transcribe_option_names[1];
 
 extern void
-prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
+prepare_transcription(ruby_whisper_params * rwp, VALUE * self, int n_processors);
 
 /*
  * transcribe a single file
@@ -73,7 +73,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   //   rwp->params.encoder_begin_callback_user_data = &is_aborted;
   // }
 
-  prepare_transcription(rwp, &self);
+  prepare_transcription(rwp, &self, n_processors);
 
   if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
     fprintf(stderr, "failed to process audio\n");

@@ -37,7 +37,7 @@ module Whisper
   def self.lang_id: (string name) -> Integer
   def self.lang_str: (Integer id) -> String
   def self.lang_str_full: (Integer id) -> String
-  def self.log_set: (log_callback, Object? user_data) -> log_callback
+  def self.log_set: (log_callback?, Object? user_data) -> log_callback
   def self.system_info_str: () -> String
 
   class Context
@@ -52,6 +52,9 @@ module Whisper
     #       puts text
     #     end
     #
+    # If n_processors is greater than 1, you cannot set any callbacks including
+    # new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
+    # and log_callback set by Whisper.log_set
     def transcribe: (path, Params, ?n_processors: Integer) -> self
                   | (path, Params, ?n_processors: Integer) { (String) -> void } -> self
 
@@ -129,6 +132,9 @@ module Whisper
     # It seems this approach can offer some speedup in some cases.
     # However, the transcription accuracy can be worse at the beginning and end of each chunk.
     #
+    # If n_processors is greater than 1, you cannot set any callbacks including
+    # new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
+    # and log_callback set by Whisper.log_set
     def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self