From 7c309eca500820ab5a9a4bb01d18cdb23783e006 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 21 Jan 2026 20:33:43 +0900 Subject: [PATCH 01/44] ruby : Bump version to 1.3.6 --- bindings/ruby/whispercpp.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/whispercpp.gemspec b/bindings/ruby/whispercpp.gemspec index 2e05769a22c..88b94e7eb8a 100644 --- a/bindings/ruby/whispercpp.gemspec +++ b/bindings/ruby/whispercpp.gemspec @@ -3,7 +3,7 @@ require_relative "extsources" Gem::Specification.new do |s| s.name = "whispercpp" s.authors = ["Georgi Gerganov", "Todd A. Fisher"] - s.version = '1.3.5' + s.version = '1.3.6' s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby} s.email = 'todd.fisher@gmail.com' s.extra_rdoc_files = ['LICENSE', 'README.md'] From 49b3f1063e0acb070d9807c8cd3508bbe503c2ae Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 21 Jan 2026 20:33:55 +0900 Subject: [PATCH 02/44] Fix code in example --- bindings/ruby/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index ea202753b67..dc45caf44cc 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -334,7 +334,7 @@ VAD feature itself is useful. 
You can use it separately from ASR: vad = Whisper::VAD::Context.new("silero-v6.2.0") vad .detect("path/to/audio.wav", Whisper::VAD::Params.new) - .each_with_index do |segment, index| + .each.with_index do |segment, index| segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys` puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:} From fd6c9126a3f4d2437502f582c0749bbd9588a7a4 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 09:17:50 +0900 Subject: [PATCH 03/44] Add sample code to transcribe from MemoryView --- bindings/ruby/README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index dc45caf44cc..6359906f3d6 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -323,7 +323,24 @@ whisper end ``` -The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy. +The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. + +If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy. 
+ +```ruby +require "torchaudio" +require "arrow-numo-narray" +require "whisper" + +waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav") +# Convert Torch::Tensor to Arrow::Array via Numo::NArray +samples = waveform.squeeze.numo.to_arrow.to_arrow_array + +whisper = Whisper::Context.new("base") +whisper + # Arrow::Array exports MemoryView + .full(Whisper::Params.new, samples) +``` Using VAD separately from ASR ----------------------------- From 28b97fe2f17beaa9ed2c1fd35f8507a06bb00373 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 09:45:52 +0900 Subject: [PATCH 04/44] Define GetVADContext macro --- bindings/ruby/ext/ruby_whisper.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h index 3f5660c374d..83a7bc0e0b5 100644 --- a/bindings/ruby/ext/ruby_whisper.h +++ b/bindings/ruby/ext/ruby_whisper.h @@ -69,6 +69,13 @@ typedef struct { } \ } while (0) +#define GetVADContext(obj, rwvc) do { \ + TypedData_Get_Struct((obj), ruby_whisper_vad_context, &ruby_whisper_vad_context_type, (rwvc)); \ + if ((rwvc)->context == NULL) { \ + rb_raise(rb_eRuntimeError, "Not initialized"); \ + } \ +} while (0) + #define GetVADSegments(obj, rwvss) do { \ TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \ if ((rwvss)->segments == NULL) { \ From 101df90ebf4d967111e278081456346e68d23240 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 09:46:05 +0900 Subject: [PATCH 05/44] Use GetVADContext --- bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp index 58609f87742..c5ba29e9e16 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp +++ b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp @@ -25,10 +25,7 @@ 
ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) { std::vector> pcmf32s; whisper_vad_segments *segments; - TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc); - if (rwvc->context == NULL) { - rb_raise(rb_eRuntimeError, "Doesn't have referenxe to context internally"); - } + GetVADContext(self, rwvc); TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp); cpp_file_path = StringValueCStr(file_path); From 47729a4e0724fac8e5fcd16be759486dfbc2b816 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 11:40:30 +0900 Subject: [PATCH 06/44] Extract parse_full_args function --- bindings/ruby/ext/ruby_whisper_context.c | 58 +++++++++++++++--------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index a7b5f8513db..714da9f365c 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -272,32 +272,18 @@ VALUE ruby_whisper_model_type(VALUE self) return rb_str_new2(whisper_model_type_readable(rw->context)); } -/* - * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text - * Not thread safe for same context - * Uses the specified decoding strategy to obtain the text. - * - * call-seq: - * full(params, samples, n_samples) -> nil - * full(params, samples) -> nil - * - * The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data. 
- */ -VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) +int +parse_full_args(int argc, VALUE *argv, float** c_samples) { if (argc < 2 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); } - ruby_whisper *rw; - ruby_whisper_params *rwp; - GetContext(self, rw); - VALUE params = argv[0]; - TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); VALUE samples = argv[1]; int n_samples; rb_memory_view_t view; const bool memory_view_available_p = rb_memory_view_available_p(samples); + if (argc == 3) { n_samples = NUM2INT(argv[2]); if (TYPE(samples) == T_ARRAY) { @@ -328,13 +314,15 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given"); } } - float * c_samples = (float *)malloc(n_samples * sizeof(float)); + + float *tmp_samples = (float *)malloc(n_samples * sizeof(float)); + if (memory_view_available_p) { - c_samples = (float *)view.data; + tmp_samples = (float *)view.data; } else { if (TYPE(samples) == T_ARRAY) { for (int i = 0; i < n_samples; i++) { - c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); + tmp_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); } } else { // TODO: use rb_block_call @@ -342,12 +330,38 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) for (int i = 0; i < n_samples; i++) { // TODO: check if iter is exhausted and raise ArgumentError appropriately VALUE sample = rb_funcall(iter, id_next, 0); - c_samples[i] = RFLOAT_VALUE(sample); + tmp_samples[i] = RFLOAT_VALUE(sample); } } } + *c_samples = tmp_samples; + + return n_samples; +} + +/* + * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text + * Not thread safe for same context + * Uses the specified decoding strategy to obtain the text. 
+ * + * call-seq: + * full(params, samples, n_samples) -> nil + * full(params, samples) -> nil + * + * The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data. + */ +VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) +{ + ruby_whisper *rw; + ruby_whisper_params *rwp; + GetContext(self, rw); + VALUE params = argv[0]; + TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); + float *samples = NULL; + + int n_samples = parse_full_args(argc, argv, &samples); prepare_transcription(rwp, &self); - const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples); + const int result = whisper_full(rw->context, rwp->params, samples, n_samples); if (0 == result) { return self; } else { From 8be0ed8daedaa0654b097ff578ef668df2877ee4 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 11:48:27 +0900 Subject: [PATCH 07/44] Use parse_full_args in ruby_whisper_full_parallel --- bindings/ruby/ext/ruby_whisper_context.c | 63 ++++-------------------- 1 file changed, 9 insertions(+), 54 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 714da9f365c..98d0be9008e 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -386,7 +386,7 @@ static VALUE ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) { if (argc < 2 || argc > 4) { - rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); + rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..4)", argc); } ruby_whisper *rw; @@ -394,11 +394,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) GetContext(self, rw); VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); - VALUE samples = argv[1]; - int n_samples; + float *samples = NULL; + int 
n_processors; - rb_memory_view_t view; - const bool memory_view_available_p = rb_memory_view_available_p(samples); switch (argc) { case 2: n_processors = 1; @@ -410,56 +408,13 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) n_processors = NUM2INT(argv[3]); break; } - if (argc >= 3 && !NIL_P(argv[2])) { - n_samples = NUM2INT(argv[2]); - if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) < n_samples) { - rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples); - } - } - // Should check when samples.respond_to?(:length)? - } else if (memory_view_available_p) { - if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) { - view.obj = Qnil; - rb_raise(rb_eArgError, "unable to get a memory view"); - } - ssize_t n_samples_size = view.byte_size / view.item_size; - if (n_samples_size > INT_MAX) { - rb_raise(rb_eArgError, "samples are too long"); - } - n_samples = (int)n_samples_size; - } else { - if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) > INT_MAX) { - rb_raise(rb_eArgError, "samples are too long"); - } - n_samples = (int)RARRAY_LEN(samples); - } else if (rb_respond_to(samples, id_length)) { - n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); - } else { - rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given"); - } - } - float * c_samples = (float *)malloc(n_samples * sizeof(float)); - if (memory_view_available_p) { - c_samples = (float *)view.data; - } else { - if (TYPE(samples) == T_ARRAY) { - for (int i = 0; i < n_samples; i++) { - c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); - } - } else { - // FIXME: use rb_block_call - VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each")); - for (int i = 0; i < n_samples; i++) { - // TODO: check if iter is exhausted and raise ArgumentError - VALUE sample = rb_funcall(iter, id_next, 0); - c_samples[i] = RFLOAT_VALUE(sample); - } - } - } + int 
n_samples = parse_full_args( + (argc >= 3 && !NIL_P(argv[2])) ? 3 : 2, + argv, + &samples + ); prepare_transcription(rwp, &self); - const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors); + const int result = whisper_full_parallel(rw->context, rwp->params, samples, n_samples, n_processors); if (0 == result) { return self; } else { From 1d63ed465b8eb0ad257abce73a3d65a5504ca538 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 11:49:41 +0900 Subject: [PATCH 08/44] Free samples after use --- bindings/ruby/ext/ruby_whisper_context.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 98d0be9008e..155c655cf45 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -362,6 +362,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) int n_samples = parse_full_args(argc, argv, &samples); prepare_transcription(rwp, &self); const int result = whisper_full(rw->context, rwp->params, samples, n_samples); + if (samples != NULL) { + free(samples); + } if (0 == result) { return self; } else { @@ -415,6 +418,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) ); prepare_transcription(rwp, &self); const int result = whisper_full_parallel(rw->context, rwp->params, samples, n_samples, n_processors); + if (samples != NULL) { + free(samples); + } if (0 == result) { return self; } else { From b7068be868b9760298f7126404cd578b7c48aad7 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 11:58:42 +0900 Subject: [PATCH 09/44] Check return value of parse_full_args() --- bindings/ruby/ext/ruby_whisper_context.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 155c655cf45..5c02320f942 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ 
b/bindings/ruby/ext/ruby_whisper_context.c @@ -360,6 +360,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) float *samples = NULL; int n_samples = parse_full_args(argc, argv, &samples); + if (samples == NULL) { + rb_raise(rb_eRuntimeError, "failed to parse samples"); + } prepare_transcription(rwp, &self); const int result = whisper_full(rw->context, rwp->params, samples, n_samples); if (samples != NULL) { @@ -416,6 +419,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) argv, &samples ); + if (samples == NULL) { + rb_raise(rb_eRuntimeError, "failed to parse samples"); + } prepare_transcription(rwp, &self); const int result = whisper_full_parallel(rw->context, rwp->params, samples, n_samples, n_processors); if (samples != NULL) { From 4fecfd6e0c41bbfb92eeef085bdb2b3328241e62 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 11:59:51 +0900 Subject: [PATCH 10/44] Define GetVADParams macro --- bindings/ruby/ext/ruby_whisper.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h index 83a7bc0e0b5..394759b03fd 100644 --- a/bindings/ruby/ext/ruby_whisper.h +++ b/bindings/ruby/ext/ruby_whisper.h @@ -76,6 +76,10 @@ typedef struct { } \ } while (0) +#define GetVADParams(obj, rwvp) do { \ + TypedData_Get_Struct((obj), ruby_whisper_vad_params, &ruby_whisper_vad_params_type, (rwvp)); \ +} while (0) + #define GetVADSegments(obj, rwvss) do { \ TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \ if ((rwvss)->segments == NULL) { \ From 875f20450bb65c954d69edc8c93e2baad3b2b95f Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:09:02 +0900 Subject: [PATCH 11/44] Add VAD::Context#segments_from_samples --- bindings/ruby/ext/ruby_whisper_vad_context.c | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c 
b/bindings/ruby/ext/ruby_whisper_vad_context.c index bf2ed2ba465..4152077124c 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -5,8 +5,11 @@ extern ID id_to_s; extern VALUE cVADContext; +extern const rb_data_type_t ruby_whisper_vad_params_type; extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); +extern int parse_full_args(int argc, VALUE *argv, float** samples); +extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); static size_t ruby_whisper_vad_context_memsize(const void *p) @@ -66,10 +69,35 @@ ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path) return Qnil; } +static VALUE +ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) +{ + ruby_whisper_vad_context *rwvc; + ruby_whisper_vad_params *rwvp; + GetVADContext(self, rwvc); + VALUE params = argv[0]; + GetVADParams(params, rwvp); + float *samples = NULL; + + int n_samples = parse_full_args(argc, argv, &samples); + if (samples == NULL) { + rb_raise(rb_eRuntimeError, "failed to parse samples"); + } + struct whisper_vad_segments *segments = whisper_vad_segments_from_samples( + rwvc->context, + rwvp->params, + samples, + n_samples + ); + + return ruby_whisper_vad_segments_s_init(segments); +} + void init_ruby_whisper_vad_context(VALUE *mVAD) { cVADContext = rb_define_class_under(*mVAD, "Context", rb_cObject); rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate); rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1); + rb_define_method(cVADContext, "segments_from_samples", ruby_whisper_vad_segments_from_samples, -1); rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2); } From eb03a9dee89aebc18b24b5bad5f2336035ac8fe1 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:09:16 +0900 Subject: [PATCH 12/44] Add tests for 
VAD::Context#segments_from_samples --- bindings/ruby/test/test_vad_context.rb | 66 ++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/bindings/ruby/test/test_vad_context.rb b/bindings/ruby/test/test_vad_context.rb index 704916db6de..b4558d34faf 100644 --- a/bindings/ruby/test/test_vad_context.rb +++ b/bindings/ruby/test/test_vad_context.rb @@ -9,6 +9,25 @@ def test_initialize def test_detect context = Whisper::VAD::Context.new("silero-v6.2.0") segments = context.detect(AUDIO, Whisper::VAD::Params.new) + assert_segments segments + end + + def test_invalid_model_type + assert_raise TypeError do + Whisper::VAD::Context.new(Object.new) + end + end + + def test_allocate + vad = Whisper::VAD::Context.allocate + assert_raise do + vad.detect(AUDIO, Whisper::VAD::Params.new) + end + end + + private + + def assert_segments(segments) assert_instance_of Whisper::VAD::Segments, segments i = 0 @@ -35,16 +54,47 @@ def test_detect assert_equal 4, segments.length end - def test_invalid_model_type - assert_raise TypeError do - Whisper::VAD::Context.new(Object.new) + sub_test_case "from samples" do + def setup + super + @vad = Whisper::VAD::Context.new("silero-v6.2.0") + @samples = File.read(AUDIO, nil, 78).unpack("s<*").collect {|i| i.to_f / 2**15} end - end - def test_allocate - vad = Whisper::VAD::Context.allocate - assert_raise do - vad.detect(AUDIO, Whisper::VAD::Params.new) + def test_segments_from_samples + segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples, @samples.length) + assert_segments segments + end + + def test_segments_from_samples_without_length + segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples) + assert_segments segments + end + + def test_segments_from_samples_enumerator + samples = @samples.each + segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples, @samples.length) + assert_segments segments + end + + def test_segments_from_samples_enumerator_without_length + 
samples = @samples.each + assert_raise ArgumentError do + @vad.segments_from_samples(Whisper::VAD::Params.new, samples) + end + end + + def test_segments_from_samples_enumerator_with_too_large_length + samples = @samples.each.take(10).to_enum + assert_raise StopIteration do + @vad.segments_from_samples(Whisper::VAD::Params.new, samples, 11) + end + end + + def test_segments_from_samples_with_memory_view + samples = JFKReader.new(AUDIO) + segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples) + assert_segments segments end end end From 540d48ee1d21911f9fcfed094bbe832b28a745ca Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:10:34 +0900 Subject: [PATCH 13/44] Add signature for VAD::Context#segments_from_samples --- bindings/ruby/sig/whisper.rbs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index 1137e3f36ab..37e34455ea2 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -603,6 +603,8 @@ module Whisper class Context def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance + def segments_from_samples: (Params, Array[Float] samples, ?Integer n_samples) -> Segments + | (Params, _Samples, ?Integer n_samples) -> Segments def detect: (path wav_file_path, Params) -> Segments end From aa2f7928e3d1594e34d41a9302260f3d49c65549 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:15:02 +0900 Subject: [PATCH 14/44] Add sample code for VAD::Context#segments_from_samples --- bindings/ruby/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index 6359906f3d6..f82ce01a73c 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -358,6 +358,19 @@ vad end ``` +You may also use the low-level API `Whisper::VAD::Context#segments_from_samples`, in the same way as `Whisper::Context#full`: + +```ruby +reader = WaveFile::Reader.new("path/to/audio.wav", 
WaveFile::Format.new(:mono, :float, 16000)) +samples = reader.enum_for(:each_buffer).map(&:samples).flatten + +# Or, +waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav") +samples = waveform.squeeze.numo.to_arrow.to_arrow_array + +segments = vad.segments_from_samples(Whisper::VAD::Params.new, samples) +``` + Development ----------- From 4e23821c9d3f7470ca6f02e4b9661a73a19cfdfe Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:17:08 +0900 Subject: [PATCH 15/44] Add test for Whisper::Context#transcribe with Pathname --- bindings/ruby/test/test_whisper.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb index 96e248aca3a..2d1711237d9 100644 --- a/bindings/ruby/test/test_whisper.rb +++ b/bindings/ruby/test/test_whisper.rb @@ -1,6 +1,7 @@ require_relative "helper" require "stringio" require "etc" +require "pathname" # Exists to detect memory-related bug Whisper.log_set ->(level, buffer, user_data) {}, nil @@ -20,6 +21,15 @@ def test_whisper } end + def test_whisper_pathname + @whisper = Whisper::Context.new("base.en") + params = Whisper::Params.new + + @whisper.transcribe(Pathname(AUDIO), params) {|text| + assert_match(/ask not what your country can do for you, ask what you can do for your country/, text) + } + end + def test_transcribe_non_parallel @whisper = Whisper::Context.new("base.en") params = Whisper::Params.new From 4b573c99c1fc370d8b32dada32ec040022e7f557 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 12:20:28 +0900 Subject: [PATCH 16/44] Make Whisper::Context#transcribe and Whisper::VAD::Context#detect accept Pathname --- bindings/ruby/ext/ruby_whisper_transcribe.cpp | 4 ++++ bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp index 594b2db90e3..a8f41c537e9 100644 --- 
a/bindings/ruby/ext/ruby_whisper_transcribe.cpp +++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp @@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type; extern ID id_to_s; extern ID id_call; +extern ID id_to_path; extern ID transcribe_option_names[1]; extern void @@ -50,6 +51,9 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { rb_raise(rb_eRuntimeError, "Expected file path to wave file"); } + if (rb_respond_to(wave_file_path, id_to_path)) { + wave_file_path = rb_funcall(wave_file_path, id_to_path, 0); + } std::string fname_inp = StringValueCStr(wave_file_path); std::vector pcmf32; // mono-channel F32 PCM diff --git a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp index c5ba29e9e16..8fb5835ab55 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp +++ b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp @@ -8,6 +8,8 @@ extern "C" { #endif +extern ID id_to_path; + extern VALUE cVADSegments; extern const rb_data_type_t ruby_whisper_vad_context_type; @@ -28,6 +30,9 @@ ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) { GetVADContext(self, rwvc); TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp); + if (rb_respond_to(file_path, id_to_path)) { + cpp_file_path = rb_funcall(file_path, id_to_path, 0); + } cpp_file_path = StringValueCStr(file_path); if (!read_audio_data(cpp_file_path, pcmf32, pcmf32s, false)) { From 50420fc2f79cd31abc465cb56c03fd0800849cd7 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 13:35:49 +0900 Subject: [PATCH 17/44] Update signature of Whisper::Context#transcribe --- bindings/ruby/sig/whisper.rbs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index 37e34455ea2..0e7b2c276e8 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -37,8 +37,8 @@ module 
Whisper # puts text # end # - def transcribe: (string, Params, ?n_processors: Integer) -> self - | (string, Params, ?n_processors: Integer) { (String) -> void } -> self + def transcribe: (path, Params, ?n_processors: Integer) -> self + | (path, Params, ?n_processors: Integer) { (String) -> void } -> self def model_n_vocab: () -> Integer def model_n_audio_ctx: () -> Integer From 6c8a20cb1f64be1a08700c4e19ab2d1efc9e0328 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 13:40:24 +0900 Subject: [PATCH 18/44] Fix variable name --- bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp index 8fb5835ab55..3423e2e2bd5 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp +++ b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp @@ -31,7 +31,7 @@ ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) { TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp); if (rb_respond_to(file_path, id_to_path)) { - cpp_file_path = rb_funcall(file_path, id_to_path, 0); + file_path = rb_funcall(file_path, id_to_path, 0); } cpp_file_path = StringValueCStr(file_path); From a825c013f252eedbc940f0d64fdd1f8bef455b58 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 14:15:08 +0900 Subject: [PATCH 19/44] Don't free memory view --- bindings/ruby/ext/ruby_whisper_context.c | 55 +++++++++++--------- bindings/ruby/ext/ruby_whisper_vad_context.c | 10 ++-- bindings/ruby/test/test_whisper.rb | 10 ++++ 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 5c02320f942..b65a175b13d 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -272,23 +272,22 @@ VALUE 
ruby_whisper_model_type(VALUE self) return rb_str_new2(whisper_model_type_readable(rw->context)); } -int -parse_full_args(int argc, VALUE *argv, float** c_samples) +bool +parse_full_args(int argc, VALUE *argv, float** c_samples, int *n_samples, bool *memview_exported) { if (argc < 2 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); } VALUE samples = argv[1]; - int n_samples; rb_memory_view_t view; - const bool memory_view_available_p = rb_memory_view_available_p(samples); + *memview_exported = rb_memory_view_available_p(samples); if (argc == 3) { - n_samples = NUM2INT(argv[2]); + *n_samples = NUM2INT(argv[2]); if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) < n_samples) { - rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples); + if (RARRAY_LEN(samples) < *n_samples) { + rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), *n_samples); } } // Should check when samples.respond_to?(:length)? 
@@ -297,8 +296,8 @@ parse_full_args(int argc, VALUE *argv, float** c_samples) if (RARRAY_LEN(samples) > INT_MAX) { rb_raise(rb_eArgError, "samples are too long"); } - n_samples = (int)RARRAY_LEN(samples); - } else if (memory_view_available_p) { + *n_samples = (int)RARRAY_LEN(samples); + } else if (*memview_exported) { if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) { view.obj = Qnil; rb_raise(rb_eArgError, "unable to get a memory view"); @@ -307,27 +306,27 @@ parse_full_args(int argc, VALUE *argv, float** c_samples) if (n_samples_size > INT_MAX) { rb_raise(rb_eArgError, "samples are too long"); } - n_samples = (int)n_samples_size; + *n_samples = (int)n_samples_size; } else if (rb_respond_to(samples, id_length)) { - n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + *n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given"); } } - float *tmp_samples = (float *)malloc(n_samples * sizeof(float)); + float *tmp_samples = (float *)malloc(*n_samples * sizeof(float)); - if (memory_view_available_p) { + if (*memview_exported) { tmp_samples = (float *)view.data; } else { if (TYPE(samples) == T_ARRAY) { - for (int i = 0; i < n_samples; i++) { + for (int i = 0; i < *n_samples; i++) { tmp_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); } } else { // TODO: use rb_block_call VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each")); - for (int i = 0; i < n_samples; i++) { + for (int i = 0; i < *n_samples; i++) { // TODO: check if iter is exhausted and raise ArgumentError appropriately VALUE sample = rb_funcall(iter, id_next, 0); tmp_samples[i] = RFLOAT_VALUE(sample); @@ -336,7 +335,7 @@ parse_full_args(int argc, VALUE *argv, float** c_samples) } *c_samples = tmp_samples; - return n_samples; + return true; } /* @@ -358,14 +357,15 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) VALUE params = 
argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); float *samples = NULL; + int n_samples; + bool memview_exported; - int n_samples = parse_full_args(argc, argv, &samples); - if (samples == NULL) { + if (!parse_full_args(argc, argv, &samples, &n_samples, &memview_exported)) { rb_raise(rb_eRuntimeError, "failed to parse samples"); } prepare_transcription(rwp, &self); const int result = whisper_full(rw->context, rwp->params, samples, n_samples); - if (samples != NULL) { + if (!memview_exported) { free(samples); } if (0 == result) { @@ -401,6 +401,8 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); float *samples = NULL; + int n_samples; + bool memview_exported; int n_processors; switch (argc) { @@ -414,17 +416,18 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) n_processors = NUM2INT(argv[3]); break; } - int n_samples = parse_full_args( - (argc >= 3 && !NIL_P(argv[2])) ? 3 : 2, - argv, - &samples - ); - if (samples == NULL) { + if (!parse_full_args( + (argc >= 3 && !NIL_P(argv[2])) ? 
3 : 2, + argv, + &samples, + &n_samples, + &memview_exported + )) { rb_raise(rb_eRuntimeError, "failed to parse samples"); } prepare_transcription(rwp, &self); const int result = whisper_full_parallel(rw->context, rwp->params, samples, n_samples, n_processors); - if (samples != NULL) { + if (!memview_exported) { free(samples); } if (0 == result) { diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index 4152077124c..58a0081a62d 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -8,7 +8,7 @@ extern VALUE cVADContext; extern const rb_data_type_t ruby_whisper_vad_params_type; extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); -extern int parse_full_args(int argc, VALUE *argv, float** samples); +extern int parse_full_args(int argc, VALUE *argv, float** samples, int *n_samples, bool *memview_exported); extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); static size_t @@ -78,9 +78,10 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) VALUE params = argv[0]; GetVADParams(params, rwvp); float *samples = NULL; + int n_samples; + bool memview_exported; - int n_samples = parse_full_args(argc, argv, &samples); - if (samples == NULL) { + if (!parse_full_args(argc, argv, &samples, &n_samples, &memview_exported)) { rb_raise(rb_eRuntimeError, "failed to parse samples"); } struct whisper_vad_segments *segments = whisper_vad_segments_from_samples( @@ -89,6 +90,9 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) samples, n_samples ); + if (!memview_exported) { + free(samples); + } return ruby_whisper_vad_segments_s_init(segments); } diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb index 2d1711237d9..29071210072 100644 --- 
a/bindings/ruby/test/test_whisper.rb +++ b/bindings/ruby/test/test_whisper.rb @@ -217,6 +217,16 @@ def test_full_with_memory_view assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text) end + def test_full_with_memroy_view_gc + samples = JFKReader.new(AUDIO) + @whisper.full(@params, samples) + GC.start + require "fiddle" + Fiddle::MemoryView.export samples do |view| + assert_equal 176000, view.to_s.unpack("#{view.format}*").length + end + end + def test_full_parallel nprocessors = 2 @whisper.full_parallel(@params, @samples, @samples.length, nprocessors) From d247d5affb44e8d1430fa131e799e15b3e180830 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:14:16 +0900 Subject: [PATCH 20/44] Make parse_full_args return struct --- bindings/ruby/ext/ruby_whisper.h | 9 ++ bindings/ruby/ext/ruby_whisper_context.c | 93 +++++++++----------- bindings/ruby/ext/ruby_whisper_vad_context.c | 19 ++-- 3 files changed, 57 insertions(+), 64 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h index 394759b03fd..e0563966fc2 100644 --- a/bindings/ruby/ext/ruby_whisper.h +++ b/bindings/ruby/ext/ruby_whisper.h @@ -1,6 +1,8 @@ #ifndef RUBY_WHISPER_H #define RUBY_WHISPER_H +#include +#include #include "whisper.h" typedef struct { @@ -55,6 +57,13 @@ typedef struct { struct whisper_vad_context *context; } ruby_whisper_vad_context; +typedef struct full_parsed_args { + float *samples; + int n_samples; + rb_memory_view_t memview; + bool memview_exported; +} full_parsed_args; + #define GetContext(obj, rw) do { \ TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \ if ((rw)->context == NULL) { \ diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index b65a175b13d..a6d70e997b5 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -1,5 +1,3 @@ -#include 
-#include #include "ruby_whisper.h" extern ID id_to_s; @@ -272,22 +270,22 @@ VALUE ruby_whisper_model_type(VALUE self) return rb_str_new2(whisper_model_type_readable(rw->context)); } -bool -parse_full_args(int argc, VALUE *argv, float** c_samples, int *n_samples, bool *memview_exported) +struct full_parsed_args +parse_full_args(int argc, VALUE *argv) { if (argc < 2 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); } VALUE samples = argv[1]; - rb_memory_view_t view; - *memview_exported = rb_memory_view_available_p(samples); + bool memview_available = rb_memory_view_available_p(samples); + struct full_parsed_args parsed = {0}; if (argc == 3) { - *n_samples = NUM2INT(argv[2]); + parsed.n_samples = NUM2INT(argv[2]); if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) < *n_samples) { - rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), *n_samples); + if (RARRAY_LEN(samples) < parsed.n_samples) { + rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), parsed.n_samples); } } // Should check when samples.respond_to?(:length)? 
@@ -296,46 +294,57 @@ parse_full_args(int argc, VALUE *argv, float** c_samples, int *n_samples, bool * if (RARRAY_LEN(samples) > INT_MAX) { rb_raise(rb_eArgError, "samples are too long"); } - *n_samples = (int)RARRAY_LEN(samples); - } else if (*memview_exported) { - if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) { - view.obj = Qnil; + parsed.n_samples = (int)RARRAY_LEN(samples); + } else if (memview_available) { + if (!rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE)) { rb_raise(rb_eArgError, "unable to get a memory view"); } - ssize_t n_samples_size = view.byte_size / view.item_size; + ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; if (n_samples_size > INT_MAX) { + rb_memory_view_release(&parsed.memview); rb_raise(rb_eArgError, "samples are too long"); } - *n_samples = (int)n_samples_size; + parsed.n_samples = (int)n_samples_size; } else if (rb_respond_to(samples, id_length)) { - *n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given"); } } - float *tmp_samples = (float *)malloc(*n_samples * sizeof(float)); - - if (*memview_exported) { - tmp_samples = (float *)view.data; + if (memview_available) { + parsed.memview_exported = true; + parsed.samples = (float *)parsed.memview.data; } else { + parsed.memview_exported = false; + parsed.samples = (float *)malloc(parsed.n_samples * sizeof(float)); if (TYPE(samples) == T_ARRAY) { - for (int i = 0; i < *n_samples; i++) { - tmp_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); + for (int i = 0; i < parsed.n_samples; i++) { + parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); } } else { // TODO: use rb_block_call VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each")); - for (int i = 0; i < *n_samples; i++) { + for (int i = 0; i < 
parsed.n_samples; i++) { // TODO: check if iter is exhausted and raise ArgumentError appropriately VALUE sample = rb_funcall(iter, id_next, 0); - tmp_samples[i] = RFLOAT_VALUE(sample); + parsed.samples[i] = RFLOAT_VALUE(sample); } } } - *c_samples = tmp_samples; - return true; + return parsed; +} + +void +release_samples(full_parsed_args *parsed_args) +{ + if (parsed_args->memview_exported) { + rb_memory_view_release(&parsed_args->memview); + } else { + free(parsed_args->samples); + } + *parsed_args = (full_parsed_args){0}; } /* @@ -356,18 +365,11 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) GetContext(self, rw); VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); - float *samples = NULL; - int n_samples; - bool memview_exported; - if (!parse_full_args(argc, argv, &samples, &n_samples, &memview_exported)) { - rb_raise(rb_eRuntimeError, "failed to parse samples"); - } + struct full_parsed_args parsed = parse_full_args(argc, argv); prepare_transcription(rwp, &self); - const int result = whisper_full(rw->context, rwp->params, samples, n_samples); - if (!memview_exported) { - free(samples); - } + const int result = whisper_full(rw->context, rwp->params, parsed.samples, parsed.n_samples); + release_samples(&parsed); if (0 == result) { return self; } else { @@ -400,9 +402,6 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) GetContext(self, rw); VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); - float *samples = NULL; - int n_samples; - bool memview_exported; int n_processors; switch (argc) { @@ -416,20 +415,10 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) n_processors = NUM2INT(argv[3]); break; } - if (!parse_full_args( - (argc >= 3 && !NIL_P(argv[2])) ? 
3 : 2, - argv, - &samples, - &n_samples, - &memview_exported - )) { - rb_raise(rb_eRuntimeError, "failed to parse samples"); - } + struct full_parsed_args parsed = parse_full_args((argc >= 3 && !NIL_P(argv[2])) ? 3 : 2, argv); prepare_transcription(rwp, &self); - const int result = whisper_full_parallel(rw->context, rwp->params, samples, n_samples, n_processors); - if (!memview_exported) { - free(samples); - } + const int result = whisper_full_parallel(rw->context, rwp->params, parsed.samples, parsed.n_samples, n_processors); + release_samples(&parsed); if (0 == result) { return self; } else { diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index 58a0081a62d..52101af67ae 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -8,7 +8,9 @@ extern VALUE cVADContext; extern const rb_data_type_t ruby_whisper_vad_params_type; extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); -extern int parse_full_args(int argc, VALUE *argv, float** samples, int *n_samples, bool *memview_exported); +extern full_parsed_args parse_full_args(int argc, VALUE *argv); +extern void release_samples(struct full_parsed_args *parsed); + extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); static size_t @@ -77,22 +79,15 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) GetVADContext(self, rwvc); VALUE params = argv[0]; GetVADParams(params, rwvp); - float *samples = NULL; - int n_samples; - bool memview_exported; - if (!parse_full_args(argc, argv, &samples, &n_samples, &memview_exported)) { - rb_raise(rb_eRuntimeError, "failed to parse samples"); - } + struct full_parsed_args parsed = parse_full_args(argc, argv); struct whisper_vad_segments *segments = whisper_vad_segments_from_samples( rwvc->context, rwvp->params, - samples, - 
n_samples + parsed.samples, + parsed.n_samples ); - if (!memview_exported) { - free(samples); - } + release_samples(&parsed); return ruby_whisper_vad_segments_s_init(segments); } From bea2bec29b3deae775bbe49c645c8ac9841ed3b1 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:19:06 +0900 Subject: [PATCH 21/44] Fallback when failed to get MemoryView --- bindings/ruby/ext/ruby_whisper_context.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index a6d70e997b5..4877541d54f 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -280,6 +280,7 @@ parse_full_args(int argc, VALUE *argv) VALUE samples = argv[1]; bool memview_available = rb_memory_view_available_p(samples); struct full_parsed_args parsed = {0}; + parsed.memview_exported = false; if (argc == 3) { parsed.n_samples = NUM2INT(argv[2]); @@ -296,15 +297,18 @@ parse_full_args(int argc, VALUE *argv) } parsed.n_samples = (int)RARRAY_LEN(samples); } else if (memview_available) { - if (!rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE)) { - rb_raise(rb_eArgError, "unable to get a memory view"); + if (rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE)) { + ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; + if (n_samples_size > INT_MAX) { + rb_memory_view_release(&parsed.memview); + rb_raise(rb_eArgError, "samples are too long"); + } + parsed.memview_exported = true; + parsed.n_samples = (int)n_samples_size; + } else { + rb_warn("unable to get a memory view"); + parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } - ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; - if (n_samples_size > INT_MAX) { - rb_memory_view_release(&parsed.memview); - rb_raise(rb_eArgError, "samples are too long"); - } - parsed.n_samples = 
(int)n_samples_size; } else if (rb_respond_to(samples, id_length)) { parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { @@ -312,11 +316,9 @@ parse_full_args(int argc, VALUE *argv) } } - if (memview_available) { - parsed.memview_exported = true; + if (parsed.memview_exported) { parsed.samples = (float *)parsed.memview.data; } else { - parsed.memview_exported = false; parsed.samples = (float *)malloc(parsed.n_samples * sizeof(float)); if (TYPE(samples) == T_ARRAY) { for (int i = 0; i < parsed.n_samples; i++) { From d63f441ca6cf908a690f92a2e9aa33043becbf0c Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:23:58 +0900 Subject: [PATCH 22/44] Add num of samples when too long --- bindings/ruby/ext/ruby_whisper_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 4877541d54f..047ae088cbe 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -301,7 +301,7 @@ parse_full_args(int argc, VALUE *argv) ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; if (n_samples_size > INT_MAX) { rb_memory_view_release(&parsed.memview); - rb_raise(rb_eArgError, "samples are too long"); + rb_raise(rb_eArgError, "samples are too long: %zd", n_samples_size); } parsed.memview_exported = true; parsed.n_samples = (int)n_samples_size; From f8164f3f7118247ec6e9a59262ab54a46fa1dd81 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:33:22 +0900 Subject: [PATCH 23/44] Check members of MemoryView --- bindings/ruby/ext/ruby_whisper_context.c | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 047ae088cbe..3655cbd7c74 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -270,6 
+270,21 @@ VALUE ruby_whisper_model_type(VALUE self) return rb_str_new2(whisper_model_type_readable(rw->context)); } +static bool +check_memory_view(rb_memory_view_t *memview) +{ + if (strcmp(memview->format, "f") != 0) { + rb_warn("currently only \"f\"View is supported for MemoryView, but given: %s", memview->format); + return false; + } + if (memview->ndim != 1) { + rb_warn("currently only 1 dimensional MemoryView is supported, but given: %zd", memview->ndim); + return false; + } + + return true; +} + struct full_parsed_args parse_full_args(int argc, VALUE *argv) { @@ -297,7 +312,8 @@ parse_full_args(int argc, VALUE *argv) } parsed.n_samples = (int)RARRAY_LEN(samples); } else if (memview_available) { - if (rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE)) { + if (rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE) && + check_memory_view(&parsed.memview)) { ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; if (n_samples_size > INT_MAX) { rb_memory_view_release(&parsed.memview); @@ -306,8 +322,12 @@ parse_full_args(int argc, VALUE *argv) parsed.memview_exported = true; parsed.n_samples = (int)n_samples_size; } else { - rb_warn("unable to get a memory view"); - parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + rb_warn("unable to get a memory view. 
fallbacks to Ruby object"); + if (rb_respond_to(samples, id_length)) { + parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + } else { + rb_raise(rb_eArgError, "samples must respond to :length"); + } } } else if (rb_respond_to(samples, id_length)) { parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); From 841734baa84cda54b8d2bd9505d4f7a4ee7fe4b0 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:43:51 +0900 Subject: [PATCH 24/44] Fix a typo --- bindings/ruby/ext/ruby_whisper_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 3655cbd7c74..fb376ded56c 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -274,7 +274,7 @@ static bool check_memory_view(rb_memory_view_t *memview) { if (strcmp(memview->format, "f") != 0) { - rb_warn("currently only \"f\"View is supported for MemoryView, but given: %s", memview->format); + rb_warn("currently only format \"f\" is supported for MemoryView, but given: %s", memview->format); return false; } if (memview->ndim != 1) { From 8250f1b5b3decde50f3a62782d13e7682f9174bb Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:45:49 +0900 Subject: [PATCH 25/44] Remove unnecessary include --- bindings/ruby/ext/ruby_whisper.c | 2 -- bindings/ruby/ext/ruby_whisper_model.c | 1 - bindings/ruby/ext/ruby_whisper_params.c | 1 - bindings/ruby/ext/ruby_whisper_segment.c | 1 - bindings/ruby/ext/ruby_whisper_token.c | 1 - bindings/ruby/ext/ruby_whisper_transcribe.cpp | 1 - bindings/ruby/ext/ruby_whisper_vad_context.c | 1 - bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp | 1 - bindings/ruby/ext/ruby_whisper_vad_params.c | 1 - bindings/ruby/ext/ruby_whisper_vad_segment.c | 1 - bindings/ruby/ext/ruby_whisper_vad_segments.c | 1 - 11 files changed, 12 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper.c 
b/bindings/ruby/ext/ruby_whisper.c index ac677e9e3df..eb95829c032 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -1,5 +1,3 @@ -#include -#include #include "ruby_whisper.h" VALUE mWhisper; diff --git a/bindings/ruby/ext/ruby_whisper_model.c b/bindings/ruby/ext/ruby_whisper_model.c index b196a8b5cb5..0e91fb3f87f 100644 --- a/bindings/ruby/ext/ruby_whisper_model.c +++ b/bindings/ruby/ext/ruby_whisper_model.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" extern const rb_data_type_t ruby_whisper_type; diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c index 4dfe2575a39..61eb1733676 100644 --- a/bindings/ruby/ext/ruby_whisper_params.c +++ b/bindings/ruby/ext/ruby_whisper_params.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #define BOOL_PARAMS_SETTER(self, prop, value) \ diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c index 5229cb53900..ee0d66c4cc8 100644 --- a/bindings/ruby/ext/ruby_whisper_segment.c +++ b/bindings/ruby/ext/ruby_whisper_segment.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #define N_KEY_NAMES 6 diff --git a/bindings/ruby/ext/ruby_whisper_token.c b/bindings/ruby/ext/ruby_whisper_token.c index ea4f4e635d2..56a7eab2231 100644 --- a/bindings/ruby/ext/ruby_whisper_token.c +++ b/bindings/ruby/ext/ruby_whisper_token.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #define N_KEY_NAMES 11 diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp index a8f41c537e9..c00fbcd1def 100644 --- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp +++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #include "common-whisper.h" #include diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index 52101af67ae..b5b10aa13f3 100644 --- 
a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" extern ID id_to_s; diff --git a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp index 3423e2e2bd5..802b0222dbd 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp +++ b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #include "common-whisper.h" #include diff --git a/bindings/ruby/ext/ruby_whisper_vad_params.c b/bindings/ruby/ext/ruby_whisper_vad_params.c index f254bfa2138..28256650e32 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_params.c +++ b/bindings/ruby/ext/ruby_whisper_vad_params.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #define DEFINE_PARAM(param_name, nth) \ diff --git a/bindings/ruby/ext/ruby_whisper_vad_segment.c b/bindings/ruby/ext/ruby_whisper_vad_segment.c index 49ff0aadcce..84a007bb725 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_segment.c +++ b/bindings/ruby/ext/ruby_whisper_vad_segment.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" #define N_KEY_NAMES 2 diff --git a/bindings/ruby/ext/ruby_whisper_vad_segments.c b/bindings/ruby/ext/ruby_whisper_vad_segments.c index 1bb375937a4..db62fdb6222 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_segments.c +++ b/bindings/ruby/ext/ruby_whisper_vad_segments.c @@ -1,4 +1,3 @@ -#include #include "ruby_whisper.h" extern ID id___method__; From 256005d3ad1e09ed58626f8057239756ef40d8bc Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:52:44 +0900 Subject: [PATCH 26/44] Fix a typo --- bindings/ruby/ext/ruby_whisper_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index fb376ded56c..47dc8b6f315 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ 
b/bindings/ruby/ext/ruby_whisper_context.c @@ -332,7 +332,7 @@ parse_full_args(int argc, VALUE *argv) } else if (rb_respond_to(samples, id_length)) { parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); } else { - rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given"); + rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of float when n_samples is not given"); } } From 56335dd074f3a008af4ddcbd24685cb5d390eb5d Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 21:53:06 +0900 Subject: [PATCH 27/44] Fix a typo --- bindings/ruby/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index f82ce01a73c..bd3eaa8bd48 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -368,7 +368,7 @@ samples = reader.enum_for(:each_buffer).map(&:samples).flatten waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav") samples = waveform.squeeze.numo.to_arrow.to_arrow_array -segments = vad.segments_from_samples(Whisper::Params.new, samples) +segments = vad.segments_from_samples(Whisper::VAD::Params.new, samples) ``` Development From 94b90c896e7a2267b0ad5673cb6819fbe4194186 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 22:37:17 +0900 Subject: [PATCH 28/44] Care the case of MemoryView doesn't fit spec --- bindings/ruby/ext/ruby_whisper_context.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 47dc8b6f315..e28fa20e8e1 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -312,14 +312,20 @@ parse_full_args(int argc, VALUE *argv) } parsed.n_samples = (int)RARRAY_LEN(samples); } else if (memview_available) { - if (rb_memory_view_get(samples, &parsed.memview, 
RUBY_MEMORY_VIEW_SIMPLE) && - check_memory_view(&parsed.memview)) { + bool memview_got = rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE); + if (memview_got) { + parsed.memview_exported = check_memory_view(&parsed.memview); + if (!parsed.memview_exported) { + rb_memory_view_release(&parsed.memview); + parsed.memview = (rb_memory_view_t){0}; + } + } + if (parsed.memview_exported) { ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size; if (n_samples_size > INT_MAX) { rb_memory_view_release(&parsed.memview); rb_raise(rb_eArgError, "samples are too long: %zd", n_samples_size); } - parsed.memview_exported = true; parsed.n_samples = (int)n_samples_size; } else { rb_warn("unable to get a memory view. fallbacks to Ruby object"); From 4cb862d8de189505a9c5aaaede987a9a14e3be1d Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 22:48:37 +0900 Subject: [PATCH 29/44] Add TODO comment --- bindings/ruby/ext/ruby_whisper_context.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index e28fa20e8e1..df215861df9 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -345,6 +345,8 @@ parse_full_args(int argc, VALUE *argv) if (parsed.memview_exported) { parsed.samples = (float *)parsed.memview.data; } else { + // FIXME: Ensure free parsed.samples both after this line and + // in caller context using rb_ensure or so parsed.samples = (float *)malloc(parsed.n_samples * sizeof(float)); if (TYPE(samples) == T_ARRAY) { for (int i = 0; i < parsed.n_samples; i++) { From 93a49da8734fec5cae5de8cc94b9c20fc6d9c92b Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 22:49:17 +0900 Subject: [PATCH 30/44] Add optimazation option to compiler flags --- bindings/ruby/ext/extconf.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/bindings/ruby/ext/extconf.rb b/bindings/ruby/ext/extconf.rb 
index 8a5ac67457b..acff501aa3b 100644 --- a/bindings/ruby/ext/extconf.rb +++ b/bindings/ruby/ext/extconf.rb @@ -7,6 +7,7 @@ have_library("gomp") rescue nil libs = Dependencies.new(cmake, options).to_s +$CFLAGS << " -O3 -march=native" $INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples" $LOCAL_LIBS << " #{libs}" $cleanfiles << " build #{libs}" From 98bb5a967fb392d653092e9a344e9d6165f15bbb Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 23:15:05 +0900 Subject: [PATCH 31/44] Use ALLOC_N instead of malloc --- bindings/ruby/ext/ruby_whisper_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index df215861df9..c196a413da9 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -347,7 +347,7 @@ parse_full_args(int argc, VALUE *argv) } else { // FIXME: Ensure free parsed.samples both after this line and // in caller context using rb_ensure or so - parsed.samples = (float *)malloc(parsed.n_samples * sizeof(float)); + parsed.samples = ALLOC_N(float, parsed.n_samples); if (TYPE(samples) == T_ARRAY) { for (int i = 0; i < parsed.n_samples; i++) { parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); @@ -372,7 +372,7 @@ release_samples(full_parsed_args *parsed_args) if (parsed_args->memview_exported) { rb_memory_view_release(&parsed_args->memview); } else { - free(parsed_args->samples); + ruby_xfree(parsed_args->samples); } *parsed_args = (full_parsed_args){0}; } From afd8deb23541ba0c5e633457eb2782605f1d24e3 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Wed, 28 Jan 2026 23:17:28 +0900 Subject: [PATCH 32/44] Add description to sample code --- bindings/ruby/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index bd3eaa8bd48..86774158355 100644 --- a/bindings/ruby/README.md +++ 
b/bindings/ruby/README.md @@ -361,10 +361,11 @@ vad You may also low level API `Whisper::VAD::Context#segments_from_samples` as such `Whisper::Context#full`: ```ruby +# Ruby Array reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000)) samples = reader.enum_for(:each_buffer).map(&:samples).flatten -# Or, +# Or, object which exports MemoryView waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav") samples = waveform.squeeze.numo.to_arrow.to_arrow_array From f32a33d4bc5ea15378c1f0a75efa0591c426a2f6 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 16:20:07 +0900 Subject: [PATCH 33/44] Rename and change args: parse_full_args -> parse_samples --- bindings/ruby/ext/ruby_whisper.h | 4 +- bindings/ruby/ext/ruby_whisper_context.c | 59 ++++++++++---------- bindings/ruby/ext/ruby_whisper_vad_context.c | 11 +++- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h index e0563966fc2..c2c9866ae0d 100644 --- a/bindings/ruby/ext/ruby_whisper.h +++ b/bindings/ruby/ext/ruby_whisper.h @@ -57,12 +57,12 @@ typedef struct { struct whisper_vad_context *context; } ruby_whisper_vad_context; -typedef struct full_parsed_args { +typedef struct parsed_samples_t { float *samples; int n_samples; rb_memory_view_t memview; bool memview_exported; -} full_parsed_args; +} parsed_samples_t; #define GetContext(obj, rw) do { \ TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \ diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index c196a413da9..4852597d7e0 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -285,34 +285,29 @@ check_memory_view(rb_memory_view_t *memview) return true; } -struct full_parsed_args -parse_full_args(int argc, VALUE *argv) +struct parsed_samples_t +parse_samples(VALUE *samples, VALUE *n_samples) { - if (argc < 2 || 
argc > 3) { - rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); - } - - VALUE samples = argv[1]; - bool memview_available = rb_memory_view_available_p(samples); - struct full_parsed_args parsed = {0}; + bool memview_available = rb_memory_view_available_p(*samples); + struct parsed_samples_t parsed = {0}; parsed.memview_exported = false; - if (argc == 3) { - parsed.n_samples = NUM2INT(argv[2]); - if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) < parsed.n_samples) { - rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), parsed.n_samples); + if (!NIL_P(*n_samples)) { + parsed.n_samples = NUM2INT(*n_samples); + if (TYPE(*samples) == T_ARRAY) { + if (RARRAY_LEN(*samples) < parsed.n_samples) { + rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(*samples), parsed.n_samples); } } // Should check when samples.respond_to?(:length)? } else { - if (TYPE(samples) == T_ARRAY) { - if (RARRAY_LEN(samples) > INT_MAX) { + if (TYPE(*samples) == T_ARRAY) { + if (RARRAY_LEN(*samples) > INT_MAX) { rb_raise(rb_eArgError, "samples are too long"); } - parsed.n_samples = (int)RARRAY_LEN(samples); + parsed.n_samples = (int)RARRAY_LEN(*samples); } else if (memview_available) { - bool memview_got = rb_memory_view_get(samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE); + bool memview_got = rb_memory_view_get(*samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE); if (memview_got) { parsed.memview_exported = check_memory_view(&parsed.memview); if (!parsed.memview_exported) { @@ -329,14 +324,14 @@ parse_full_args(int argc, VALUE *argv) parsed.n_samples = (int)n_samples_size; } else { rb_warn("unable to get a memory view. 
fallbacks to Ruby object"); - if (rb_respond_to(samples, id_length)) { - parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + if (rb_respond_to(*samples, id_length)) { + parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0)); } else { rb_raise(rb_eArgError, "samples must respond to :length"); } } - } else if (rb_respond_to(samples, id_length)) { - parsed.n_samples = NUM2INT(rb_funcall(samples, id_length, 0)); + } else if (rb_respond_to(*samples, id_length)) { + parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0)); } else { rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of float when n_samples is not given"); } @@ -348,13 +343,13 @@ parse_full_args(int argc, VALUE *argv) // FIXME: Ensure free parsed.samples both after this line and // in caller context using rb_ensure or so parsed.samples = ALLOC_N(float, parsed.n_samples); - if (TYPE(samples) == T_ARRAY) { + if (TYPE(*samples) == T_ARRAY) { for (int i = 0; i < parsed.n_samples; i++) { - parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i)); + parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(*samples, i)); } } else { // TODO: use rb_block_call - VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each")); + VALUE iter = rb_funcall(*samples, id_to_enum, 1, rb_str_new2("each")); for (int i = 0; i < parsed.n_samples; i++) { // TODO: check if iter is exhausted and raise ArgumentError appropriately VALUE sample = rb_funcall(iter, id_next, 0); @@ -367,14 +362,14 @@ parse_full_args(int argc, VALUE *argv) } void -release_samples(full_parsed_args *parsed_args) +release_samples(parsed_samples_t *parsed_args) { if (parsed_args->memview_exported) { rb_memory_view_release(&parsed_args->memview); } else { ruby_xfree(parsed_args->samples); } - *parsed_args = (full_parsed_args){0}; + *parsed_args = (parsed_samples_t){0}; } /* @@ -390,13 +385,18 @@ release_samples(full_parsed_args *parsed_args) */ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE 
self) { + if (argc < 2 || argc > 3) { + rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); + } + ruby_whisper *rw; ruby_whisper_params *rwp; GetContext(self, rw); VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); + VALUE n_samples = argc == 2 ? Qnil : argv[2]; - struct full_parsed_args parsed = parse_full_args(argc, argv); + struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); const int result = whisper_full(rw->context, rwp->params, parsed.samples, parsed.n_samples); release_samples(&parsed); @@ -432,6 +432,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) GetContext(self, rw); VALUE params = argv[0]; TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); + VALUE n_samples = argc == 2 ? Qnil : argv[2]; int n_processors; switch (argc) { @@ -445,7 +446,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) n_processors = NUM2INT(argv[3]); break; } - struct full_parsed_args parsed = parse_full_args((argc >= 3 && !NIL_P(argv[2])) ? 
3 : 2, argv); + struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); const int result = whisper_full_parallel(rw->context, rwp->params, parsed.samples, parsed.n_samples, n_processors); release_samples(&parsed); diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index b5b10aa13f3..fa4694f3404 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -7,8 +7,8 @@ extern VALUE cVADContext; extern const rb_data_type_t ruby_whisper_vad_params_type; extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); -extern full_parsed_args parse_full_args(int argc, VALUE *argv); -extern void release_samples(struct full_parsed_args *parsed); +extern parsed_samples_t parse_samples(VALUE *samples, VALUE *n_samples); +extern void release_samples(struct parsed_samples_t *parsed); extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); @@ -73,13 +73,18 @@ ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path) static VALUE ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) { + if (argc < 2 || argc > 3) { + rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); + } + ruby_whisper_vad_context *rwvc; ruby_whisper_vad_params *rwvp; GetVADContext(self, rwvc); VALUE params = argv[0]; GetVADParams(params, rwvp); + VALUE n_samples = argc == 2 ? 
Qnil : argv[2]; - struct full_parsed_args parsed = parse_full_args(argc, argv); + struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); struct whisper_vad_segments *segments = whisper_vad_segments_from_samples( rwvc->context, rwvp->params, From e135c54c7e3c04d9406f3e78e95d95dcfb6a6c21 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 17:47:06 +0900 Subject: [PATCH 34/44] Free samples when exception raised --- bindings/ruby/ext/ruby_whisper_context.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 4852597d7e0..9e485efc365 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -340,9 +340,8 @@ parse_samples(VALUE *samples, VALUE *n_samples) if (parsed.memview_exported) { parsed.samples = (float *)parsed.memview.data; } else { - // FIXME: Ensure free parsed.samples both after this line and - // in caller context using rb_ensure or so - parsed.samples = ALLOC_N(float, parsed.n_samples); + VALUE store; + parsed.samples = rb_alloc_tmp_buffer(&store, sizeof(float) * parsed.n_samples); if (TYPE(*samples) == T_ARRAY) { for (int i = 0; i < parsed.n_samples; i++) { parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(*samples, i)); @@ -356,6 +355,7 @@ parse_samples(VALUE *samples, VALUE *n_samples) parsed.samples[i] = RFLOAT_VALUE(sample); } } + rb_free_tmp_buffer(&store); } return parsed; @@ -364,10 +364,10 @@ parse_samples(VALUE *samples, VALUE *n_samples) void release_samples(parsed_samples_t *parsed_args) { + // When parsed_args->memview_exported is false, + // parsed_args->samples was allocated by rb_alloc_tmp_buffer, so no need to free here if (parsed_args->memview_exported) { rb_memory_view_release(&parsed_args->memview); - } else { - ruby_xfree(parsed_args->samples); } *parsed_args = (parsed_samples_t){0}; } @@ -398,6 +398,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, 
VALUE self) struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); + // FIXME: Ensure release_samples is called in case of exception const int result = whisper_full(rw->context, rwp->params, parsed.samples, parsed.n_samples); release_samples(&parsed); if (0 == result) { From dc11257f595e658b27310830a4fad9c81ad90386 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 18:40:07 +0900 Subject: [PATCH 35/44] Assign type check result to a variable --- bindings/ruby/ext/ruby_whisper_context.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 9e485efc365..9b591351ef1 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -291,17 +291,18 @@ parse_samples(VALUE *samples, VALUE *n_samples) bool memview_available = rb_memory_view_available_p(*samples); struct parsed_samples_t parsed = {0}; parsed.memview_exported = false; + const bool is_array = RB_TYPE_P(*samples, T_ARRAY); if (!NIL_P(*n_samples)) { parsed.n_samples = NUM2INT(*n_samples); - if (TYPE(*samples) == T_ARRAY) { + if (is_array) { if (RARRAY_LEN(*samples) < parsed.n_samples) { rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(*samples), parsed.n_samples); } } // Should check when samples.respond_to?(:length)? 
} else { - if (TYPE(*samples) == T_ARRAY) { + if (is_array) { if (RARRAY_LEN(*samples) > INT_MAX) { rb_raise(rb_eArgError, "samples are too long"); } @@ -342,7 +343,7 @@ parse_samples(VALUE *samples, VALUE *n_samples) } else { VALUE store; parsed.samples = rb_alloc_tmp_buffer(&store, sizeof(float) * parsed.n_samples); - if (TYPE(*samples) == T_ARRAY) { + if (is_array) { for (int i = 0; i < parsed.n_samples; i++) { parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(*samples, i)); } From 635cadc2ead9a4abb393761f92403c010e9678e5 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 19:29:09 +0900 Subject: [PATCH 36/44] Define wrapper function of whisper_full --- bindings/ruby/ext/ruby_whisper_context.c | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 9b591351ef1..d92c8c382fa 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -25,6 +25,13 @@ extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context); ID transcribe_option_names[1]; +typedef struct full_args { + struct whisper_context *context; + struct whisper_full_params *params; + float *samples; + int n_samples; +} full_args; + static void ruby_whisper_free(ruby_whisper *rw) { @@ -373,6 +380,14 @@ release_samples(parsed_samples_t *parsed_args) *parsed_args = (parsed_samples_t){0}; } +static VALUE +rb_full(VALUE rb_args) +{ + full_args *args = (full_args *)rb_args; + int result = whisper_full(args->context, *args->params, args->samples, args->n_samples); + return INT2NUM(result); +} + /* * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text * Not thread safe for same context @@ -400,8 +415,16 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); // FIXME: Ensure 
release_samples is called in case of exception - const int result = whisper_full(rw->context, rwp->params, parsed.samples, parsed.n_samples); + // Defining Samples class and wrapping parsed.samples in it might help + full_args args = { + rw->context, + &rwp->params, + parsed.samples, + parsed.n_samples, + }; + VALUE rb_result = rb_full((VALUE)&args); release_samples(&parsed); + const int result = NUM2INT(rb_result); if (0 == result) { return self; } else { From adfeb10f0ec73fe06f7b07a7133025dd31117a6d Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 19:35:35 +0900 Subject: [PATCH 37/44] Change signature of parse_samples for rb_ensure --- bindings/ruby/ext/ruby_whisper_context.c | 12 ++++++++---- bindings/ruby/ext/ruby_whisper_vad_context.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index d92c8c382fa..966b6bd2566 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -369,15 +369,19 @@ parse_samples(VALUE *samples, VALUE *n_samples) return parsed; } -void -release_samples(parsed_samples_t *parsed_args) +VALUE +release_samples(VALUE rb_parsed_args) { + parsed_samples_t *parsed_args = (parsed_samples_t *)rb_parsed_args; + // When parsed_args->memview_exported is false, // parsed_args->samples was allocated by rb_alloc_tmp_buffer, so no need to free here if (parsed_args->memview_exported) { rb_memory_view_release(&parsed_args->memview); } *parsed_args = (parsed_samples_t){0}; + + return Qnil; } static VALUE @@ -423,7 +427,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) parsed.n_samples, }; VALUE rb_result = rb_full((VALUE)&args); - release_samples(&parsed); + release_samples((VALUE)&parsed); const int result = NUM2INT(rb_result); if (0 == result) { return self; @@ -474,7 +478,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) struct parsed_samples_t parsed = 
parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); const int result = whisper_full_parallel(rw->context, rwp->params, parsed.samples, parsed.n_samples, n_processors); - release_samples(&parsed); + release_samples((VALUE)&parsed); if (0 == result) { return self; } else { diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index fa4694f3404..c4046871476 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -8,7 +8,7 @@ extern const rb_data_type_t ruby_whisper_vad_params_type; extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); extern parsed_samples_t parse_samples(VALUE *samples, VALUE *n_samples); -extern void release_samples(struct parsed_samples_t *parsed); +extern VALUE release_samples(VALUE parsed); extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); @@ -91,7 +91,7 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) parsed.samples, parsed.n_samples ); - release_samples(&parsed); + release_samples((VALUE)&parsed); return ruby_whisper_vad_segments_s_init(segments); } From d2ba09112652dbeb82f2e95dc83771f7491d4ba3 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 19:54:54 +0900 Subject: [PATCH 38/44] Ensure release MemoryView --- bindings/ruby/ext/ruby_whisper_context.c | 32 ++++++++++++++++---- bindings/ruby/ext/ruby_whisper_vad_context.c | 29 ++++++++++++++---- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 966b6bd2566..f8d06d73ebe 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -32,6 +32,14 @@ typedef struct full_args { int n_samples; } full_args; +typedef struct full_parallel_args { + struct 
whisper_context *context; + struct whisper_full_params *params; + float *samples; + int n_samples; + int n_processors; +} full_parallel_args; + static void ruby_whisper_free(ruby_whisper *rw) { @@ -418,16 +426,13 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); - // FIXME: Ensure release_samples is called in case of exception - // Defining Samples class and wrapping parsed.samples in it might help full_args args = { rw->context, &rwp->params, parsed.samples, parsed.n_samples, }; - VALUE rb_result = rb_full((VALUE)&args); - release_samples((VALUE)&parsed); + VALUE rb_result = rb_ensure(rb_full, (VALUE)&args, release_samples, (VALUE)&parsed); const int result = NUM2INT(rb_result); if (0 == result) { return self; @@ -436,6 +441,14 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) } } +static VALUE +rb_full_parallel(VALUE rb_args) +{ + full_parallel_args *args = (full_parallel_args *)rb_args; + int result = whisper_full_parallel(args->context, *args->params, args->samples, args->n_samples, args->n_processors); + return INT2NUM(result); +} + /* * Split the input audio in chunks and process each chunk separately using whisper_full_with_state() * Result is stored in the default state of the context @@ -477,8 +490,15 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) } struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); prepare_transcription(rwp, &self); - const int result = whisper_full_parallel(rw->context, rwp->params, parsed.samples, parsed.n_samples, n_processors); - release_samples((VALUE)&parsed); + const full_parallel_args args = { + rw->context, + &rwp->params, + parsed.samples, + parsed.n_samples, + n_processors, + }; + const VALUE rb_result = rb_ensure(rb_full_parallel, (VALUE)&args, release_samples, (VALUE)&parsed); + const int result = NUM2INT(rb_result); if (0 == result) { return self; } else { diff 
--git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index c4046871476..cdf74331cbb 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -12,6 +12,13 @@ extern VALUE release_samples(VALUE parsed); extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); +typedef struct segments_from_samples_args { + struct whisper_vad_context *context; + struct whisper_vad_params *params; + float *samples; + int n_samples; +} segments_from_samples_args; + static size_t ruby_whisper_vad_context_memsize(const void *p) { @@ -70,6 +77,16 @@ ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path) return Qnil; } +static VALUE +rb_segments_from_samples(VALUE rb_args) +{ + segments_from_samples_args *args = (segments_from_samples_args *)rb_args; + + struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(args->context, *args->params, args->samples, args->n_samples); + + return ruby_whisper_vad_segments_s_init(segments); +} + static VALUE ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) { @@ -85,15 +102,15 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) VALUE n_samples = argc == 2 ? 
Qnil : argv[2]; struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); - struct whisper_vad_segments *segments = whisper_vad_segments_from_samples( + segments_from_samples_args args = { rwvc->context, - rwvp->params, + &rwvp->params, parsed.samples, - parsed.n_samples - ); - release_samples((VALUE)&parsed); + parsed.n_samples, + }; + VALUE segments = rb_ensure(rb_segments_from_samples, (VALUE)&args, release_samples, (VALUE)&parsed); - return ruby_whisper_vad_segments_s_init(segments); + return segments; } void init_ruby_whisper_vad_context(VALUE *mVAD) From 22a4809a685b9673e87a1345e94c9070e976322f Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 20:49:11 +0900 Subject: [PATCH 39/44] Extract fill_samples function --- bindings/ruby/ext/ruby_whisper_context.c | 51 ++++++++++++++++-------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index f8d06d73ebe..1238644ade2 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -25,6 +25,12 @@ extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context); ID transcribe_option_names[1]; +typedef struct fill_samples_args { + float *dest; + VALUE *src; + int n_samples; +} fill_samples_args; + typedef struct full_args { struct whisper_context *context; struct whisper_full_params *params; @@ -300,6 +306,28 @@ check_memory_view(rb_memory_view_t *memview) return true; } +static VALUE +fill_samples(VALUE rb_args) +{ + fill_samples_args *args = (fill_samples_args *)rb_args; + + if (RB_TYPE_P(*args->src, T_ARRAY)) { + for (int i = 0; i < args->n_samples; i++) { + args->dest[i] = RFLOAT_VALUE(rb_ary_entry(*args->src, i)); + } + } else { + // TODO: use rb_block_call + VALUE iter = rb_funcall(*args->src, id_to_enum, 1, rb_str_new2("each")); + for (int i = 0; i < args->n_samples; i++) { + // TODO: check if iter is exhausted and raise 
ArgumentError appropriately + VALUE sample = rb_funcall(iter, id_next, 0); + args->dest[i] = RFLOAT_VALUE(sample); + } + } + + return Qnil; +} + struct parsed_samples_t parse_samples(VALUE *samples, VALUE *n_samples) { @@ -356,22 +384,13 @@ parse_samples(VALUE *samples, VALUE *n_samples) if (parsed.memview_exported) { parsed.samples = (float *)parsed.memview.data; } else { - VALUE store; - parsed.samples = rb_alloc_tmp_buffer(&store, sizeof(float) * parsed.n_samples); - if (is_array) { - for (int i = 0; i < parsed.n_samples; i++) { - parsed.samples[i] = RFLOAT_VALUE(rb_ary_entry(*samples, i)); - } - } else { - // TODO: use rb_block_call - VALUE iter = rb_funcall(*samples, id_to_enum, 1, rb_str_new2("each")); - for (int i = 0; i < parsed.n_samples; i++) { - // TODO: check if iter is exhausted and raise ArgumentError appropriately - VALUE sample = rb_funcall(iter, id_next, 0); - parsed.samples[i] = RFLOAT_VALUE(sample); - } - } - rb_free_tmp_buffer(&store); + parsed.samples = ALLOC_N(float, parsed.n_samples); + fill_samples_args args = { + parsed.samples, + samples, + parsed.n_samples, + }; + fill_samples((VALUE)&args); } return parsed; From 55e495410462f4a1229d416e0bc2ef77d9a1e740 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 20:57:55 +0900 Subject: [PATCH 40/44] Free samples memory when filling it failed --- bindings/ruby/ext/ruby_whisper_context.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 1238644ade2..a9905eb1c0e 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -390,7 +390,12 @@ parse_samples(VALUE *samples, VALUE *n_samples) samples, parsed.n_samples, }; - fill_samples((VALUE)&args); + int state; + rb_protect(fill_samples, (VALUE)&args, &state); + if (state) { + xfree(parsed.samples); + rb_jump_tag(state); + } } return parsed; From 
58fb46ed44fc0de4248d055e664991e535144d23 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 20:59:30 +0900 Subject: [PATCH 41/44] Free samples memory when transcription failed --- bindings/ruby/ext/ruby_whisper_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index a9905eb1c0e..73053de6e5d 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -406,10 +406,10 @@ release_samples(VALUE rb_parsed_args) { parsed_samples_t *parsed_args = (parsed_samples_t *)rb_parsed_args; - // When parsed_args->memview_exported is false, - // parsed_args->samples was allocated by rb_alloc_tmp_buffer, so no need to free here if (parsed_args->memview_exported) { rb_memory_view_release(&parsed_args->memview); + } else { + xfree(parsed_args->samples); } *parsed_args = (parsed_samples_t){0}; From 7938864b7f6f38360604caa9f814649baf7c51d6 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 21:10:03 +0900 Subject: [PATCH 42/44] Prepare transcription in wrapper function --- bindings/ruby/ext/ruby_whisper_context.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 73053de6e5d..e3130f9c699 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -32,15 +32,17 @@ typedef struct fill_samples_args { } fill_samples_args; typedef struct full_args { + VALUE *rb_context; struct whisper_context *context; - struct whisper_full_params *params; + ruby_whisper_params *params; float *samples; int n_samples; } full_args; typedef struct full_parallel_args { + VALUE *rb_context; struct whisper_context *context; - struct whisper_full_params *params; + ruby_whisper_params *params; float *samples; int n_samples; int n_processors; @@ -420,7 
+422,10 @@ static VALUE rb_full(VALUE rb_args) { full_args *args = (full_args *)rb_args; - int result = whisper_full(args->context, *args->params, args->samples, args->n_samples); + + prepare_transcription(args->params, args->rb_context); + int result = whisper_full(args->context, args->params->params, args->samples, args->n_samples); + return INT2NUM(result); } @@ -449,10 +454,10 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) VALUE n_samples = argc == 2 ? Qnil : argv[2]; struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); - prepare_transcription(rwp, &self); full_args args = { + &self, rw->context, - &rwp->params, + rwp, parsed.samples, parsed.n_samples, }; @@ -469,7 +474,10 @@ static VALUE rb_full_parallel(VALUE rb_args) { full_parallel_args *args = (full_parallel_args *)rb_args; - int result = whisper_full_parallel(args->context, *args->params, args->samples, args->n_samples, args->n_processors); + + prepare_transcription(args->params, args->rb_context); + int result = whisper_full_parallel(args->context, args->params->params, args->samples, args->n_samples, args->n_processors); + return INT2NUM(result); } @@ -513,10 +521,10 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) break; } struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); - prepare_transcription(rwp, &self); const full_parallel_args args = { + &self, rw->context, - &rwp->params, + rwp, parsed.samples, parsed.n_samples, n_processors, From f63d6f412b95185aeac260209c234c82ce98a981 Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 21:32:46 +0900 Subject: [PATCH 43/44] Change function name --- bindings/ruby/ext/ruby_whisper_context.c | 8 ++++---- bindings/ruby/ext/ruby_whisper_vad_context.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index e3130f9c699..c60a2b22dd6 100644 --- 
a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -419,7 +419,7 @@ release_samples(VALUE rb_parsed_args) } static VALUE -rb_full(VALUE rb_args) +full_body(VALUE rb_args) { full_args *args = (full_args *)rb_args; @@ -461,7 +461,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) parsed.samples, parsed.n_samples, }; - VALUE rb_result = rb_ensure(rb_full, (VALUE)&args, release_samples, (VALUE)&parsed); + VALUE rb_result = rb_ensure(full_body, (VALUE)&args, release_samples, (VALUE)&parsed); const int result = NUM2INT(rb_result); if (0 == result) { return self; @@ -471,7 +471,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) } static VALUE -rb_full_parallel(VALUE rb_args) +full_parallel_body(VALUE rb_args) { full_parallel_args *args = (full_parallel_args *)rb_args; @@ -529,7 +529,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) parsed.n_samples, n_processors, }; - const VALUE rb_result = rb_ensure(rb_full_parallel, (VALUE)&args, release_samples, (VALUE)&parsed); + const VALUE rb_result = rb_ensure(full_parallel_body, (VALUE)&args, release_samples, (VALUE)&parsed); const int result = NUM2INT(rb_result); if (0 == result) { return self; diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index cdf74331cbb..1da5388f650 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -78,7 +78,7 @@ ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path) } static VALUE -rb_segments_from_samples(VALUE rb_args) +segments_from_samples_body(VALUE rb_args) { segments_from_samples_args *args = (segments_from_samples_args *)rb_args; @@ -108,7 +108,7 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) parsed.samples, parsed.n_samples, }; - VALUE segments = rb_ensure(rb_segments_from_samples, (VALUE)&args, release_samples, (VALUE)&parsed); + VALUE segments = 
rb_ensure(segments_from_samples_body, (VALUE)&args, release_samples, (VALUE)&parsed); return segments; } From ac13945087059c9f77a33cb59842fdc6c096a19f Mon Sep 17 00:00:00 2001 From: Kitaiti Makoto Date: Thu, 29 Jan 2026 21:52:02 +0900 Subject: [PATCH 44/44] Simplify function boundary --- bindings/ruby/ext/ruby_whisper_context.c | 45 +++++++++----------- bindings/ruby/ext/ruby_whisper_vad_context.c | 21 +++++---- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index c60a2b22dd6..84790e3dedf 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -32,17 +32,15 @@ typedef struct fill_samples_args { } fill_samples_args; typedef struct full_args { - VALUE *rb_context; - struct whisper_context *context; - ruby_whisper_params *params; + VALUE *context; + VALUE *params; float *samples; int n_samples; } full_args; typedef struct full_parallel_args { - VALUE *rb_context; - struct whisper_context *context; - ruby_whisper_params *params; + VALUE *context; + VALUE *params; float *samples; int n_samples; int n_processors; @@ -423,8 +421,13 @@ full_body(VALUE rb_args) { full_args *args = (full_args *)rb_args; - prepare_transcription(args->params, args->rb_context); - int result = whisper_full(args->context, args->params->params, args->samples, args->n_samples); + ruby_whisper *rw; + ruby_whisper_params *rwp; + GetContext(*args->context, rw); + TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp); + + prepare_transcription(rwp, args->context); + int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples); return INT2NUM(result); } @@ -446,18 +449,12 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); } - ruby_whisper *rw; - ruby_whisper_params *rwp; - GetContext(self, rw); - VALUE 
params = argv[0]; - TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); VALUE n_samples = argc == 2 ? Qnil : argv[2]; struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); full_args args = { &self, - rw->context, - rwp, + &argv[0], parsed.samples, parsed.n_samples, }; @@ -475,8 +472,13 @@ full_parallel_body(VALUE rb_args) { full_parallel_args *args = (full_parallel_args *)rb_args; - prepare_transcription(args->params, args->rb_context); - int result = whisper_full_parallel(args->context, args->params->params, args->samples, args->n_samples, args->n_processors); + ruby_whisper *rw; + ruby_whisper_params *rwp; + GetContext(*args->context, rw); + TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp); + + prepare_transcription(rwp, args->context); + int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors); return INT2NUM(result); } @@ -501,13 +503,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..4)", argc); } - ruby_whisper *rw; - ruby_whisper_params *rwp; - GetContext(self, rw); - VALUE params = argv[0]; - TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); VALUE n_samples = argc == 2 ? 
Qnil : argv[2]; - int n_processors; switch (argc) { case 2: @@ -523,8 +519,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); const full_parallel_args args = { &self, - rw->context, - rwp, + &argv[0], parsed.samples, parsed.n_samples, n_processors, diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c index 1da5388f650..97c9736b6f4 100644 --- a/bindings/ruby/ext/ruby_whisper_vad_context.c +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -13,8 +13,8 @@ extern VALUE release_samples(VALUE parsed); extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); typedef struct segments_from_samples_args { - struct whisper_vad_context *context; - struct whisper_vad_params *params; + VALUE *context; + VALUE *params; float *samples; int n_samples; } segments_from_samples_args; @@ -82,7 +82,12 @@ segments_from_samples_body(VALUE rb_args) { segments_from_samples_args *args = (segments_from_samples_args *)rb_args; - struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(args->context, *args->params, args->samples, args->n_samples); + ruby_whisper_vad_context *rwvc; + ruby_whisper_vad_params *rwvp; + GetVADContext(*args->context, rwvc); + GetVADParams(*args->params, rwvp); + + struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, args->samples, args->n_samples); return ruby_whisper_vad_segments_s_init(segments); } @@ -94,17 +99,11 @@ ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self) rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc); } - ruby_whisper_vad_context *rwvc; - ruby_whisper_vad_params *rwvp; - GetVADContext(self, rwvc); - VALUE params = argv[0]; - GetVADParams(params, rwvp); VALUE n_samples = argc == 2 ? 
Qnil : argv[2]; - struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples); segments_from_samples_args args = { - rwvc->context, - &rwvp->params, + &self, + &argv[0], parsed.samples, parsed.n_samples, };