From c3682d7350454c208809584849806d7303a9be5d Mon Sep 17 00:00:00 2001 From: Dragos Daian Date: Thu, 18 Apr 2024 15:44:19 +0200 Subject: [PATCH] Fix crash and silence (#70) * fix crash of thread by waiting for it to finish. * fix for silence maybe, just don't do anything if silence * Update register_types.cpp * upd --- .../godot_whisper/capture_stream_to_text.gd | 16 ++++++++++++++-- bin/project.godot | 3 ++- scripts/build-dev.sh | 1 - scripts/build.sh | 1 - src/register_types.cpp | 4 ++-- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd index 02688be0..0bec4aaf 100644 --- a/bin/addons/godot_whisper/capture_stream_to_text.gd +++ b/bin/addons/godot_whisper/capture_stream_to_text.gd @@ -19,6 +19,8 @@ func _get_configuration_warnings(): recording = value if recording: _ready() + else: + thread.wait_to_finish() get: return recording ## The interval at which transcribing is done. Use a value bigger than the time it takes to transcribe (eg. depends on model). @@ -47,6 +49,9 @@ var thread : Thread func _ready(): if Engine.is_editor_hint(): return + if thread && thread.is_alive(): + recording = false + thread.wait_to_finish() thread = Thread.new() _effect_capture.clear_buffer() thread.start(transcribe_thread) @@ -62,6 +67,10 @@ func transcribe_thread(): if resampled.size() <= 0: OS.delay_msec(transcribe_interval * 1000) continue + var no_activity := voice_activity_detection(resampled) + #if no_activity: + #print("no activity") + #continue var total_time : float= (resampled.size() as float) / SpeechToText.SPEECH_SETTING_SAMPLE_RATE var audio_ctx : int = total_time * 1500 / 30 + 128 if !use_dynamic_audio_context: @@ -75,7 +84,6 @@ func transcribe_thread(): var finish_sentence = false if total_time > maximum_sentence_time: finish_sentence = true - var no_activity := voice_activity_detection(resampled) var text : String for token in tokens: text += token["text"] @@ -84,10 +92,14 @@ func transcribe_thread(): finish_sentence = true if total_time < minimum_sentence_time || abs(tokens.size() - last_token_count) > halucinating_count: finish_sentence = false + var time_processing = (Time.get_ticks_msec() - start_time) + if no_activity: + #_accumulated_frames = [] + continue if finish_sentence: _accumulated_frames = _accumulated_frames.slice(_accumulated_frames.size() - (0.2 * mix_rate)) + #if !no_activity: call_deferred("emit_signal", "transcribed_msg", finish_sentence, full_text) - var time_processing = (Time.get_ticks_msec() - start_time) last_token_count = tokens.size() #print(text) print(full_text) diff --git a/bin/project.godot b/bin/project.godot index f1e5f81e..c439fffe 100644 --- a/bin/project.godot +++ b/bin/project.godot @@ -17,9 +17,10 @@ config/icon="res://icon.png" [audio] driver/enable_input=true +driver/mix_rate=48000 enable_audio_input=true mix_rate=48000 -input/transcribe/max_tokens=64 +input/transcribe/vad_treshold=4.0 [display] diff --git a/scripts/build-dev.sh b/scripts/build-dev.sh index a1475772..984707f1 100755 --- a/scripts/build-dev.sh +++ b/scripts/build-dev.sh @@ -1,5 +1,4 @@ scons target=template_debug generate_bindings=no arch=universal dev_build=yes #scons target=template_release generate_bindings=no arch=universal precision=single rm -rf demo/addons/godot_whisper/bin -cp -rf bin/addons/godot_whisper/bin demo/addons/godot_whisper/bin diff --git a/scripts/build.sh b/scripts/build.sh index 6b9afcd9..54dd8c12 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -1,4 +1,3 @@ scons target=template_release generate_bindings=no arch=universal precision=single rm -rf samples/godot_whisper/addons/godot_whisper/bin -cp -rf bin/addons/godot_whisper/bin samples/godot_whisper/addons/godot_whisper/bin diff --git a/src/register_types.cpp b/src/register_types.cpp index ee3e29a3..d3b72e88 100644 --- a/src/register_types.cpp +++ b/src/register_types.cpp @@ -62,9 +62,9 @@ void initialize_whisper_module(ModuleInitializationLevel p_level) { // register settings register_setting("audio/input/transcribe/entropy_treshold", 2.8, PROPERTY_HINT_NONE, {}); - register_setting("audio/input/transcribe/freq_treshold", 200, PROPERTY_HINT_NONE, {}); + register_setting("audio/input/transcribe/freq_treshold", 200.0, PROPERTY_HINT_NONE, {}); register_setting("audio/input/transcribe/max_tokens", 16, PROPERTY_HINT_NONE, {}); - register_setting("audio/input/transcribe/vad_treshold", 0.3, PROPERTY_HINT_NONE, {}); + register_setting("audio/input/transcribe/vad_treshold", 2.0, PROPERTY_HINT_NONE, {}); register_setting("audio/input/transcribe/use_gpu", true, PROPERTY_HINT_NONE, {}); register_setting("audio/input/transcribe/speed_up_2x", false, PROPERTY_HINT_NONE, {}); }