From 8f76d1e5557c7a281471ca14d60bc15d81d43a08 Mon Sep 17 00:00:00 2001 From: "K. S. Ernest (iFire) Lee" Date: Fri, 1 Dec 2023 14:37:22 -0800 Subject: [PATCH] Cleanup the code. --- macros.h | 33 -- src/register_types.cpp | 2 - src/speech_processor.del | 198 ---------- src/speech_processor1cp.del | 362 ------------------ src/speech_to_text.cpp | 715 ------------------------------------ src/speech_to_text.h | 5 +- 6 files changed, 1 insertion(+), 1314 deletions(-) delete mode 100644 macros.h delete mode 100644 src/speech_processor.del delete mode 100644 src/speech_processor1cp.del diff --git a/macros.h b/macros.h deleted file mode 100644 index 3c594b9a..00000000 --- a/macros.h +++ /dev/null @@ -1,33 +0,0 @@ -/*************************************************************************/ -/* macros.h */ -/*************************************************************************/ -/* This file is part of: */ -/* GODOT ENGINE */ -/* https://godotengine.org */ -/*************************************************************************/ -/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ -/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ -/* */ -/* Permission is hereby granted, free of charge, to any person obtaining */ -/* a copy of this software and associated documentation files (the */ -/* "Software"), to deal in the Software without restriction, including */ -/* without limitation the rights to use, copy, modify, merge, publish, */ -/* distribute, sublicense, and/or sell copies of the Software, and to */ -/* permit persons to whom the Software is furnished to do so, subject to */ -/* the following conditions: */ -/* */ -/* The above copyright notice and this permission notice shall be */ -/* included in all copies or substantial portions of the Software. */ -/* */ -/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ -/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ -/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ -/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ -/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ -/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ -/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/*************************************************************************/ - -#ifndef SPEECH_MACROS_HPP -#define SPEECH_MACROS_HPP -#endif diff --git a/src/register_types.cpp b/src/register_types.cpp index 345b31eb..b184d33b 100644 --- a/src/register_types.cpp +++ b/src/register_types.cpp @@ -17,8 +17,6 @@ void uninitialize_whisper_module(ModuleInitializationLevel p_level) { extern "C" { -// Initialization. - GDExtensionBool GDE_EXPORT godot_whisper_library_init(const GDExtensionInterfaceGetProcAddress p_get_proc_address, GDExtensionClassLibraryPtr p_library, GDExtensionInitialization *r_initialization) { godot::GDExtensionBinding::InitObject init_obj(p_get_proc_address, p_library, r_initialization); diff --git a/src/speech_processor.del b/src/speech_processor.del deleted file mode 100644 index d3853c7b..00000000 --- a/src/speech_processor.del +++ /dev/null @@ -1,198 +0,0 @@ -#ifndef SPEECH_PROCESSOR_H -#define SPEECH_PROCESSOR_H - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -using namespace godot; - -class SpeechToTextProcessor : public Node { - GDCLASS(SpeechToTextProcessor, Node); - Mutex mutex; - -public: - enum { - SPEECH_SETTING_CHANNEL_COUNT = 1, - SPEECH_SETTING_MILLISECONDS_PER_SECOND = 1000, - SPEECH_SETTING_MILLISECONDS_PER_PACKET = 100, - SPEECH_SETTING_BUFFER_BYTE_COUNT = sizeof(int16_t), - SPEECH_SETTING_SAMPLE_RATE = 16000, - SPEECH_SETTING_BUFFER_FRAME_COUNT = SPEECH_SETTING_SAMPLE_RATE / SPEECH_SETTING_MILLISECONDS_PER_PACKET, - SPEECH_SETTING_INTERNAL_BUFFER_SIZE = 25 * 3 * 1276, - SPEECH_SETTING_VOICE_SAMPLE_RATE = SPEECH_SETTING_SAMPLE_RATE, - SPEECH_SETTING_VOICE_BUFFER_FRAME_COUNT = SPEECH_SETTING_BUFFER_FRAME_COUNT, - SPEECH_SETTING_PCM_BUFFER_SIZE = SPEECH_SETTING_BUFFER_FRAME_COUNT * SPEECH_SETTING_BUFFER_BYTE_COUNT * SPEECH_SETTING_CHANNEL_COUNT, - SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE = SPEECH_SETTING_VOICE_SAMPLE_RATE, - }; - - inline static float SPEECH_SETTING_PACKET_DELTA_TIME = float(SpeechToTextProcessor::SPEECH_SETTING_MILLISECONDS_PER_PACKET) / float(SpeechToTextProcessor::SPEECH_SETTING_MILLISECONDS_PER_SECOND); - -protected: - static void _bind_methods(); - -private: - unsigned char internal_buffer[(size_t)SPEECH_SETTING_INTERNAL_BUFFER_SIZE]; - -public: - bool decode_buffer(void *p_speech_decoder, - const PackedByteArray *p_compressed_buffer, - PackedByteArray *p_pcm_output_buffer, - const int p_compressed_buffer_size, - const int p_pcm_output_buffer_size) { - if (p_pcm_output_buffer->size() != p_pcm_output_buffer_size) { - ERR_PRINT("OpusCodec: decode_buffer output_buffer_size mismatch!"); - return false; - } - - return false; // TODO RETURN data. - } - -private: - int32_t record_mix_frames_processed = 0; - - void *encoder = nullptr; - AudioServer *audio_server = nullptr; - AudioStreamPlayer *audio_input_stream_player = nullptr; - Ref audio_effect_capture; - uint32_t mix_rate = 0; - PackedByteArray mix_byte_array; - Vector mix_reference_buffer; - Vector mix_capture_buffer; - - PackedFloat32Array mono_capture_real_array; - PackedFloat32Array mono_reference_real_array; - PackedFloat32Array capture_real_array; - PackedFloat32Array reference_real_array; - uint32_t capture_real_array_offset = 0; - - PackedByteArray pcm_byte_array_cache; - - // LibResample - SRC_STATE *libresample_state = nullptr; - int libresample_error = 0; - - int64_t capture_discarded_frames = 0; - int64_t capture_pushed_frames = 0; - int32_t capture_ring_limit = 0; - int32_t capture_ring_current_size = 0; - int32_t capture_ring_max_size = 0; - int64_t capture_ring_size_sum = 0; - int32_t capture_get_calls = 0; - int64_t capture_get_frames = 0; - -public: - struct SpeechInput { - PackedByteArray *pcm_byte_array = nullptr; - float volume = 0.0; - }; - - struct CompressedSpeechBuffer { - PackedByteArray *compressed_byte_array = nullptr; - int buffer_size = 0; - }; - - std::function speech_processed; - void register_speech_processed( - const std::function &callback) { - speech_processed = callback; - } - - uint32_t _resample_audio_buffer(const float *p_src, - const uint32_t p_src_frame_count, - const uint32_t p_src_samplerate, - const uint32_t p_target_samplerate, - float *p_dst); - - void start(); - void stop(); - - static void _get_capture_block(AudioServer *p_audio_server, - const uint32_t &p_mix_frame_count, - const Vector2 *p_process_buffer_in, - float *p_process_buffer_out); - - void _mix_audio(const Vector2 *p_process_buffer_in); - - static bool _16_pcm_mono_to_real_stereo(const PackedByteArray *p_src_buffer, - PackedVector2Array *p_dst_buffer); - - static bool _16_pcm_mono_to_real_mono( - const PackedByteArray *p_src_buffer, PackedFloat32Array *p_dst_buffer) { - uint32_t buffer_size = p_src_buffer->size(); - - ERR_FAIL_COND_V(buffer_size % 2, false); - - uint32_t frame_count = buffer_size / 2; - - const int16_t *src_buffer_ptr = - reinterpret_cast(p_src_buffer->ptr()); - real_t *real_buffer_ptr = reinterpret_cast(p_dst_buffer->ptrw()); - - for (uint32_t i = 0; i < frame_count; i++) { - float value = ((float)*src_buffer_ptr) / 32768.0f; - - *(real_buffer_ptr) = value; - - real_buffer_ptr++; - src_buffer_ptr++; - } - - return true; - } - - virtual bool decompress_buffer_internal( - void *speech_decoder, const PackedByteArray *p_read_byte_array, - const int p_read_size, PackedVector2Array *p_write_vec2_array) { - if (decode_buffer(speech_decoder, p_read_byte_array, &pcm_byte_array_cache, - p_read_size, SPEECH_SETTING_PCM_BUFFER_SIZE)) { - if (_16_pcm_mono_to_real_stereo(&pcm_byte_array_cache, - p_write_vec2_array)) { - return true; - } - } - return true; - } - - virtual Dictionary compress_buffer(const PackedByteArray &p_pcm_byte_array, - Dictionary p_output_buffer); - - virtual PackedVector2Array - decompress_buffer(void *p_speech_decoder, - const PackedByteArray &p_read_byte_array, - const int p_read_size, - PackedVector2Array p_write_vec2_array); - - void set_streaming_bus(const String &p_name); - bool set_audio_input_stream_player(Node *p_audio_input_stream_player); - - void set_process_all(bool p_active); - - void _setup(); - void _update_stats(); - - void _notification(int p_what); - - SpeechToTextProcessor(); - ~SpeechToTextProcessor(); -}; - -#endif // SPEECH_PROCESSOR_H diff --git a/src/speech_processor1cp.del b/src/speech_processor1cp.del deleted file mode 100644 index d96f0cf5..00000000 --- a/src/speech_processor1cp.del +++ /dev/null @@ -1,362 +0,0 @@ -#include "speech_processor.h" - -#include - -#include - -#define STEREO_CHANNEL_COUNT 2 - -#define SIGNED_32_BIT_SIZE 2147483647 -#define UNSIGNED_32_BIT_SIZE 4294967295 -#define SIGNED_16_BIT_SIZE 32767 -#define UNSIGNED_16_BIT_SIZE 65536 - -#define RECORD_MIX_FRAMES 1024 * 2 -#define RESAMPLED_BUFFER_FACTOR sizeof(int) - -void SpeechToTextProcessor::_bind_methods() { - ClassDB::bind_method(D_METHOD("start"), &SpeechToTextProcessor::start); - ClassDB::bind_method(D_METHOD("stop"), &SpeechToTextProcessor::stop); - ClassDB::bind_method(D_METHOD("set_streaming_bus", "name"), - &SpeechToTextProcessor::set_streaming_bus); - ClassDB::bind_method(D_METHOD("set_audio_input_stream_player", "stream_player"), - &SpeechToTextProcessor::set_audio_input_stream_player); - ADD_SIGNAL(MethodInfo("speech_processed", - PropertyInfo(Variant::DICTIONARY, "packet"))); - - BIND_CONSTANT(SPEECH_SETTING_CHANNEL_COUNT); - BIND_CONSTANT(SPEECH_SETTING_MILLISECONDS_PER_PACKET); - BIND_CONSTANT(SPEECH_SETTING_BUFFER_BYTE_COUNT); - BIND_CONSTANT(SPEECH_SETTING_SAMPLE_RATE); - BIND_CONSTANT(SPEECH_SETTING_BUFFER_FRAME_COUNT); - BIND_CONSTANT(SPEECH_SETTING_INTERNAL_BUFFER_SIZE); - BIND_CONSTANT(SPEECH_SETTING_VOICE_SAMPLE_RATE); - BIND_CONSTANT(SPEECH_SETTING_VOICE_BUFFER_FRAME_COUNT); - BIND_CONSTANT(SPEECH_SETTING_PCM_BUFFER_SIZE); - BIND_CONSTANT(SPEECH_SETTING_MILLISECONDS_PER_SECOND); - BIND_CONSTANT(SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - BIND_CONSTANT(SPEECH_SETTING_PACKET_DELTA_TIME); -} - -uint32_t SpeechToTextProcessor::_resample_audio_buffer( - const float *p_src, const uint32_t p_src_frame_count, - const uint32_t p_src_samplerate, const uint32_t p_target_samplerate, - float *p_dst) { - if (p_src_samplerate != p_target_samplerate) { - SRC_DATA src_data; - - src_data.data_in = p_src; - src_data.data_out = p_dst; - - src_data.input_frames = p_src_frame_count; - src_data.output_frames = p_src_frame_count * RESAMPLED_BUFFER_FACTOR; - - src_data.src_ratio = (double)p_target_samplerate / (double)p_src_samplerate; - src_data.end_of_input = 0; - - int error = src_process(libresample_state, &src_data); - if (error != 0) { - ERR_PRINT("resample_error!"); - return 0; - } - return src_data.output_frames_gen; - } else { - memcpy(p_dst, p_src, - static_cast(p_src_frame_count) * sizeof(float)); - return p_src_frame_count; - } -} - -void SpeechToTextProcessor::_get_capture_block(AudioServer *p_audio_server, - const uint32_t &p_mix_frame_count, - const Vector2 *p_process_buffer_in, - float *p_process_buffer_out) { - for (size_t i = 0; i < p_mix_frame_count; i++) { - float mono = - p_process_buffer_in[i].x * 0.5f + p_process_buffer_in[i].y * 0.5f; - p_process_buffer_out[i] = mono; - } -} - -void SpeechToTextProcessor::_mix_audio(const Vector2 *p_capture_buffer) { - if (audio_server) { - _get_capture_block(audio_server, RECORD_MIX_FRAMES, p_capture_buffer, mono_capture_real_array.ptrw()); - // Speaker frame. - _resample_audio_buffer( - mono_reference_real_array.ptr(), // Pointer to source buffer - RECORD_MIX_FRAMES, // Size of source buffer * sizeof(float) - AudioServer::get_singleton()->get_mix_rate(), // Source sample rate - SPEECH_SETTING_VOICE_SAMPLE_RATE, // Target sample rate - reference_real_array.ptrw() + - static_cast(capture_real_array_offset)); - // Microphone frame. - uint32_t resampled_frame_count = - capture_real_array_offset + - _resample_audio_buffer( - mono_capture_real_array.ptr(), // Pointer to source buffer - RECORD_MIX_FRAMES, // Size of source buffer * sizeof(float) - mix_rate, // Source sample rate - SPEECH_SETTING_VOICE_SAMPLE_RATE, // Target sample rate - capture_real_array.ptrw() + - static_cast(capture_real_array_offset)); - capture_real_array_offset = 0; - const float *capture_real_array_read_ptr = capture_real_array.ptr(); - double_t sum = 0; - while (capture_real_array_offset < resampled_frame_count - SPEECH_SETTING_BUFFER_FRAME_COUNT) { - memcpy(mix_byte_array.ptrw(), mono_capture_real_array.ptrw(), mix_byte_array.size()); - Dictionary voice_data_packet; - voice_data_packet["buffer"] = mix_byte_array; - float average = (float)sum / (float)SPEECH_SETTING_BUFFER_FRAME_COUNT; - voice_data_packet["loudness"] = average; - - emit_signal("speech_processed", voice_data_packet); - - if (speech_processed) { - SpeechInput speech_input; - speech_input.pcm_byte_array = &mix_byte_array; - speech_input.volume = average; - - speech_processed(&speech_input); - } - - capture_real_array_offset += SPEECH_SETTING_BUFFER_FRAME_COUNT; - } - - { - float *resampled_buffer_write_ptr = capture_real_array.ptrw(); - uint32_t remaining_resampled_buffer_frames = - (resampled_frame_count - capture_real_array_offset); - - // Copy the remaining frames to the beginning of the buffer for the next - // around - if (remaining_resampled_buffer_frames > 0) { - memmove(resampled_buffer_write_ptr, - capture_real_array_read_ptr + capture_real_array_offset, - static_cast(remaining_resampled_buffer_frames) * - sizeof(float)); - } - capture_real_array_offset = remaining_resampled_buffer_frames; - } - } -} - -void SpeechToTextProcessor::start() { - if (!ProjectSettings::get_singleton()->get("audio/enable_audio_input")) { - UtilityFunctions::print("Need to enable Project settings > Audio > Enable Audio Input " - "option to use capturing."); - return; - } - - if (!audio_input_stream_player || !audio_effect_capture.is_valid()) { - return; - } - //if (AudioDriver::get_singleton()) { - //mix_rate = AudioDriver::get_singleton()->get_mix_rate(); - mix_rate = AudioServer::get_singleton()->get_mix_rate(); - //} - audio_input_stream_player->play(); - audio_effect_capture->clear_buffer(); -} - -void SpeechToTextProcessor::stop() { - if (!audio_input_stream_player) { - return; - } - audio_input_stream_player->stop(); -} - -bool SpeechToTextProcessor::_16_pcm_mono_to_real_stereo( - const PackedByteArray *p_src_buffer, PackedVector2Array *p_dst_buffer) { - uint32_t buffer_size = p_src_buffer->size(); - - ERR_FAIL_COND_V(buffer_size % 2, false); - - uint32_t frame_count = buffer_size / 2; - - const int16_t *src_buffer_ptr = - reinterpret_cast(p_src_buffer->ptr()); - real_t *real_buffer_ptr = reinterpret_cast(p_dst_buffer->ptrw()); - - for (uint32_t i = 0; i < frame_count; i++) { - float value = ((float)*src_buffer_ptr) / 32768.0f; - - *(real_buffer_ptr + 0) = value; - *(real_buffer_ptr + 1) = value; - - real_buffer_ptr += 2; - src_buffer_ptr++; - } - - return true; -} - -Dictionary -SpeechToTextProcessor::compress_buffer(const PackedByteArray &p_pcm_byte_array, - Dictionary p_output_buffer) { - if (p_pcm_byte_array.size() != SPEECH_SETTING_PCM_BUFFER_SIZE) { - ERR_PRINT("SpeechToTextProcessor: PCM buffer is incorrect size!"); - return p_output_buffer; - } - - PackedByteArray *byte_array = nullptr; - if (!p_output_buffer.has("byte_array")) { - byte_array = (PackedByteArray *)&p_output_buffer["byte_array"]; - } - - if (!byte_array) { - ERR_PRINT("SpeechToTextProcessor: did not provide valid 'byte_array' in " - "p_output_buffer argument!"); - return p_output_buffer; - } else { - if (byte_array->size() == SPEECH_SETTING_PCM_BUFFER_SIZE) { - ERR_PRINT("SpeechToTextProcessor: output byte array is incorrect size!"); - return p_output_buffer; - } - } - - CompressedSpeechBuffer compressed_speech_buffer; - compressed_speech_buffer.compressed_byte_array = byte_array; - - p_output_buffer["buffer_size"] = -1; - - p_output_buffer["byte_array"] = - *compressed_speech_buffer.compressed_byte_array; - - return p_output_buffer; -} - -PackedVector2Array -SpeechToTextProcessor::decompress_buffer(void *p_speech_decoder, - const PackedByteArray &p_read_byte_array, - const int p_read_size, - PackedVector2Array p_write_vec2_array) { - if (p_read_byte_array.size() < p_read_size) { - ERR_PRINT("SpeechToTextProcessor: read byte_array size!"); - return PackedVector2Array(); - } - - if (decompress_buffer_internal(p_speech_decoder, &p_read_byte_array, - p_read_size, &p_write_vec2_array)) { - return p_write_vec2_array; - } - - return PackedVector2Array(); -} - -void SpeechToTextProcessor::set_streaming_bus(const String &p_name) { - if (!audio_server) { - return; - } - - int index = audio_server->get_bus_index(p_name); - if (index != -1) { - int effect_count = audio_server->get_bus_effect_count(index); - for (int i = 0; i < effect_count; i++) { - audio_effect_capture = audio_server->get_bus_effect(index, i); - } - } -} - -bool SpeechToTextProcessor::set_audio_input_stream_player( - Node *p_audio_input_stream_player) { - AudioStreamPlayer *player = - cast_to(p_audio_input_stream_player); - ERR_FAIL_COND_V(!player, false); - if (!audio_server) { - return false; - } - - audio_input_stream_player = player; - return true; -} - -void SpeechToTextProcessor::_setup() {} - -void SpeechToTextProcessor::set_process_all(bool p_active) { - set_process(p_active); - set_physics_process(p_active); - set_process_input(p_active); -} - -void SpeechToTextProcessor::_update_stats() {} - -void SpeechToTextProcessor::_notification(int p_what) { - switch (p_what) { - case NOTIFICATION_READY: - _setup(); - set_process_all(true); - break; - case NOTIFICATION_ENTER_TREE: - mix_byte_array.resize(SPEECH_SETTING_BUFFER_FRAME_COUNT * - SPEECH_SETTING_BUFFER_BYTE_COUNT); - mix_byte_array.fill(0); - mix_reference_buffer.resize(SPEECH_SETTING_BUFFER_FRAME_COUNT); - mix_reference_buffer.fill(0); - mix_capture_buffer.resize(SPEECH_SETTING_BUFFER_FRAME_COUNT); - mix_capture_buffer.fill(0); - break; - case NOTIFICATION_EXIT_TREE: - stop(); - mix_byte_array.resize(0); - - audio_server = nullptr; - break; - case NOTIFICATION_PROCESS: - if (audio_effect_capture.is_valid()) { - _update_stats(); - // This is pretty ugly, but needed to keep the audio from going out of - // sync - while (true) { - PackedVector2Array audio_frames = - audio_effect_capture->get_buffer(RECORD_MIX_FRAMES); - if (audio_frames.size() == 0) { - break; - } - capture_get_calls++; - capture_get_frames += audio_frames.size(); - capture_pushed_frames = audio_effect_capture->get_pushed_frames(); - capture_discarded_frames = audio_effect_capture->get_discarded_frames(); - capture_ring_limit = audio_effect_capture->get_buffer_length_frames(); - capture_ring_current_size = - audio_effect_capture->get_frames_available(); - capture_ring_size_sum += capture_ring_current_size; - capture_ring_max_size = - (capture_ring_current_size > capture_ring_max_size) - ? capture_ring_current_size - : capture_ring_max_size; - _mix_audio(audio_frames.ptrw()); - record_mix_frames_processed++; - } - } - break; - } -} - -SpeechToTextProcessor::SpeechToTextProcessor() { - capture_discarded_frames = 0; - capture_pushed_frames = 0; - capture_ring_limit = 0; - capture_ring_current_size = 0; - capture_ring_max_size = 0; - capture_ring_size_sum = 0; - capture_get_calls = 0; - capture_get_frames = 0; - - mono_capture_real_array.resize(RECORD_MIX_FRAMES); - mono_capture_real_array.fill(0); - mono_reference_real_array.resize(RECORD_MIX_FRAMES); - mono_reference_real_array.fill(0); - capture_real_array.resize(RECORD_MIX_FRAMES * RESAMPLED_BUFFER_FACTOR); - capture_real_array.fill(0); - reference_real_array.resize(RECORD_MIX_FRAMES * RESAMPLED_BUFFER_FACTOR); - reference_real_array.fill(0); - pcm_byte_array_cache.resize(SPEECH_SETTING_PCM_BUFFER_SIZE); - pcm_byte_array_cache.fill(0); - libresample_state = src_new(SRC_SINC_BEST_QUALITY, - SPEECH_SETTING_CHANNEL_COUNT, &libresample_error); - audio_server = AudioServer::get_singleton(); -} - -SpeechToTextProcessor::~SpeechToTextProcessor() { - libresample_state = src_delete(libresample_state); -} diff --git a/src/speech_to_text.cpp b/src/speech_to_text.cpp index 6f9adf12..470d1132 100644 --- a/src/speech_to_text.cpp +++ b/src/speech_to_text.cpp @@ -112,721 +112,6 @@ SpeechToText::SpeechToText() { context_instance = whisper_init_from_file_with_params(params.model.c_str(), context_parameters); } -/* -void SpeechToText::preallocate_buffers() { - input_byte_array.resize(SpeechToTextProcessor::SPEECH_SETTING_PCM_BUFFER_SIZE); - input_byte_array.fill(0); - compression_output_byte_array.resize( - SpeechToTextProcessor::SPEECH_SETTING_PCM_BUFFER_SIZE); - compression_output_byte_array.fill(0); - for (int i = 0; i < MAX_AUDIO_BUFFER_ARRAY_SIZE; i++) { - input_audio_buffer_array[i].compressed_byte_array.resize( - SpeechToTextProcessor::SPEECH_SETTING_PCM_BUFFER_SIZE); - input_audio_buffer_array[i].compressed_byte_array.fill(0); - } -} - -void SpeechToText::setup_connections() { - if (speech_processor) { - speech_processor->register_speech_processed( - std::function(std::bind( - &SpeechToText::speech_processed, this, std::placeholders::_1))); - } -} - -SpeechToText::InputPacket *SpeechToText::get_next_valid_input_packet() { - if (current_input_size < MAX_AUDIO_BUFFER_ARRAY_SIZE) { - InputPacket *input_packet = &input_audio_buffer_array[current_input_size]; - current_input_size++; - return input_packet; - } else { - for (int i = MAX_AUDIO_BUFFER_ARRAY_SIZE - 1; i > 0; i--) { - memcpy(input_audio_buffer_array[i - 1].compressed_byte_array.ptrw(), - input_audio_buffer_array[i].compressed_byte_array.ptr(), - SpeechToTextProcessor::SPEECH_SETTING_PCM_BUFFER_SIZE); - - input_audio_buffer_array[i - 1].buffer_size = - input_audio_buffer_array[i].buffer_size; - input_audio_buffer_array[i - 1].loudness = - input_audio_buffer_array[i].loudness; - } - skipped_audio_packets++; - return &input_audio_buffer_array[MAX_AUDIO_BUFFER_ARRAY_SIZE - 1]; - } -} - -void SpeechToText::speech_processed(SpeechToTextProcessor::SpeechInput *p_mic_input) { - PackedByteArray *mic_input_byte_array = p_mic_input->pcm_byte_array; - if (!mic_input_byte_array) { - return; - } - memcpy(input_byte_array.ptrw(), mic_input_byte_array->ptr(), - SpeechToTextProcessor::SPEECH_SETTING_PCM_BUFFER_SIZE); - if (!input_byte_array.size()) { - return; - } - bool ok = SpeechToTextProcessor::_16_pcm_mono_to_real_mono(&input_byte_array, &uncompressed_audio); - ERR_FAIL_COND(!ok); - if (!context_instance) { - return; - } - - whisper_full_params whispher_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - whispher_params.print_progress = false; - whispher_params.print_special = params.print_special; - whispher_params.print_realtime = false; - whispher_params.print_timestamps = !params.no_timestamps; - whispher_params.translate = params.translate; - whispher_params.single_segment = true; - whispher_params.max_tokens = params.max_tokens; - whispher_params.language = params.language.c_str(); - whispher_params.n_threads = params.n_threads; - whispher_params.audio_ctx = params.audio_ctx; - whispher_params.speed_up = params.speed_up; - whispher_params.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); - whispher_params.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size(); - - // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured - whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); - - if (whisper_full(context_instance, whispher_params, uncompressed_audio.ptr(), SpeechToTextProcessor::SPEECH_SETTING_BUFFER_FRAME_COUNT) != 0) { - ERR_PRINT("Failed to process audio"); - return; - } - - const int n_segments = whisper_full_n_segments(context_instance); - for (int i = 0; i < n_segments; ++i) { - const char *text = whisper_full_get_segment_text(context_instance, i); - UtilityFunctions::print(vformat("%s", text)); - } -} - -int SpeechToText::get_jitter_buffer_speedup() const { - return JITTER_BUFFER_SPEEDUP; -} - -void SpeechToText::set_jitter_buffer_speedup(int p_jitter_buffer_speedup) { - JITTER_BUFFER_SPEEDUP = p_jitter_buffer_speedup; -} - -int SpeechToText::get_jitter_buffer_slowdown() const { - return JITTER_BUFFER_SLOWDOWN; -} - -void SpeechToText::set_jitter_buffer_slowdown(int p_jitter_buffer_slowdown) { - JITTER_BUFFER_SLOWDOWN = p_jitter_buffer_slowdown; -} - -float SpeechToText::get_stream_speedup_pitch() const { - return STREAM_SPEEDUP_PITCH; -} - -void SpeechToText::set_stream_speedup_pitch(float p_stream_speedup_pitch) { - STREAM_SPEEDUP_PITCH = p_stream_speedup_pitch; -} - -int SpeechToText::get_max_jitter_buffer_size() const { - return MAX_JITTER_BUFFER_SIZE; -} - -void SpeechToText::set_max_jitter_buffer_size(int p_max_jitter_buffer_size) { - MAX_JITTER_BUFFER_SIZE = p_max_jitter_buffer_size; -} - -float SpeechToText::get_buffer_delay_threshold() const { - return BUFFER_DELAY_THRESHOLD; -} - -void SpeechToText::set_buffer_delay_threshold(float p_buffer_delay_threshold) { - BUFFER_DELAY_THRESHOLD = p_buffer_delay_threshold; -} - -float SpeechToText::get_stream_standard_pitch() const { - return STREAM_STANDARD_PITCH; -} - -void SpeechToText::set_stream_standard_pitch(float p_stream_standard_pitch) { - STREAM_STANDARD_PITCH = p_stream_standard_pitch; -} - -bool SpeechToText::get_debug() const { - return DEBUG; -} - -void SpeechToText::set_debug(bool val) { - DEBUG = val; -} - -bool SpeechToText::get_use_sample_stretching() const { - return use_sample_stretching; -} - -void SpeechToText::set_use_sample_stretching(bool val) { - use_sample_stretching = val; -} - -PackedFloat32Array SpeechToText::get_uncompressed_audio() const { - return uncompressed_audio; -} - -void SpeechToText::set_uncompressed_audio(PackedFloat32Array val) { - uncompressed_audio = val; -} - -int SpeechToText::get_packets_received_this_frame() const { - return packets_received_this_frame; -} - -void SpeechToText::set_packets_received_this_frame(int val) { - packets_received_this_frame = val; -} - -int SpeechToText::get_playback_ring_buffer_length() const { - return playback_ring_buffer_length; -} - -void SpeechToText::set_playback_ring_buffer_length(int val) { - playback_ring_buffer_length = val; -} - -PackedVector2Array SpeechToText::get_blank_packet() const { - return blank_packet; -} - -void SpeechToText::set_blank_packet(PackedVector2Array val) { - blank_packet = val; -} - -Dictionary SpeechToText::get_player_audio() { - return player_audio; -} - -void SpeechToText::set_player_audio(Dictionary val) { - player_audio = val; -} - -int SpeechToText::nearest_shift(int p_number) { - for (int32_t i = 30; i-- > 0;) { - if (p_number & (1 << i)) { - return i + 1; - } - } - return 0; -} - -int SpeechToText::calc_playback_ring_buffer_length(Ref audio_stream_generator) { - int target_buffer_size = int(audio_stream_generator->get_mix_rate() * audio_stream_generator->get_buffer_length()); - return (1 << nearest_shift(target_buffer_size)); -} - -void SpeechToText::_bind_methods() { - ClassDB::bind_method(D_METHOD("get_speech_processor"), - &SpeechToText::get_speech_processor); - ClassDB::bind_method(D_METHOD("get_skipped_audio_packets"), - &SpeechToText::get_skipped_audio_packets); - ClassDB::bind_method(D_METHOD("clear_skipped_audio_packets"), - &SpeechToText::clear_skipped_audio_packets); - ClassDB::bind_method(D_METHOD("copy_and_clear_buffers"), - &SpeechToText::copy_and_clear_buffers); - - ClassDB::bind_method(D_METHOD("start_recording"), &SpeechToText::start_recording); - ClassDB::bind_method(D_METHOD("end_recording"), &SpeechToText::end_recording); - - ClassDB::bind_method(D_METHOD("set_streaming_bus", "bus"), - &SpeechToText::set_streaming_bus); - ClassDB::bind_method(D_METHOD("set_audio_input_stream_player", "player"), - &SpeechToText::set_audio_input_stream_player); - ClassDB::bind_method(D_METHOD("set_buffer_delay_threshold", "buffer_delay_threshold"), - &SpeechToText::set_buffer_delay_threshold); - ClassDB::bind_method(D_METHOD("get_buffer_delay_threshold"), - &SpeechToText::get_buffer_delay_threshold); - ClassDB::bind_method(D_METHOD("get_stream_standard_pitch"), - &SpeechToText::get_stream_standard_pitch); - ClassDB::bind_method(D_METHOD("set_stream_standard_pitch", "stream_standard_pitch"), - &SpeechToText::set_stream_standard_pitch); - ClassDB::bind_method(D_METHOD("get_stream_speedup_pitch"), - &SpeechToText::get_stream_standard_pitch); - ClassDB::bind_method(D_METHOD("set_stream_speedup_pitch", "stream_speedup_pitch"), - &SpeechToText::set_stream_standard_pitch); - ClassDB::bind_method(D_METHOD("get_max_jitter_buffer_size"), - &SpeechToText::get_max_jitter_buffer_size); - ClassDB::bind_method(D_METHOD("set_max_jitter_buffer_size", "max_jitter_buffer_size"), - &SpeechToText::set_max_jitter_buffer_size); - ClassDB::bind_method(D_METHOD("get_jitter_buffer_speedup"), - &SpeechToText::get_jitter_buffer_speedup); - ClassDB::bind_method(D_METHOD("set_jitter_buffer_speedup", "jitter_buffer_speedup"), - &SpeechToText::set_jitter_buffer_speedup); - ClassDB::bind_method(D_METHOD("get_jitter_buffer_slowdown"), - &SpeechToText::get_jitter_buffer_slowdown); - ClassDB::bind_method(D_METHOD("set_jitter_buffer_slowdown", "jitter_buffer_slowdown"), - &SpeechToText::set_jitter_buffer_slowdown); - ClassDB::bind_method(D_METHOD("get_debug"), - &SpeechToText::get_debug); - ClassDB::bind_method(D_METHOD("set_debug", "debug"), - &SpeechToText::set_debug); - ClassDB::bind_method(D_METHOD("get_uncompressed_audio"), - &SpeechToText::get_uncompressed_audio); - ClassDB::bind_method(D_METHOD("set_uncompressed_audio", "uncompressed_audio"), - &SpeechToText::set_uncompressed_audio); - ClassDB::bind_method(D_METHOD("get_packets_received_this_frame"), - &SpeechToText::get_packets_received_this_frame); - ClassDB::bind_method(D_METHOD("set_packets_received_this_frame", "packets_received_this_frame"), - &SpeechToText::set_packets_received_this_frame); - ClassDB::bind_method(D_METHOD("get_playback_ring_buffer_length"), - &SpeechToText::get_playback_ring_buffer_length); - ClassDB::bind_method(D_METHOD("set_playback_ring_buffer_length", "playback_ring_buffer_length"), - &SpeechToText::set_playback_ring_buffer_length); - ClassDB::bind_method(D_METHOD("get_blank_packet"), - &SpeechToText::get_blank_packet); - ClassDB::bind_method(D_METHOD("set_blank_packet", "blank_packet"), - &SpeechToText::set_blank_packet); - ClassDB::bind_method(D_METHOD("get_player_audio"), - &SpeechToText::get_player_audio); - ClassDB::bind_method(D_METHOD("set_player_audio", "player_audio"), - &SpeechToText::set_player_audio); - ClassDB::bind_method(D_METHOD("get_use_sample_stretching"), - &SpeechToText::get_use_sample_stretching); - ClassDB::bind_method(D_METHOD("set_use_sample_stretching", "use_sample_stretching"), - &SpeechToText::set_use_sample_stretching); - ClassDB::bind_method(D_METHOD("calc_playback_ring_buffer_length", "generator"), - &SpeechToText::calc_playback_ring_buffer_length); - ClassDB::bind_method(D_METHOD("add_player_audio", "player_id", "audio_stream_player"), - &SpeechToText::add_player_audio); - ClassDB::bind_method(D_METHOD("on_received_audio_packet", "peer_id", "sequence_id", "packet"), - &SpeechToText::on_received_audio_packet); - ClassDB::bind_method(D_METHOD("get_playback_stats", "speech_stat"), - &SpeechToText::get_playback_stats); - ClassDB::bind_method(D_METHOD("remove_player_audio", "player_id"), - &SpeechToText::remove_player_audio); - ClassDB::bind_method(D_METHOD("clear_all_player_audio"), - &SpeechToText::clear_all_player_audio); - ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "BUFFER_DELAY_THRESHOLD"), "set_buffer_delay_threshold", - "get_buffer_delay_threshold"); - ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "STREAM_STANDARD_PITCH"), "set_stream_standard_pitch", - "get_stream_standard_pitch"); - ADD_PROPERTY(PropertyInfo(Variant::INT, "MAX_JITTER_BUFFER_SIZE"), "set_max_jitter_buffer_size", - "get_max_jitter_buffer_size"); - ADD_PROPERTY(PropertyInfo(Variant::INT, "STREAM_SPEEDUP_PITCH"), "set_stream_speedup_pitch", - "get_stream_speedup_pitch"); - ADD_PROPERTY(PropertyInfo(Variant::INT, "JITTER_BUFFER_SLOWDOWN"), "set_jitter_buffer_slowdown", - "get_jitter_buffer_slowdown"); - ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "JITTER_BUFFER_SPEEDUP"), "set_jitter_buffer_speedup", - "get_jitter_buffer_speedup"); - ADD_PROPERTY(PropertyInfo(Variant::BOOL, "DEBUG"), "set_debug", - "get_debug"); - ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_sample_stretching"), "set_use_sample_stretching", - "get_use_sample_stretching"); - ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR2_ARRAY, "uncompressed_audio", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE), "set_uncompressed_audio", - "get_uncompressed_audio"); - ADD_PROPERTY(PropertyInfo(Variant::INT, "packets_received_this_frame"), "set_packets_received_this_frame", - "get_packets_received_this_frame"); - ADD_PROPERTY(PropertyInfo(Variant::INT, "playback_ring_buffer_length"), "set_playback_ring_buffer_length", - "get_playback_ring_buffer_length"); - ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR2_ARRAY, "blank_packet", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE), "set_blank_packet", - "get_blank_packet"); - ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "player_audio", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE), "set_player_audio", - "get_player_audio"); -} - -int SpeechToText::get_skipped_audio_packets() { - return skipped_audio_packets; -} - -void SpeechToText::clear_skipped_audio_packets() { - skipped_audio_packets = 0; -} - -PackedVector2Array SpeechToText::decompress_buffer(void *p_speech_decoder, PackedByteArray p_read_byte_array, const int p_read_size, PackedVector2Array p_write_vec2_array) { - if (p_read_byte_array.size() < p_read_size) { - ERR_PRINT("SpeechDecoder: read byte_array size!"); - return PackedVector2Array(); - } - - if (speech_processor->decompress_buffer_internal( - p_speech_decoder, &p_read_byte_array, p_read_size, - &p_write_vec2_array)) { - return p_write_vec2_array; - } - - return PackedVector2Array(); -} - -Array SpeechToText::copy_and_clear_buffers() { - MutexLock mutex_lock(audio_mutex); - - Array output_array; - output_array.resize(current_input_size); - - for (int i = 0; i < current_input_size; i++) { - Dictionary dict; - - dict["byte_array"] = input_audio_buffer_array[i].compressed_byte_array; - dict["buffer_size"] = input_audio_buffer_array[i].buffer_size; - dict["loudness"] = input_audio_buffer_array[i].loudness; - - output_array[i] = dict; - } - current_input_size = 0; - - return output_array; -} - -bool SpeechToText::start_recording() { - if (speech_processor) { - speech_processor->start(); - skipped_audio_packets = 0; - return true; - } - - return false; -} - -bool SpeechToText::end_recording() { - bool result = true; - if (speech_processor) { - speech_processor->stop(); - } else { - result = false; - } - if (has_method("clear_all_player_audio")) { - call("clear_all_player_audio"); - } - return result; -} - -void SpeechToText::_notification(int p_what) { - switch (p_what) { - case NOTIFICATION_READY: { - setup_connections(); - if (speech_processor) { - add_child(speech_processor, true); - speech_processor->set_owner(get_owner()); - } - uncompressed_audio.resize( - SpeechToTextProcessor::SPEECH_SETTING_BUFFER_FRAME_COUNT); - uncompressed_audio.fill(float()); - set_process_internal(true); - break; - } - case NOTIFICATION_EXIT_TREE: { - if (speech_processor) { - remove_child(speech_processor); - } - break; - } - case NOTIFICATION_POSTINITIALIZE: { - blank_packet.resize(SpeechToTextProcessor::SPEECH_SETTING_BUFFER_FRAME_COUNT); - blank_packet.fill(Vector2()); - for (int32_t i = 0; i < SpeechToTextProcessor::SPEECH_SETTING_BUFFER_FRAME_COUNT; i++) { - blank_packet[i] = Vector2(); - } - break; - } - case NOTIFICATION_INTERNAL_PROCESS: { - Array keys = player_audio.keys(); - for (int32_t i = 0; i < keys.size(); i++) { - Variant key = keys[i]; - if (!player_audio.has(key)) { - continue; - } - Dictionary elem = player_audio[key]; - if (!elem.has("audio_stream_player")) { - continue; - } - if (!elem.has("jitter_buffer")) { - continue; - } - Array jitter_buffer = elem["jitter_buffer"]; - if (!elem.has("playback_stats")) { - continue; - } - Ref playback_stats = elem["playback_stats"]; - Dictionary dict = player_audio[key]; - dict["packets_received_this_frame"] = 0; - player_audio[key] = dict; - } - packets_received_this_frame = 0; - break; - } - default: { - break; - } - } -} - -void SpeechToText::set_streaming_bus(const String &p_name) { - if (speech_processor) { - speech_processor->set_streaming_bus(p_name); - } -} - -bool SpeechToText::set_audio_input_stream_player(Node *p_audio_stream) { - AudioStreamPlayer *player = cast_to(p_audio_stream); - ERR_FAIL_NULL_V(player, false); - if (!speech_processor) { - return false; - } - speech_processor->set_audio_input_stream_player(player); - return true; -} - -SpeechToText::SpeechToText() { - speech_processor = memnew(SpeechToTextProcessor); - preallocate_buffers(); - - params.n_threads = MIN(4, (int32_t)std::thread::hardware_concurrency()); - params.step_ms = 3000; - params.keep_ms = 200; - params.capture_id = -1; - params.max_tokens = 32; - params.audio_ctx = 0; - params.vad_thold = 0.6f; - params.freq_thold = 100.0f; - params.speed_up = false; - params.translate = false; - params.no_fallback = false; - params.print_special = false; - params.no_context = true; - params.no_timestamps = false; - params.language = "en"; - params.model = "models/ggml-base.en.bin"; - - context_instance = whisper_init_from_file_with_params(params.model.c_str(), context_parameters); -} - -SpeechToText::~SpeechToText() { - memdelete(speech_processor); -} - -void SpeechToText::add_player_audio(int p_player_id, Node *p_audio_stream_player) { - if (!cast_to(p_audio_stream_player) && !cast_to(p_audio_stream_player) && !cast_to(p_audio_stream_player)) { - return; - } - if (player_audio.has(p_player_id)) { - ERR_PRINT(vformat("Attempted to duplicate player_audio entry (%s)!", p_player_id)); - } - Ref new_generator; - new_generator.instantiate(); - new_generator->set_mix_rate(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - new_generator->set_buffer_length(BUFFER_DELAY_THRESHOLD); - playback_ring_buffer_length = calc_playback_ring_buffer_length(new_generator); - p_audio_stream_player->call("set_stream", new_generator); - p_audio_stream_player->call("set_bus", "VoiceOutput"); - p_audio_stream_player->call("set_autoplay", true); - p_audio_stream_player->call("play"); - Ref pstats = memnew(SpeechToTextPlaybackStats); - pstats->playback_ring_buffer_length = playback_ring_buffer_length; - pstats->buffer_frame_count = SpeechToTextProcessor::SPEECH_SETTING_BUFFER_FRAME_COUNT; - Dictionary dict; - dict["playback_last_skips"] = 0; - dict["audio_stream_player"] = p_audio_stream_player; - dict["jitter_buffer"] = Array(); - dict["sequence_id"] = -1; - dict["last_update"] = Time::get_singleton()->get_ticks_msec(); - dict["packets_received_this_frame"] = 0; - dict["excess_packets"] = 0; - dict["playback_stats"] = pstats; - dict["playback_start_time"] = 0; - dict["playback_prev_time"] = -1; - player_audio[p_player_id] = dict; -} - -void SpeechToText::vc_debug_print(String p_str) const { - if (!DEBUG) { - return; - } - UtilityFunctions::print(p_str); -} - -void SpeechToText::vc_debug_printerr(String p_str) const { - if (!DEBUG) { - return; - } - ERR_PRINT(p_str); -} - -void SpeechToText::on_received_audio_packet(int p_peer_id, int p_sequence_id, PackedByteArray p_packet) { - vc_debug_print( - vformat("Received_audio_packet: peer_id: {%s} sequence_id: {%s}", itos(p_peer_id), itos(p_sequence_id))); - if (!player_audio.has(p_peer_id)) { - return; - } - Dictionary elem = player_audio[p_peer_id]; - // Detects if no audio packets have been received from this player yet. - if (int64_t(elem["sequence_id"]) == -1) { - elem["sequence_id"] = p_sequence_id - 1; - } - - elem["packets_received_this_frame"] = int64_t(elem["packets_received_this_frame"]) + 1; - packets_received_this_frame += 1; - int64_t current_sequence_id = elem["sequence_id"]; - Array jitter_buffer = elem["jitter_buffer"]; - int64_t sequence_id_offset = p_sequence_id - current_sequence_id; - if (sequence_id_offset > 0) { - // For skipped buffers, add empty packets. - int64_t skipped_packets = sequence_id_offset - 1; - if (skipped_packets) { - Variant fill_packets; - // If using stretching, fill with last received packet. - if (use_sample_stretching && jitter_buffer.size() > 0) { - Dictionary new_jitter_buffer = jitter_buffer.back(); - fill_packets = new_jitter_buffer["packet"]; - } - for (int32_t _i = 0; _i < skipped_packets; _i++) { - Dictionary dict; - dict["packet"] = fill_packets; - dict["valid"] = false; - jitter_buffer.push_back(dict); - } - } - { - // Add the new valid buffer. - Dictionary dict; - dict["packet"] = p_packet; - dict["valid"] = true; - jitter_buffer.push_back(dict); - } - int64_t excess_packet_count = jitter_buffer.size() - MAX_JITTER_BUFFER_SIZE; - if (excess_packet_count > 0) { - for (int32_t _i = 0; _i < excess_packet_count; _i++) { - elem["excess_packets"] = (int64_t)elem["excess_packets"] + 1; - jitter_buffer.pop_front(); - } - } - elem["sequence_id"] = int64_t(elem["sequence_id"]) + sequence_id_offset; - } else { - int64_t sequence_id = jitter_buffer.size() - 1 + sequence_id_offset; - vc_debug_print(vformat("Updating existing sequence_id: %s", itos(sequence_id))); - if (sequence_id >= 0) { - // Update the existing buffer. - if (use_sample_stretching) { - int32_t jitter_buffer_size = jitter_buffer.size(); - for (int32_t i = sequence_id; i < jitter_buffer_size - 1; i++) { - Dictionary buffer = jitter_buffer[i]; - if (buffer["valid"]) { - break; - } - Dictionary dict; - dict["packet"] = p_packet; - dict["valid"] = false; - jitter_buffer[i] = dict; - } - } - Dictionary dict; - dict["packet"] = p_packet; - dict["valid"] = true; - jitter_buffer[sequence_id] = dict; - } else { - vc_debug_printerr("Invalid repair sequence_id."); - } - } - elem["jitter_buffer"] = jitter_buffer; - player_audio[p_peer_id] = elem; -} - -Dictionary SpeechToText::get_playback_stats(Dictionary speech_stat_dict) { - Dictionary stat_dict = speech_stat_dict.duplicate(true); - stat_dict["capture_get_percent"] = 0; - stat_dict["capture_discard_percent"] = 0; - if (double(stat_dict["capture_pushed_s"]) > 0) { - stat_dict["capture_get_percent"] = 100.0 * double(stat_dict["capture_get_s"]) / double(stat_dict["capture_pushed_s"]); - stat_dict["capture_discard_percent"] = 100.0 * double(stat_dict["capture_discarded_s"]) / double(stat_dict["capture_pushed_s"]); - } - - Array keys = player_audio.keys(); - for (int32_t key_i = 0; key_i < keys.size(); key_i++) { - Variant key = keys[key_i]; - Dictionary elem = player_audio[key]; - Ref playback_stats = elem["playback_stats"]; - if (playback_stats.is_null()) { - continue; - } - Dictionary stats = playback_stats->get_playback_stats(); - stats["playback_total_time"] = (Time::get_singleton()->get_ticks_msec() - int64_t(elem["playback_start_time"])) / double(SpeechToTextProcessor::SPEECH_SETTING_MILLISECONDS_PER_SECOND); - stats["excess_packets"] = elem["excess_packets"]; - stats["excess_s"] = int64_t(elem["excess_packets"]) * SpeechToTextProcessor::SPEECH_SETTING_PACKET_DELTA_TIME; - stat_dict[key] = stats; - } - return stat_dict; -} - -void SpeechToText::remove_player_audio(int p_player_id) { - if (player_audio.has(p_player_id)) { - if (player_audio.erase(p_player_id)) { - return; - } - } - ERR_PRINT(vformat("Attempted to remove a non-existant player_audio entry (%s)", p_player_id)); -} - -void SpeechToText::clear_all_player_audio() { - Array keys = player_audio.keys(); - for (int32_t i = 0; i < keys.size(); i++) { - Variant key = keys[i]; - Variant element = player_audio[key]; - if (element.get_type() != Variant::DICTIONARY) { - continue; - } - Dictionary elem = element; - if (!elem.has("audio_stream_player")) { - continue; - } - Dictionary dict = player_audio[key]; - Node *node = cast_to(dict["audio_stream_player"]); - if (!node) { - continue; - } - node->queue_free(); - } - - player_audio = Dictionary(); -} - -Dictionary SpeechToTextPlaybackStats::get_playback_stats() { - double playback_pushed_frames = playback_pushed_calls * (buffer_frame_count * 1.0); - double playback_discarded_frames = playback_discarded_calls * (buffer_frame_count * 1.0); - Dictionary dict; - dict["playback_ring_limit_s"] = playback_ring_buffer_length / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_ring_current_size_s"] = playback_ring_current_size / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_ring_max_size_s"] = playback_ring_max_size / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_ring_mean_size_s"] = 0; - if (playback_push_buffer_calls > 0) { - dict["playback_ring_mean_size_s"] = playback_ring_size_sum / playback_push_buffer_calls / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - } else { - dict["playback_ring_mean_size_s"] = 0; - } - dict["jitter_buffer_current_size_s"] = float(jitter_buffer_current_size) * SpeechToTextProcessor::SPEECH_SETTING_PACKET_DELTA_TIME; - dict["jitter_buffer_max_size_s"] = float(jitter_buffer_max_size) * SpeechToTextProcessor::SPEECH_SETTING_PACKET_DELTA_TIME; - dict["jitter_buffer_mean_size_s"] = 0; - if (jitter_buffer_calls > 0) { - dict["jitter_buffer_mean_size_s"] = float(jitter_buffer_size_sum) / jitter_buffer_calls * SpeechToTextProcessor::SPEECH_SETTING_PACKET_DELTA_TIME; - } - dict["jitter_buffer_calls"] = jitter_buffer_calls; - dict["playback_position_s"] = playback_position; - dict["playback_get_percent"] = 0; - dict["playback_discard_percent"] = 0; - if (playback_pushed_frames > 0) { - dict["playback_get_percent"] = 100.0 * playback_get_frames / playback_pushed_frames; - dict["playback_discard_percent"] = 100.0 * playback_discarded_frames / playback_pushed_frames; - } - dict["playback_get_s"] = playback_get_frames / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_pushed_s"] = playback_pushed_frames / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_discarded_s"] = playback_discarded_frames / double(SpeechToTextProcessor::SPEECH_SETTING_VOICE_PACKET_SAMPLE_RATE); - dict["playback_push_buffer_calls"] = floor(playback_push_buffer_calls); - dict["playback_blank_s"] = playback_blank_push_calls * SpeechToTextProcessor::SPEECH_SETTING_PACKET_DELTA_TIME; - dict["playback_blank_percent"] = 0; - if (playback_push_buffer_calls > 0) { - dict["playback_blank_percent"] = 100.0 * playback_blank_push_calls / playback_push_buffer_calls; - } - dict["playback_skips"] = floor(playback_skips); - return dict; -} - -void SpeechToTextPlaybackStats::_bind_methods() { - ClassDB::bind_method(D_METHOD("get_playback_stats"), - &SpeechToTextPlaybackStats::get_playback_stats); -} -*/ void SpeechToText::_bind_methods() { ClassDB::bind_method(D_METHOD("transcribe", "buffer"), &SpeechToText::transcribe); BIND_CONSTANT(SPEECH_SETTING_SAMPLE_RATE); diff --git a/src/speech_to_text.h b/src/speech_to_text.h index 9786b333..105bf218 100644 --- a/src/speech_to_text.h +++ b/src/speech_to_text.h @@ -43,10 +43,7 @@ class SpeechToText : public Node { whisper_context *context_instance = nullptr; protected: - static void _bind_methods() { - ClassDB::bind_method(D_METHOD("transcribe", "buffer"), &SpeechToText::transcribe); - BIND_CONSTANT(SPEECH_SETTING_SAMPLE_RATE); - } + static void _bind_methods(); public: enum {