diff --git a/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0003-bci-variable-conv1-kernel.patch b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0003-bci-variable-conv1-kernel.patch
new file mode 100644
index 0000000000..025f8c29c0
--- /dev/null
+++ b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0003-bci-variable-conv1-kernel.patch
@@ -0,0 +1,31 @@
+BCI: make the encoder conv1 kernel width a model hyperparameter.
+Adds hparams.n_audio_conv1_kernel (default 3), read from the model file right after ftype, and uses it as the first dimension of e_conv_1_w.
+
+diff --git a/src/whisper.cpp b/src/whisper.cpp
+--- a/src/whisper.cpp
++++ b/src/whisper.cpp
+@@ -633,6 +633,7 @@
+     int32_t n_mels = 80;
+     int32_t ftype = 1;
+     float eps = 1e-5f;
++    int32_t n_audio_conv1_kernel = 3;
+ };
+
+ // audio encoding layer
+@@ -1535,6 +1536,7 @@
+         read_safe(loader, hparams.n_text_layer);
+         read_safe(loader, hparams.n_mels);
+         read_safe(loader, hparams.ftype);
++        read_safe(loader, hparams.n_audio_conv1_kernel);
+
+         assert(hparams.n_text_state == hparams.n_audio_state);
+
+@@ -1775,7 +1777,7 @@
+         // encoder
+         model.e_pe = create_tensor(ASR_TENSOR_ENC_POS_EMBD, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx));
+
+-        model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state));
++        model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, hparams.n_audio_conv1_kernel, n_mels, n_audio_state));
+         model.e_conv_1_b = create_tensor(ASR_TENSOR_CONV1_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
+
+         model.e_conv_2_w = create_tensor(ASR_TENSOR_CONV2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state));
diff --git a/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0004-bci-windowed-attention.patch b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0004-bci-windowed-attention.patch
new file mode 100644
index 0000000000..9161158071
--- /dev/null
+++ b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/0004-bci-windowed-attention.patch
@@ -0,0 +1,102 @@
+BCI: optional windowed self-attention in the audio encoder.
+Adds hparams n_audio_window_size (default 0) and n_audio_last_window_layer (default -1), read from the model file after n_audio_conv1_kernel.
+When window_size > 0 and last_window_layer >= 0, encoder layers il <= last_window_layer attend through a banded F32 mask: 0 where |i - j| <= window_size / 2, -INFINITY elsewhere.
+NOTE(review): the flash-attention branch passes the same F32 n_ctx x n_ctx mask to ggml_flash_attn_ext; confirm the mask dtype and padding that op expects.
+
+diff --git a/src/whisper.cpp b/src/whisper.cpp
+--- a/src/whisper.cpp
++++ b/src/whisper.cpp
+@@ -633,6 +633,8 @@
+     int32_t ftype = 1;
+     float eps = 1e-5f;
+     int32_t n_audio_conv1_kernel = 3;
++    int32_t n_audio_window_size = 0;
++    int32_t n_audio_last_window_layer = -1;
+ };
+
+ // audio encoding layer
+@@ -1536,6 +1538,8 @@
+         read_safe(loader, hparams.n_mels);
+         read_safe(loader, hparams.ftype);
+         read_safe(loader, hparams.n_audio_conv1_kernel);
++        read_safe(loader, hparams.n_audio_window_size);
++        read_safe(loader, hparams.n_audio_last_window_layer);
+
+         assert(hparams.n_text_state == hparams.n_audio_state);
+
+@@ -2114,6 +2118,15 @@
+
+     struct ggml_tensor * inpL = cur;
+
++    struct ggml_tensor * window_mask = nullptr;
++    const int window_size = hparams.n_audio_window_size;
++    const int last_window_layer = hparams.n_audio_last_window_layer;
++    if (window_size > 0 && last_window_layer >= 0) {
++        window_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_ctx, 1);
++        ggml_set_name(window_mask, "window_mask");
++        ggml_set_input(window_mask);
++    }
++
+     for (int il = 0; il < n_layer; ++il) {
+         const auto & layer = model.layers_encoder[il];
+
+@@ -2177,7 +2190,8 @@
+                         ggml_element_size(kv_pad.v)*n_state_head,
+                         0);
+
+-            cur = ggml_flash_attn_ext(ctx0, Q, K, V, nullptr, KQscale, 0.0f, 0.0f);
++            struct ggml_tensor * attn_mask_fa = (window_mask && il <= last_window_layer) ? window_mask : nullptr;
++            cur = ggml_flash_attn_ext(ctx0, Q, K, V, attn_mask_fa, KQscale, 0.0f, 0.0f);
+
+             cur = ggml_reshape_2d(ctx0, cur, n_state, n_ctx);
+         } else {
+@@ -2191,7 +2205,8 @@
+             // K * Q
+             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f);
++            struct ggml_tensor * enc_attn_mask = (window_mask && il <= last_window_layer) ? window_mask : nullptr;
++            struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, enc_attn_mask, KQscale, 0.0f);
+
+             struct ggml_tensor * V =
+                 ggml_cast(ctx0,
+@@ -2442,6 +2457,25 @@
+             return false;
+         }
+
++        {
++            struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask");
++            if (wmask) {
++                const int n_ctx = wstate.exp_n_audio_ctx > 0
++                    ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
++                const int ws = wctx.model.hparams.n_audio_window_size;
++                const int half_w = ws / 2;
++                std::vector<float> mask_data(n_ctx * n_ctx);
++                for (int i = 0; i < n_ctx; ++i) {
++                    for (int j = 0; j < n_ctx; ++j) {
++                        mask_data[i * n_ctx + j] =
++                            (abs(i - j) <= half_w) ? 0.0f : -INFINITY;
++                    }
++                }
++                ggml_backend_tensor_set(wmask, mask_data.data(), 0,
++                    n_ctx * n_ctx * sizeof(float));
++            }
++        }
++
+         if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
+             return false;
+         }
+@@ -6949,7 +6983,12 @@
+                 } else {
+                     prompt_init.push_back(whisper_token_transcribe(ctx));
+                 }
+-            }
++            } else if (ctx->model.hparams.n_audio_window_size > 0) {
++                const int lang_id = whisper_lang_id(params.language);
++                state->lang_id = lang_id;
++                prompt_init.push_back(whisper_token_lang(ctx, lang_id));
++                prompt_init.push_back(whisper_token_transcribe(ctx));
++            }
+
+             // first release distilled models require the "no_timestamps" token
+             {
diff --git a/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/portfile.cmake b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/portfile.cmake
index 176ade149d..52e171819a 100644
--- a/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/portfile.cmake
+++ b/packages/bci-whispercpp/vcpkg-overlays/whisper-cpp/portfile.cmake
@@ -2,13 +2,15 @@ set(VERSION "a8d002cfd879315632a579e73f0148d06959de36")

 vcpkg_from_github(
   OUT_SOURCE_PATH SOURCE_PATH
-  REPO tetherto/qvac-ext-lib-whisper.cpp
+  REPO ggml-org/whisper.cpp
   REF ${VERSION}
   SHA512 aea24debb836131d14d362ff78c6d12cfe2e82188340e69e71e6874a1fa51fa9405f2c03fe43888b1ff4183f4288bf64f07dd1106224b0108c3e0f844989a409
   HEAD_REF master
   PATCHES
     0001-fix-vcpkg-build.patch
     0002-fix-apple-silicon-cross-compile.patch
+    0003-bci-variable-conv1-kernel.patch
+    0004-bci-windowed-attention.patch
 )

 set(PLATFORM_OPTIONS)
diff --git a/packages/bci-whispercpp/vcpkg.json b/packages/bci-whispercpp/vcpkg.json
index 867b85f130..1aa5713b96 100644
--- a/packages/bci-whispercpp/vcpkg.json
+++ b/packages/bci-whispercpp/vcpkg.json
@@ -8,11 +8,6 @@
     },
     "whisper-cpp",
     "gtest"
-  ],
-  "overrides": [
-    {
-      "name": "whisper-cpp",
-      "version": "1.7.5.1"
-    }
   ]
 }
+