tetherto · sharmaraju352 · Apr 15, 2026 · Apr 15, 2026
@@ -0,0 +1,28 @@
+diff --git a/src/whisper.cpp b/src/whisper.cpp
+--- a/src/whisper.cpp
++++ b/src/whisper.cpp
+@@ -633,6 +633,7 @@
+     int32_t n_mels        = 80;
+     int32_t ftype         = 1;
+     float   eps           = 1e-5f;
++    int32_t n_audio_conv1_kernel = 3; // first dimension of the encoder conv1 weight tensor; the default 3 equals the value that was previously hard-coded there
+ };
+
+ // audio encoding layer
+@@ -1535,6 +1536,7 @@
+         read_safe(loader, hparams.n_text_layer);
+         read_safe(loader, hparams.n_mels);
+         read_safe(loader, hparams.ftype);
++        read_safe(loader, hparams.n_audio_conv1_kernel); // NOTE(review): read unconditionally right after ftype; presumably a model file written without this extra field will be mis-parsed from here on - confirm the converter always emits it
+
+         assert(hparams.n_text_state == hparams.n_audio_state);
+
+@@ -1775,7 +1777,7 @@
+         // encoder
+         model.e_pe = create_tensor(ASR_TENSOR_ENC_POS_EMBD, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx));
+
+-        model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state));
++        model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, hparams.n_audio_conv1_kernel, n_mels, n_audio_state)); // first dimension now comes from hparams instead of the literal 3
+         model.e_conv_1_b = create_tensor(ASR_TENSOR_CONV1_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
+
+         model.e_conv_2_w = create_tensor(ASR_TENSOR_CONV2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state));
@@ -0,0 +1,97 @@
+diff --git a/src/whisper.cpp b/src/whisper.cpp
+--- a/src/whisper.cpp
++++ b/src/whisper.cpp
+@@ -633,6 +633,8 @@
+     int32_t ftype         = 1;
+     float   eps           = 1e-5f;
+     int32_t n_audio_conv1_kernel = 3;
++    int32_t n_audio_window_size  = 0; // encoder window mask is only built when this is > 0; the mask-fill code uses n_audio_window_size / 2 as the allowed |i - j| distance
++    int32_t n_audio_last_window_layer = -1; // highest encoder layer index (inclusive) that receives the window mask; a negative value disables the mask
+ };
+
+ // audio encoding layer
+@@ -1536,6 +1538,8 @@
+         read_safe(loader, hparams.n_mels);
+         read_safe(loader, hparams.ftype);
+         read_safe(loader, hparams.n_audio_conv1_kernel);
++        read_safe(loader, hparams.n_audio_window_size); // NOTE(review): read unconditionally after n_audio_conv1_kernel; presumably requires model files that carry these extra header fields - confirm
++        read_safe(loader, hparams.n_audio_last_window_layer); // read immediately after n_audio_window_size, in that order
+
+         assert(hparams.n_text_state == hparams.n_audio_state);
+
+@@ -2114,6 +2118,15 @@
+
+     struct ggml_tensor * inpL = cur;
+
++    struct ggml_tensor * window_mask = nullptr; // stays nullptr unless both window hparams enable it below
++    const int window_size = hparams.n_audio_window_size;
++    const int last_window_layer = hparams.n_audio_last_window_layer;
++    if (window_size > 0 && last_window_layer >= 0) { // both a positive window size and a non-negative last layer are required
++        window_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_ctx, 1); // F32 tensor of n_ctx x n_ctx x 1; only allocated here, not filled
++        ggml_set_name(window_mask, "window_mask"); // the name is what the pre-compute code passes to ggml_graph_get_tensor to find this tensor
++        ggml_set_input(window_mask); // flagged as a graph input; its data is written separately before the graph is computed
++    }
++
+     for (int il = 0; il < n_layer; ++il) {
+         const auto & layer = model.layers_encoder[il];
+
+@@ -2177,7 +2190,8 @@
+                             ggml_element_size(kv_pad.v)*n_state_head,
+                             0);
+
+-                cur = ggml_flash_attn_ext(ctx0, Q, K, V, nullptr, KQscale, 0.0f, 0.0f);
++                struct ggml_tensor * attn_mask_fa = (window_mask && il <= last_window_layer) ? window_mask : nullptr; // layers with il > last_window_layer, or a disabled mask, pass nullptr as before
++                cur = ggml_flash_attn_ext(ctx0, Q, K, V, attn_mask_fa, KQscale, 0.0f, 0.0f); // NOTE(review): window_mask is allocated as GGML_TYPE_F32 with shape n_ctx x n_ctx while V here is a view into kv_pad - confirm ggml_flash_attn_ext accepts this mask type and shape
+
+                 cur = ggml_reshape_2d(ctx0, cur, n_state, n_ctx);
+             } else {
+@@ -2191,7 +2205,8 @@
+                 // K * Q
+                 struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+-                struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f);
++                struct ggml_tensor * enc_attn_mask = (window_mask && il <= last_window_layer) ? window_mask : nullptr; // same per-layer selection as the flash-attention branch
++                struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, enc_attn_mask, KQscale, 0.0f); // mask argument was nullptr before; it still is for layers past last_window_layer
+
+                 struct ggml_tensor * V =
+                     ggml_cast(ctx0,
+@@ -2442,6 +2457,25 @@
+             return false;
+         }
+
++        { // fill the encoder "window_mask" input, if the graph has one, before the graph is computed
++            struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask"); // looked up by name in gf
++            if (wmask) { // nothing to do when no tensor with that name is found
++                const int n_ctx = wstate.exp_n_audio_ctx > 0 // a positive exp_n_audio_ctx overrides the model's n_audio_ctx
++                    ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
++                const int ws = wctx.model.hparams.n_audio_window_size;
++                const int half_w = ws / 2; // integer division: largest allowed |i - j|
++                std::vector<float> mask_data(n_ctx * n_ctx); // host buffer, flat index i * n_ctx + j
++                for (int i = 0; i < n_ctx; ++i) {
++                    for (int j = 0; j < n_ctx; ++j) {
++                        mask_data[i * n_ctx + j] =
++                            (abs(i - j) <= half_w) ? 0.0f : -INFINITY; // 0 inside the band, -inf outside it
++                    }
++                }
++                ggml_backend_tensor_set(wmask, mask_data.data(), 0, // copy the whole buffer into the tensor starting at offset 0
++                    n_ctx * n_ctx * sizeof(float)); // NOTE(review): assumes wmask was built with this same n_ctx - confirm against the graph builder
++            }
++        }
++
+         if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
+             return false;
+         }
+@@ -6949,7 +6983,12 @@
+         } else {
+             prompt_init.push_back(whisper_token_transcribe(ctx));
+         }
+-    }
++    } else if (ctx->model.hparams.n_audio_window_size > 0) { // NOTE(review): taken only when the preceding if (not visible in this hunk) is false and the model has a positive window size - confirm which models reach this
++        const int lang_id = whisper_lang_id(params.language); // NOTE(review): result is used without a validity check - confirm params.language is always a known language here
++        state->lang_id = lang_id; // record the chosen language on the state
++        prompt_init.push_back(whisper_token_lang(ctx, lang_id)); // language token first
++        prompt_init.push_back(whisper_token_transcribe(ctx)); // then the transcribe token; this branch never pushes a translate token
++    }
+
+     // first release distilled models require the "no_timestamps" token
+     {
@@ -2,13 +2,15 @@ set(VERSION "a8d002cfd879315632a579e73f0148d06959de36")
 
 vcpkg_from_github(
   OUT_SOURCE_PATH SOURCE_PATH
-  REPO tetherto/qvac-ext-lib-whisper.cpp
+  REPO ggml-org/whisper.cpp # NOTE(review): REF and SHA512 below are unchanged from the previous REPO - confirm the commit exists in this repository and the archive hash still matches
   REF ${VERSION}
   SHA512 aea24debb836131d14d362ff78c6d12cfe2e82188340e69e71e6874a1fa51fa9405f2c03fe43888b1ff4183f4288bf64f07dd1106224b0108c3e0f844989a409
   HEAD_REF master
   PATCHES
     0001-fix-vcpkg-build.patch
     0002-fix-apple-silicon-cross-compile.patch
+    0003-bci-variable-conv1-kernel.patch
+    0004-bci-windowed-attention.patch
 )
 
 set(PLATFORM_OPTIONS)

@@ -8,11 +8,6 @@
     },
     "whisper-cpp",
     "gtest"
-  ],
-  "overrides": [
-    {
-      "name": "whisper-cpp",
-      "version": "1.7.5.1"
-    }
   ]
 }
+