From 775d4364385f6858c603042c2eaba1e9839f0319 Mon Sep 17 00:00:00 2001 From: Raju Date: Thu, 16 Apr 2026 15:51:21 +0530 Subject: [PATCH 01/10] feat(bci): add variable conv1 kernel size support Read n_audio_conv1_kernel from model hparams to allow BCI models to use a non-standard first convolution kernel size. Standard whisper models default to kernel size 3. Made-with: Cursor --- src/whisper.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index c4f912a9a4d..47cb1034f1f 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -600,6 +600,7 @@ struct whisper_hparams { int32_t n_mels = 80; int32_t ftype = 1; float eps = 1e-5f; + int32_t n_audio_conv1_kernel = 3; }; // audio encoding layer @@ -1517,6 +1518,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con read_safe(loader, hparams.n_text_layer); read_safe(loader, hparams.n_mels); read_safe(loader, hparams.ftype); + read_safe(loader, hparams.n_audio_conv1_kernel); assert(hparams.n_text_state == hparams.n_audio_state); @@ -1757,7 +1759,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder model.e_pe = create_tensor(ASR_TENSOR_ENC_POS_EMBD, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx)); - model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state)); + model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, hparams.n_audio_conv1_kernel, n_mels, n_audio_state)); model.e_conv_1_b = create_tensor(ASR_TENSOR_CONV1_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state)); model.e_conv_2_w = create_tensor(ASR_TENSOR_CONV2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state)); From a52f95384af86bde2d6a734551d0a0fe76e33ab0 Mon Sep 17 00:00:00 2001 From: Raju Date: 
Thu, 16 Apr 2026 19:57:47 +0530 Subject: [PATCH 02/10] feat(bci): add windowed self-attention for encoder layers - Add n_audio_window_size and n_audio_last_window_layer hparams - When present, encoder self-attention is restricted to a local window for layers up to last_window_layer - Bypass flash attention when windowed mask is active (Metal FA does not support custom F32 masks); flash attention remains enabled for non-BCI models and for the decoder - Populate window_mask data on the encoder graph (not the cross graph) - Add proper SOS token (language + transcribe) initialization for BCI models Backward-compatible: n_audio_window_size defaults to 0 and n_audio_last_window_layer defaults to -1, disabling windowed attention entirely for standard whisper models. Made-with: Cursor --- src/whisper.cpp | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 47cb1034f1f..c5c39f075df 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -601,6 +601,8 @@ struct whisper_hparams { int32_t ftype = 1; float eps = 1e-5f; int32_t n_audio_conv1_kernel = 3; + int32_t n_audio_window_size = 0; + int32_t n_audio_last_window_layer = -1; }; // audio encoding layer @@ -1519,6 +1521,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con read_safe(loader, hparams.n_mels); read_safe(loader, hparams.ftype); read_safe(loader, hparams.n_audio_conv1_kernel); + read_safe(loader, hparams.n_audio_window_size); + read_safe(loader, hparams.n_audio_last_window_layer); assert(hparams.n_text_state == hparams.n_audio_state); @@ -2097,6 +2101,15 @@ static struct ggml_cgraph * whisper_build_graph_encoder( struct ggml_tensor * inpL = cur; + struct ggml_tensor * window_mask = nullptr; + const int window_size = hparams.n_audio_window_size; + const int last_window_layer = hparams.n_audio_last_window_layer; + if (window_size > 0 && last_window_layer >= 0) { + window_mask = 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_ctx, 1); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + } + for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers_encoder[il]; @@ -2140,7 +2153,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( ggml_reshape_3d(ctx0, Qcur, n_state_head, n_head, n_ctx), 0, 2, 1, 3); - if (wctx.params.flash_attn) { + if (wctx.params.flash_attn && !window_mask) { ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, ggml_view_1d(ctx0, kv_pad.k, n_ctx*n_state, 0))); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, ggml_view_1d(ctx0, kv_pad.v, n_ctx*n_state, 0))); @@ -2172,7 +2185,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder( // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f); + struct ggml_tensor * enc_attn_mask = (window_mask && il <= last_window_layer) ? window_mask : nullptr; + struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, enc_attn_mask, KQscale, 0.0f); struct ggml_tensor * V = ggml_cast(ctx0, @@ -2430,6 +2444,25 @@ static bool whisper_encode_internal( return false; } + { + struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask"); + if (wmask) { + const int n_ctx = wstate.exp_n_audio_ctx > 0 + ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx; + const int ws = wctx.model.hparams.n_audio_window_size; + const int half_w = ws / 2; + std::vector mask_data(n_ctx * n_ctx); + for (int i = 0; i < n_ctx; ++i) { + for (int j = 0; j < n_ctx; ++j) { + mask_data[i * n_ctx + j] = + (abs(i - j) <= half_w) ? 
0.0f : -INFINITY; + } + } + ggml_backend_tensor_set(wmask, mask_data.data(), 0, + n_ctx * n_ctx * sizeof(float)); + } + } + if (!ggml_graph_compute_helper(sched, gf, n_threads)) { return false; } @@ -6960,6 +6993,11 @@ int whisper_full_with_state( } else { prompt_init.push_back(whisper_token_transcribe(ctx)); } + } else if (ctx->model.hparams.n_audio_window_size > 0) { + const int lang_id = whisper_lang_id(params.language); + state->lang_id = lang_id; + prompt_init.push_back(whisper_token_lang(ctx, lang_id)); + prompt_init.push_back(whisper_token_transcribe(ctx)); } // first release distilled models require the "no_timestamps" token From e6fcbaac508783441f0ea6687b36a9b549e8c713 Mon Sep 17 00:00:00 2001 From: Raju Date: Thu, 26 Mar 2026 10:40:02 +0530 Subject: [PATCH 03/10] fix vcpkg build Made-with: Cursor --- CMakeLists.txt | 45 +++++++++++++++++++-------- cmake/git-vars.cmake | 50 +++++++++++++++++++----------- cmake/whisper-config.cmake.in | 58 ++++++++++++++++++++++------------- ggml/CMakeLists.txt | 17 +++++++--- ggml/src/CMakeLists.txt | 2 +- src/CMakeLists.txt | 6 +++- 6 files changed, 120 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0f74041321..a9cc9b5887e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,10 +23,18 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(WHISPER_STANDALONE ON) - include(git-vars) + find_package(Git QUIET) + if(GIT_FOUND) + include(git-vars) + else() + set(GIT_SHA1 "unknown") + set(GIT_DATE "unknown") + set(GIT_COMMIT_SUBJECT "unknown") + endif() - # configure project version - configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY) + if(EXISTS ${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json) + configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY) + 
endif() else() set(WHISPER_STANDALONE OFF) endif() @@ -172,27 +180,38 @@ set(WHISPER_BUILD_NUMBER ${BUILD_NUMBER}) set(WHISPER_BUILD_COMMIT ${BUILD_COMMIT}) set(WHISPER_INSTALL_VERSION ${CMAKE_PROJECT_VERSION}) -set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}/whisper CACHE PATH "Location of header files") set(WHISPER_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS) set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h) -install(TARGETS whisper LIBRARY PUBLIC_HEADER) + +install( + TARGETS whisper + EXPORT whisper-targets + PUBLIC_HEADER + DESTINATION ${WHISPER_INCLUDE_INSTALL_DIR}) + +install( + EXPORT whisper-targets + FILE whisper-targets.cmake + NAMESPACE whisper:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/whisper) + +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/whisper) target_compile_definitions(whisper PRIVATE WHISPER_VERSION="${PROJECT_VERSION}" ) configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper - PATH_VARS - WHISPER_INCLUDE_INSTALL_DIR - WHISPER_LIB_INSTALL_DIR - WHISPER_BIN_INSTALL_DIR ) + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/whisper) write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake @@ -201,7 +220,7 @@ write_basic_package_version_file( install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake 
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper) + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/whisper) configure_file(cmake/whisper.pc.in "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc" diff --git a/cmake/git-vars.cmake b/cmake/git-vars.cmake index 1a4c24ebf6a..8dc51859dc3 100644 --- a/cmake/git-vars.cmake +++ b/cmake/git-vars.cmake @@ -1,22 +1,36 @@ find_package(Git) -# the commit's SHA1 -execute_process(COMMAND - "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_SHA1 - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(GIT_FOUND) + execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_SHA1 + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_SHA1_RESULT) -# the date of the commit -execute_process(COMMAND - "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_DATE - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_DATE_RESULT) -# the subject of the commit -execute_process(COMMAND - "${GIT_EXECUTABLE}" log -1 --format=%s - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_COMMIT_SUBJECT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_COMMIT_SUBJECT_RESULT) +endif() + +if(NOT GIT_FOUND OR GIT_SHA1_RESULT OR NOT GIT_SHA1) + set(GIT_SHA1 "unknown") +endif() + +if(NOT GIT_FOUND OR GIT_DATE_RESULT OR NOT GIT_DATE) + set(GIT_DATE 
"unknown") +endif() + +if(NOT GIT_FOUND OR GIT_COMMIT_SUBJECT_RESULT OR NOT GIT_COMMIT_SUBJECT) + set(GIT_COMMIT_SUBJECT "unknown") +endif() diff --git a/cmake/whisper-config.cmake.in b/cmake/whisper-config.cmake.in index b70c1e5af44..678ff9f0cb8 100644 --- a/cmake/whisper-config.cmake.in +++ b/cmake/whisper-config.cmake.in @@ -3,28 +3,44 @@ set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@) set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@) set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@) +set(GGML_METAL @GGML_METAL@) +set(GGML_BLAS @GGML_BLAS@) +set(GGML_CUDA @GGML_CUDA@) +set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_ACCELERATE @GGML_ACCELERATE@) + @PACKAGE_INIT@ -set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@") -set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@") -set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@") - -find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake) - -find_library(whisper_LIBRARY whisper - REQUIRED - HINTS ${WHISPER_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH -) - -add_library(whisper UNKNOWN IMPORTED) -set_target_properties(whisper - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;" - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${whisper_LIBRARY}" - INTERFACE_COMPILE_FEATURES cxx_std_11 - POSITION_INDEPENDENT_CODE ON ) +include(CMakeFindDependencyMacro) + +find_dependency(Threads REQUIRED) + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) +endif() + +if (GGML_BLAS) + find_dependency(BLAS REQUIRED) +endif() + +if (GGML_CUDA) + find_dependency(CUDAToolkit REQUIRED) +endif() + +if (GGML_METAL) + find_library(FOUNDATION_FRAMEWORK Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) +endif() + +if (GGML_HIPBLAS) + find_dependency(hip REQUIRED) + find_dependency(hipblas REQUIRED) + find_dependency(rocblas 
REQUIRED) +endif() + +find_dependency(ggml CONFIG REQUIRED) + +include("${CMAKE_CURRENT_LIST_DIR}/whisper-targets.cmake") check_required_components(whisper) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index c780077acaa..137d7b605be 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -336,8 +336,17 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") #if (GGML_METAL) # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") #endif() -install(TARGETS ggml LIBRARY PUBLIC_HEADER) -install(TARGETS ggml-base LIBRARY) +install( + TARGETS ggml ggml-base + EXPORT ggml-targets + PUBLIC_HEADER + DESTINATION ${GGML_INCLUDE_INSTALL_DIR}) + +install( + EXPORT ggml-targets + FILE ggml-targets.cmake + NAMESPACE ggml:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/ggml) if (GGML_STANDALONE) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in @@ -388,7 +397,7 @@ set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of configure_package_config_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/ggml PATH_VARS GGML_INCLUDE_INSTALL_DIR GGML_LIB_INSTALL_DIR GGML_BIN_INSTALL_DIR) @@ -407,7 +416,7 @@ message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/ggml) if (MSVC) set(MSVC_WARNING_FLAGS diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 78853304d9f..ddd7f5747ed 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -259,7 +259,7 @@ function(ggml_add_backend_library backend) else() add_library(${backend} ${ARGN}) target_link_libraries(ggml PUBLIC ${backend}) - install(TARGETS ${backend} 
LIBRARY) + install(TARGETS ${backend} EXPORT ggml-targets) endif() target_link_libraries(${backend} PRIVATE ggml-base) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 095a2791de5..f360411d704 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -115,7 +115,11 @@ set_target_properties(whisper PROPERTIES SOVERSION ${SOVERSION} ) -target_include_directories(whisper PUBLIC . ../include) +target_include_directories( + whisper + PUBLIC + $ + $) target_compile_features (whisper PUBLIC cxx_std_11) # don't bump if (CMAKE_CXX_BYTE_ORDER STREQUAL "BIG_ENDIAN") From 461f07d30e9189ce7d734fda6e32ceddebb5cc80 Mon Sep 17 00:00:00 2001 From: Raju Date: Thu, 26 Mar 2026 10:40:59 +0530 Subject: [PATCH 04/10] fix apple silicon cross compile Made-with: Cursor --- ggml/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 137d7b605be..d446138ecb7 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -99,7 +99,9 @@ else() set(GGML_BLAS_VENDOR_DEFAULT "Generic") endif() -if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH}) +if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH} OR + (APPLE AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64" AND + CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")) message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF") set(GGML_NATIVE_DEFAULT OFF) else() From bbb353562f5a57542553bc722ed6d59b573097ef Mon Sep 17 00:00:00 2001 From: Raju Date: Thu, 26 Mar 2026 10:41:11 +0530 Subject: [PATCH 05/10] fix windows pthread Made-with: Cursor --- cmake/whisper-config.cmake.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/whisper-config.cmake.in b/cmake/whisper-config.cmake.in index 678ff9f0cb8..16162504fa3 100644 --- a/cmake/whisper-config.cmake.in +++ b/cmake/whisper-config.cmake.in @@ -13,7 +13,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@) include(CMakeFindDependencyMacro) -find_dependency(Threads REQUIRED) +if(NOT WIN32) + 
find_dependency(Threads REQUIRED) +endif() if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) From aa8d9939df55b8197689f70bd2dfb41012a7c642 Mon Sep 17 00:00:00 2001 From: Raju Date: Sat, 18 Apr 2026 10:46:14 +0530 Subject: [PATCH 06/10] fix(bci): guard BCI hparams behind n_mels check, add is_bci flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback: 1. Guard read_safe for BCI-specific hparams (n_audio_conv1_kernel, n_audio_window_size, n_audio_last_window_layer) behind an n_mels > 256 check. Standard whisper models have n_mels <= 128 and do not contain these fields — reading them unconditionally would corrupt the file position and break model loading. 2. Add explicit is_bci flag to hparams struct, set when BCI fields are detected during loading. 3. Use is_bci flag (instead of n_audio_window_size > 0) to guard the BCI-specific decoder SOS token initialization. 4. Log BCI-specific hparams when a BCI model is detected. 
Made-with: Cursor --- src/whisper.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index c5c39f075df..162b02aeba4 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -603,6 +603,7 @@ struct whisper_hparams { int32_t n_audio_conv1_kernel = 3; int32_t n_audio_window_size = 0; int32_t n_audio_last_window_layer = -1; + bool is_bci = false; }; // audio encoding layer @@ -1520,9 +1521,13 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con read_safe(loader, hparams.n_text_layer); read_safe(loader, hparams.n_mels); read_safe(loader, hparams.ftype); - read_safe(loader, hparams.n_audio_conv1_kernel); - read_safe(loader, hparams.n_audio_window_size); - read_safe(loader, hparams.n_audio_last_window_layer); + + if (hparams.n_mels > 256) { + read_safe(loader, hparams.n_audio_conv1_kernel); + read_safe(loader, hparams.n_audio_window_size); + read_safe(loader, hparams.n_audio_last_window_layer); + hparams.is_bci = true; + } assert(hparams.n_text_state == hparams.n_audio_state); @@ -1577,6 +1582,13 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con WHISPER_LOG_INFO("%s: ftype = %d\n", __func__, model.hparams.ftype); WHISPER_LOG_INFO("%s: qntvr = %d\n", __func__, qntvr); WHISPER_LOG_INFO("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str()); + + if (hparams.is_bci) { + WHISPER_LOG_INFO("%s: is_bci = true\n", __func__); + WHISPER_LOG_INFO("%s: conv1_kernel = %d\n", __func__, hparams.n_audio_conv1_kernel); + WHISPER_LOG_INFO("%s: window_size = %d\n", __func__, hparams.n_audio_window_size); + WHISPER_LOG_INFO("%s: last_win_layer= %d\n", __func__, hparams.n_audio_last_window_layer); + } } // load mel filters @@ -6993,7 +7005,7 @@ int whisper_full_with_state( } else { prompt_init.push_back(whisper_token_transcribe(ctx)); } - } else if (ctx->model.hparams.n_audio_window_size > 0) { + } else if 
(ctx->model.hparams.is_bci) { const int lang_id = whisper_lang_id(params.language); state->lang_id = lang_id; prompt_init.push_back(whisper_token_lang(ctx, lang_id)); From 0f1031ae7c0f798cdcbcef62e9a147edb0146660 Mon Sep 17 00:00:00 2001 From: Raju Date: Sun, 19 Apr 2026 14:11:21 +0530 Subject: [PATCH 07/10] perf(bci): precompute window_mask at init instead of every encode The windowed attention mask values depend only on n_ctx and window_size, both fixed after model load. Move the O(n_ctx^2) computation from whisper_encode_internal (called every encode) to whisper_init_state (called once). The encode path now just copies the precomputed data to the graph tensor. Made-with: Cursor --- src/whisper.cpp | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 162b02aeba4..ca2f67ce070 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -886,6 +886,9 @@ struct whisper_state { std::vector inp_mel; std::vector inp_mask; + // pre-computed BCI windowed attention mask (constant after init) + std::vector window_mask_data; + // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -2458,20 +2461,9 @@ static bool whisper_encode_internal( { struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask"); - if (wmask) { - const int n_ctx = wstate.exp_n_audio_ctx > 0 - ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx; - const int ws = wctx.model.hparams.n_audio_window_size; - const int half_w = ws / 2; - std::vector mask_data(n_ctx * n_ctx); - for (int i = 0; i < n_ctx; ++i) { - for (int j = 0; j < n_ctx; ++j) { - mask_data[i * n_ctx + j] = - (abs(i - j) <= half_w) ? 
0.0f : -INFINITY; - } - } - ggml_backend_tensor_set(wmask, mask_data.data(), 0, - n_ctx * n_ctx * sizeof(float)); + if (wmask && !wstate.window_mask_data.empty()) { + ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, + wstate.window_mask_data.size() * sizeof(float)); } } @@ -3588,6 +3580,20 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_sched_size(state->sched_decode) / 1e6); } + if (ctx->model.hparams.is_bci) { + const auto & hparams = ctx->model.hparams; + const int n_ctx = hparams.n_audio_ctx; + const int half_w = hparams.n_audio_window_size / 2; + + state->window_mask_data.resize(n_ctx * n_ctx); + for (int i = 0; i < n_ctx; ++i) { + for (int j = 0; j < n_ctx; ++j) { + state->window_mask_data[i * n_ctx + j] = + (abs(i - j) <= half_w) ? 0.0f : -INFINITY; + } + } + } + return state; } From c7af5755a2fad7030d720344bcff92d5d5b16c07 Mon Sep 17 00:00:00 2001 From: Raju Date: Mon, 20 Apr 2026 14:52:07 +0530 Subject: [PATCH 08/10] =?UTF-8?q?fix(bci):=20address=20ogad-tether=20revie?= =?UTF-8?q?w=20=E2=80=94=20window=5Fmask=20sizing,=20pkg-config,=20Threads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Fix window_mask_data / exp_n_audio_ctx mismatch: the precomputed mask uses hparams.n_audio_ctx, but the graph tensor is sized from exp_n_audio_ctx when params.audio_ctx is overridden. Now falls back to recomputing the mask at the effective n_ctx when sizes differ, preventing a buffer overflow into the smaller tensor. 2. Update whisper.pc.in: the install interface was changed to include/whisper but the pkg-config includedir still pointed to include/. Consumers using pkg-config would not find whisper.h. 3. 
Fix whisper-config.cmake.in: the whisper target publicly links Threads::Threads but find_dependency(Threads) was skipped on Windows, leaving downstream find_package(whisper) with an unresolved imported target. Now always resolve Threads. --- cmake/whisper-config.cmake.in | 4 +--- cmake/whisper.pc.in | 2 +- src/whisper.cpp | 20 +++++++++++++++++--- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cmake/whisper-config.cmake.in b/cmake/whisper-config.cmake.in index 16162504fa3..678ff9f0cb8 100644 --- a/cmake/whisper-config.cmake.in +++ b/cmake/whisper-config.cmake.in @@ -13,9 +13,7 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@) include(CMakeFindDependencyMacro) -if(NOT WIN32) - find_dependency(Threads REQUIRED) -endif() +find_dependency(Threads REQUIRED) if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) diff --git a/cmake/whisper.pc.in b/cmake/whisper.pc.in index 00ec7912014..73d9f4a0c1e 100644 --- a/cmake/whisper.pc.in +++ b/cmake/whisper.pc.in @@ -1,7 +1,7 @@ prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix} libdir=${exec_prefix}/lib -includedir=${prefix}/include +includedir=${prefix}/include/whisper Name: whisper Description: Port of OpenAI's Whisper model in C/C++ diff --git a/src/whisper.cpp b/src/whisper.cpp index ca2f67ce070..f14e17af9cf 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -2461,9 +2461,23 @@ static bool whisper_encode_internal( { struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask"); - if (wmask && !wstate.window_mask_data.empty()) { - ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, - wstate.window_mask_data.size() * sizeof(float)); + if (wmask) { + const auto & hparams = wctx.model.hparams; + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int half_w = hparams.n_audio_window_size / 2; + + if ((int) wstate.window_mask_data.size() == n_ctx * n_ctx) { + ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, + wstate.window_mask_data.size() * sizeof(float)); + } else { + std::vector mask(n_ctx * n_ctx); + for (int i = 0; i < n_ctx; ++i) { + for (int j = 0; j < n_ctx; ++j) { + mask[i * n_ctx + j] = (abs(i - j) <= half_w) ? 0.0f : -INFINITY; + } + } + ggml_backend_tensor_set(wmask, mask.data(), 0, mask.size() * sizeof(float)); + } } } From 5645ad60cc7b2dde4bc29736a4506301be22e57d Mon Sep 17 00:00:00 2001 From: Raju Date: Mon, 20 Apr 2026 15:03:40 +0530 Subject: [PATCH 09/10] perf(bci): cache window_mask for effective n_ctx, enable per-layer flash attention 1. Cache fallback mask recompute: when exp_n_audio_ctx overrides the default n_audio_ctx, the window mask is now recomputed once and cached in wstate (keyed on window_mask_n_ctx) instead of allocating a new std::vector on every whisper_encode_internal call. 2. Per-layer flash attention: layers above last_window_layer no longer need the windowed attention mask. The flash attention path is now used for those layers even when BCI windowed attention is active, instead of globally falling back to the softmax path for the entire encoder. 3. Use std::abs instead of C abs in both init-time and encode-time mask computation paths. 
--- src/whisper.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index f14e17af9cf..de232c79b80 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -888,6 +888,7 @@ struct whisper_state { // pre-computed BCI windowed attention mask (constant after init) std::vector window_mask_data; + int window_mask_n_ctx = 0; // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -2168,7 +2169,9 @@ static struct ggml_cgraph * whisper_build_graph_encoder( ggml_reshape_3d(ctx0, Qcur, n_state_head, n_head, n_ctx), 0, 2, 1, 3); - if (wctx.params.flash_attn && !window_mask) { + const bool layer_needs_mask = window_mask && il <= last_window_layer; + + if (wctx.params.flash_attn && !layer_needs_mask) { ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, ggml_view_1d(ctx0, kv_pad.k, n_ctx*n_state, 0))); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, ggml_view_1d(ctx0, kv_pad.v, n_ctx*n_state, 0))); @@ -2200,8 +2203,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - struct ggml_tensor * enc_attn_mask = (window_mask && il <= last_window_layer) ? window_mask : nullptr; - struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, enc_attn_mask, KQscale, 0.0f); + struct ggml_tensor * KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, layer_needs_mask ? window_mask : nullptr, KQscale, 0.0f); struct ggml_tensor * V = ggml_cast(ctx0, @@ -2464,20 +2466,21 @@ static bool whisper_encode_internal( if (wmask) { const auto & hparams = wctx.model.hparams; const int n_ctx = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : hparams.n_audio_ctx; - const int half_w = hparams.n_audio_window_size / 2; - if ((int) wstate.window_mask_data.size() == n_ctx * n_ctx) { - ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, - wstate.window_mask_data.size() * sizeof(float)); - } else { - std::vector mask(n_ctx * n_ctx); + if (wstate.window_mask_n_ctx != n_ctx) { + const int half_w = hparams.n_audio_window_size / 2; + wstate.window_mask_data.resize(n_ctx * n_ctx); for (int i = 0; i < n_ctx; ++i) { for (int j = 0; j < n_ctx; ++j) { - mask[i * n_ctx + j] = (abs(i - j) <= half_w) ? 0.0f : -INFINITY; + wstate.window_mask_data[i * n_ctx + j] = + (std::abs(i - j) <= half_w) ? 0.0f : -INFINITY; } } - ggml_backend_tensor_set(wmask, mask.data(), 0, mask.size() * sizeof(float)); + wstate.window_mask_n_ctx = n_ctx; } + + ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, + wstate.window_mask_data.size() * sizeof(float)); } } @@ -3603,9 +3606,10 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { for (int i = 0; i < n_ctx; ++i) { for (int j = 0; j < n_ctx; ++j) { state->window_mask_data[i * n_ctx + j] = - (abs(i - j) <= half_w) ? 0.0f : -INFINITY; + (std::abs(i - j) <= half_w) ? 0.0f : -INFINITY; } } + state->window_mask_n_ctx = n_ctx; } return state; From 3e91e3a4434c9cf3d7d0f27711f2988242bccf11 Mon Sep 17 00:00:00 2001 From: Raju Date: Mon, 20 Apr 2026 16:41:56 +0530 Subject: [PATCH 10/10] =?UTF-8?q?fix(bci):=20address=20jpgaribotti=20revie?= =?UTF-8?q?w=20=E2=80=94=20helper,=20is=5Fbci=20guard,=20hparam=20validati?= =?UTF-8?q?on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Extract compute_window_mask() helper on whisper_state to eliminate the duplicated O(n_ctx^2) mask fill loop that appeared in both whisper_init_state and whisper_encode_internal. Both call sites now use the single helper, preventing future drift. 2. 
Guard the encode-time mask block with hparams.is_bci before doing the ggml_graph_get_tensor lookup. Cheaper and more explicit than relying on the tensor name string to determine whether BCI windowed attention is active. 3. Add hparams.is_bci to the graph builder guard for window_mask tensor creation, aligning it with the other BCI code paths. 4. Add validation for BCI hparams after reading from file: n_audio_conv1_kernel must be > 0, n_audio_window_size must be >= 0. Log an error and return false on invalid values instead of proceeding with garbage. 5. Add comment explaining the n_mels > 256 threshold used to discriminate BCI models from standard whisper models, and noting that a dedicated file-format marker should be introduced if this assumption ever breaks. Made-with: Cursor --- src/whisper.cpp | 52 ++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index de232c79b80..0a9619dcac7 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -890,6 +890,18 @@ struct whisper_state { std::vector window_mask_data; int window_mask_n_ctx = 0; + // (Re)compute the banded attention mask for a given context length. + void compute_window_mask(int n_ctx, int half_w) { + window_mask_data.resize(n_ctx * n_ctx); + for (int i = 0; i < n_ctx; ++i) { + for (int j = 0; j < n_ctx; ++j) { + window_mask_data[i * n_ctx + j] = + (std::abs(i - j) <= half_w) ? 0.0f : -INFINITY; + } + } + window_mask_n_ctx = n_ctx; + } + // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -1526,11 +1538,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con read_safe(loader, hparams.n_mels); read_safe(loader, hparams.ftype); + // BCI models encode three extra hparams after ftype. Standard whisper + // models use n_mels <= 128 (80 or 128); BCI models use 512. The + // threshold of 256 is a safe discriminator today. 
If a future + // non-BCI model exceeds 256 mels, a dedicated file-format marker + // should be introduced instead. if (hparams.n_mels > 256) { read_safe(loader, hparams.n_audio_conv1_kernel); read_safe(loader, hparams.n_audio_window_size); read_safe(loader, hparams.n_audio_last_window_layer); hparams.is_bci = true; + + if (hparams.n_audio_conv1_kernel <= 0) { + WHISPER_LOG_ERROR("%s: invalid n_audio_conv1_kernel: %d\n", __func__, hparams.n_audio_conv1_kernel); + return false; + } + if (hparams.n_audio_window_size < 0) { + WHISPER_LOG_ERROR("%s: invalid n_audio_window_size: %d\n", __func__, hparams.n_audio_window_size); + return false; + } } assert(hparams.n_text_state == hparams.n_audio_state); @@ -2120,7 +2146,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( struct ggml_tensor * window_mask = nullptr; const int window_size = hparams.n_audio_window_size; const int last_window_layer = hparams.n_audio_last_window_layer; - if (window_size > 0 && last_window_layer >= 0) { + if (hparams.is_bci && window_size > 0 && last_window_layer >= 0) { window_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_ctx, 1); ggml_set_name(window_mask, "window_mask"); ggml_set_input(window_mask); @@ -2461,22 +2487,14 @@ static bool whisper_encode_internal( return false; } - { + if (wctx.model.hparams.is_bci) { struct ggml_tensor * wmask = ggml_graph_get_tensor(gf, "window_mask"); if (wmask) { const auto & hparams = wctx.model.hparams; const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; if (wstate.window_mask_n_ctx != n_ctx) { - const int half_w = hparams.n_audio_window_size / 2; - wstate.window_mask_data.resize(n_ctx * n_ctx); - for (int i = 0; i < n_ctx; ++i) { - for (int j = 0; j < n_ctx; ++j) { - wstate.window_mask_data[i * n_ctx + j] = - (std::abs(i - j) <= half_w) ? 
0.0f : -INFINITY; - } - } - wstate.window_mask_n_ctx = n_ctx; + wstate.compute_window_mask(n_ctx, hparams.n_audio_window_size / 2); } ggml_backend_tensor_set(wmask, wstate.window_mask_data.data(), 0, @@ -3599,17 +3617,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { if (ctx->model.hparams.is_bci) { const auto & hparams = ctx->model.hparams; - const int n_ctx = hparams.n_audio_ctx; - const int half_w = hparams.n_audio_window_size / 2; - - state->window_mask_data.resize(n_ctx * n_ctx); - for (int i = 0; i < n_ctx; ++i) { - for (int j = 0; j < n_ctx; ++j) { - state->window_mask_data[i * n_ctx + j] = - (std::abs(i - j) <= half_w) ? 0.0f : -INFINITY; - } - } - state->window_mask_n_ctx = n_ctx; + state->compute_window_mask(hparams.n_audio_ctx, hparams.n_audio_window_size / 2); } return state;