diff --git a/parakeet-cpp/.gitignore b/parakeet-cpp/.gitignore index fa06a7c0cb4..ae20949e54e 100644 --- a/parakeet-cpp/.gitignore +++ b/parakeet-cpp/.gitignore @@ -1,5 +1,6 @@ # ggml is cloned via scripts/setup-ggml.sh at a pinned commit; don't track it. ggml/ +ggml # Python virtualenv for the converter + reference-dump scripts. venv/ diff --git a/parakeet-cpp/CMakeLists.txt b/parakeet-cpp/CMakeLists.txt index b1ae5c25fd2..eac64cc6957 100644 --- a/parakeet-cpp/CMakeLists.txt +++ b/parakeet-cpp/CMakeLists.txt @@ -63,6 +63,53 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android") endif() endif() +# Android default backend stack: dynamic loading of Vulkan + OpenCL + +# per-arch CPU variants. Mirrors the qvac llm-llamacpp Android config +# (see qvac-registry-vcpkg/ports/llama-cpp/portfile.cmake) so the +# parakeet prebuilds drop into the same `qvac__transcription-parakeet/` +# folder shape as the llamacpp ones: a `.bare` module + sibling +# `libggml-{vulkan,opencl,cpu-android_armv*_*}.so` files that +# `ggml_backend_load_all_from_path()` discovers at runtime. +# +# Selection at runtime is centralised in `init_gpu_backend()` +# (src/parakeet_ctc.cpp): OpenCL when an Adreno 700+ device is +# present, Vulkan for every other GPU (non-Adreno, Adreno < 700, +# Mali, Xclipse, ...). No static GPU backend entry points are linked +# anywhere in libparakeet; the registry walk reaches the right +# backend in both GGML_BACKEND_DL=ON (Android prebuild) and +# GGML_BACKEND_DL=OFF (desktop dev) modes. +# +# Callers that have specific reasons to deviate (e.g. a desktop bring- +# up build that wants Vulkan only) can still override any of these +# at the cmake command line; we only set defaults that haven't already +# been provided. 
+if (CMAKE_SYSTEM_NAME STREQUAL "Android") + if (NOT DEFINED CACHE{GGML_BACKEND_DL}) + set(GGML_BACKEND_DL ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_CPU_ALL_VARIANTS}) + set(GGML_CPU_ALL_VARIANTS ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_CPU_REPACK}) + set(GGML_CPU_REPACK ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_VULKAN}) + set(GGML_VULKAN ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_OPENCL}) + set(GGML_OPENCL ON CACHE BOOL "" FORCE) + endif() + # ggml-vulkan's coopmat / coopmat2 shader compile pulls in extensions + # that most Android Vulkan drivers don't expose; the upstream llama + # Android build disables both for the same reason. + if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT}) + set(GGML_VULKAN_DISABLE_COOPMAT ON CACHE BOOL "" FORCE) + endif() + if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT2}) + set(GGML_VULKAN_DISABLE_COOPMAT2 ON CACHE BOOL "" FORCE) + endif() +endif() + # Two related workarounds for clang-cl / MSVC builds on Windows. Both # come from msys2 sneaking GCC-flavoured libraries onto CMake's search # paths and then being mismatched against clang-cl-compiled translation @@ -108,6 +155,30 @@ if (WIN32 AND NOT MINGW) endif() endif() +# Bundled-ggml library filename prefix. qvac-ext-ggml's `speech` branch +# exposes `GGML_LIB_OUTPUT_PREFIX` (commit 4cec2d3a) which handles both +# the OUTPUT_NAME rename for every ggml target (core + per-backend +# .so/.dll/.a) AND the runtime loader's filename prefix +# (`GGML_BACKEND_DL_PROJECT_PREFIX` compile define on ggml-base), so +# the renamed `libspeech-ggml-{vulkan,opencl,cpu-*}.so` files are +# actually discovered by `ggml_backend_load_all_from_path()` at +# runtime. +# +# Setting `GGML_LIB_OUTPUT_PREFIX` here (as a cache variable, before +# `add_subdirectory(ggml)`) is the supported way to override the +# branch default (`qvac-speech-`) on a per-consumer basis without +# editing the ggml subtree. 
The `speech-` prefix is shared across the +# QVAC speech stack (whisper, parakeet, chatterbox, supertonic, ...) +# so they can vendor a single ggml file set side-by-side without +# colliding with the `qvac-` prefix used by the llm fork. +if (PARAKEET_GGML_LIB_PREFIX AND NOT PARAKEET_USE_SYSTEM_GGML) + if (NOT DEFINED CACHE{GGML_LIB_OUTPUT_PREFIX}) + set(GGML_LIB_OUTPUT_PREFIX "speech-" CACHE STRING + "ggml: prefix for built ggml library filenames (parakeet default)" FORCE) + endif() + message(STATUS "parakeet: bundled ggml libraries will be emitted with prefix '${GGML_LIB_OUTPUT_PREFIX}' (set PARAKEET_GGML_LIB_PREFIX=OFF to use the qvac-ext-ggml@speech default, or override -DGGML_LIB_OUTPUT_PREFIX=)") +endif() + if (NOT TARGET ggml) if (PARAKEET_USE_SYSTEM_GGML) find_package(ggml CONFIG REQUIRED) @@ -120,50 +191,6 @@ if (NOT TARGET ggml) endif() endif() -function(parakeet_apply_ggml_prefix target) - if (NOT TARGET ${target}) - return() - endif() - get_target_property(_qpgp_type ${target} TYPE) - if (_qpgp_type STREQUAL "INTERFACE_LIBRARY" OR _qpgp_type STREQUAL "OBJECT_LIBRARY") - return() - endif() - get_target_property(_qpgp_old_name ${target} OUTPUT_NAME) - if (NOT _qpgp_old_name OR _qpgp_old_name STREQUAL "_qpgp_old_name-NOTFOUND") - set(_qpgp_old_name ${target}) - endif() - set_target_properties(${target} PROPERTIES - OUTPUT_NAME "speech-${_qpgp_old_name}" - ) -endfunction() - -if (PARAKEET_GGML_LIB_PREFIX AND NOT PARAKEET_USE_SYSTEM_GGML) - foreach (_qpgp_target ggml ggml-base) - parakeet_apply_ggml_prefix(${_qpgp_target}) - endforeach() - if (DEFINED GGML_AVAILABLE_BACKENDS) - foreach (_qpgp_target ${GGML_AVAILABLE_BACKENDS}) - parakeet_apply_ggml_prefix(${_qpgp_target}) - endforeach() - endif() - # Renaming the bundled backend .so/.dll files alone is not enough: - # ggml's runtime loader (`ggml_backend_load_best`) hard-codes the - # `libggml-` / `ggml-` filename prefix when scanning for backends - # under `GGML_BACKEND_DL=ON`. 
The companion patch - # `patches/ggml-backend-reg-filename-prefix.patch` adds a - # `GGML_BACKEND_DL_PROJECT_PREFIX` macro to that loader; defining - # it here teaches the runtime to look for our prefixed filenames - # instead. Otherwise the renamed .so/.dll files exist on disk but - # are never discovered, and Vulkan/OpenCL/CUDA backends silently - # fail to load. - if (TARGET ggml) - target_compile_definitions(ggml PRIVATE - GGML_BACKEND_DL_PROJECT_PREFIX="speech-" - ) - endif() - message(STATUS "parakeet: bundled ggml libraries will be emitted as libspeech-ggml-* (set PARAKEET_GGML_LIB_PREFIX=OFF to keep upstream filenames)") -endif() - # Same OpenMP avoidance as for ggml above: on Windows non-MinGW builds # CMake's FindOpenMP picks LLVM's `-fopenmp=libomp` compile flag but # resolves OpenMP_*_LIBRARIES to msys2 libgomp -> link-time mismatch. @@ -180,29 +207,25 @@ if (PARAKEET_OPENMP) find_package(OpenMP) endif() -# Centralised GGML_USE_* backend defines. Anything that compiles -# parakeet_ctc.cpp (the library target plus the standalone test -# executables that recompile it) must link against this so the -# `init_gpu_backend` / BLAS / CUDA / Metal / Vulkan code paths get -# selected consistently. Without this, e.g. test-encoder would silently -# build with the GPU branch compiled out and `--n-gpu-layers 1` would -# be a no-op. +# Legacy interface library kept for export-set compatibility (it is +# still part of `install(EXPORT parakeet-cpp-targets)` below and +# downstream `find_package(parakeet-cpp)` consumers list it as a link +# dep). Body intentionally empty: parakeet routes every backend +# decision through the ggml-backend registry +# (`ggml_backend_load_all` + `ggml_backend_dev_*`, see +# `init_gpu_backend()` / `init_cpu_backend()` / `init_blas_backend()` +# in src/parakeet_ctc.cpp) and does NOT call any +# `ggml_backend_<name>_init` / `ggml_backend_is_<name>` entry +# point directly. 
The `GGML_USE_VULKAN` / `GGML_USE_OPENCL` / +# `GGML_USE_METAL` / `GGML_USE_CUDA` / `GGML_USE_BLAS` compile defines +# that used to live here were only consumed by `#ifdef` cascades that +# called those static entry points; with the registry-only design +# they're dead, and shipping them would falsely advertise a static +# backend dependency that the GGML_BACKEND_DL=ON Android/Linux builds +# explicitly do not have (their backends live in separately-loadable +# `.so` files that are dlopen()'d by `ggml_backend_load_all_from_path` +# at runtime). add_library(parakeet-backend-defs INTERFACE) -if (GGML_CUDA) - target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_CUDA) -endif() -if (GGML_METAL) - target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_METAL) -endif() -if (GGML_VULKAN) - target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_VULKAN) -endif() -if (GGML_BLAS) - target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_BLAS) -endif() -if (GGML_OPENCL) - target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_OPENCL) -endif() set(PARAKEET_LIB_SOURCES src/parakeet_ctc.cpp @@ -421,28 +444,14 @@ if (PARAKEET_BUILD_TESTS) endif() endfunction() - # Helper: keep the parakeet_ctc.cpp #ifdefs (BLAS / CUDA / Metal / Vulkan / - # OpenCL backend init) consistent across the parakeet library and any - # test executable that compiles parakeet_ctc.cpp from source. Without this, - # tests that don't link the library would always evaluate the #ifdefs as - # "no backend defined", producing link errors against ggml-blas / ggml-vk - # / ggml-opencl when the parent build did enable them. + # Helper: keep PARAKEET_EXPERIMENTAL_FLASH_ATTN consistent across the + # parakeet library and any test executable that recompiles + # parakeet_ctc.cpp from source. 
Backend selection itself goes + # through the ggml-backend registry (no per-backend `GGML_USE_*` + # #ifdef cascade in parakeet_ctc.cpp anymore -- see the comment on + # `parakeet-backend-defs` above), so this helper only carries the + # flash-attn gate plus the shared ccache launcher. function(parakeet_apply_backend_defs target) - if (GGML_BLAS) - target_compile_definitions(${target} PRIVATE GGML_USE_BLAS) - endif() - if (GGML_CUDA) - target_compile_definitions(${target} PRIVATE GGML_USE_CUDA) - endif() - if (GGML_METAL) - target_compile_definitions(${target} PRIVATE GGML_USE_METAL) - endif() - if (GGML_VULKAN) - target_compile_definitions(${target} PRIVATE GGML_USE_VULKAN) - endif() - if (GGML_OPENCL) - target_compile_definitions(${target} PRIVATE GGML_USE_OPENCL) - endif() if (PARAKEET_FLASH_ATTN) target_compile_definitions(${target} PRIVATE PARAKEET_EXPERIMENTAL_FLASH_ATTN) endif() diff --git a/parakeet-cpp/include/parakeet/engine.h b/parakeet-cpp/include/parakeet/engine.h index 9aa9f9616c6..236157e1746 100644 --- a/parakeet-cpp/include/parakeet/engine.h +++ b/parakeet-cpp/include/parakeet/engine.h @@ -81,6 +81,46 @@ struct EngineOptions { bool verbose = false; + // Directory to scan for dynamically-loaded ggml backends + // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`, + // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to + // `ggml_backend_load_all_from_path()` on the first Engine + // construction in the process; subsequent constructions reuse the + // already-populated registry. + // + // Leave empty to fall back to ggml's default search path + // (`ggml_backend_load_all()`), which walks compile-time defaults + // (`$EXE_DIR`, `LD_LIBRARY_PATH`, ...). 
Embedded host applications + // built with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple + // default; see CMakeLists.txt) should pass an explicit dir + // because the .so files ship next to the host's binary in a + // platform-specific subfolder rather than on the system loader's + // path. + // + // No-op on builds where ggml is statically linked + // (`GGML_BACKEND_DL=OFF`, e.g. desktop dev cmake builds and the + // Apple xcframework). On those, every backend is registered at + // constructor time from inside libggml and no filesystem scan + // takes place. + std::string backends_dir; + + // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so + // ggml-opencl persists `clCreateProgramWithBinary` blobs across + // process restarts (see the program-binary-cache patch on + // qvac-ext-ggml@speech). Strongly recommended on Android where + // the cold `clBuildProgram` cost dominates first-utterance + // latency; pass a writable per-app directory (typically the + // app's `cacheDir` from the host platform). + // + // Honoured only on `__ANDROID__` builds; ignored elsewhere + // (desktop OpenCL platforms don't ship the binary-cache patch + // and would otherwise pollute the user's tmpdir). + // + // Leave empty to keep the existing `$GGML_OPENCL_CACHE_DIR` env + // value (or no cache at all). Wrapper scripts that already + // export the env take precedence. + std::string opencl_cache_dir; + // Opt-in cold-start mitigation. 
// // When `prewarm == true`, the Engine constructor runs one diff --git a/parakeet-cpp/patches/README.md b/parakeet-cpp/patches/README.md deleted file mode 100644 index d55e53e27a4..00000000000 --- a/parakeet-cpp/patches/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# ggml patches for parakeet.cpp - -`ggml` is vendored as a pristine upstream clone (see the top-level -[`README.md`](../README.md) and [`scripts/setup-ggml.sh`](../scripts/setup-ggml.sh)), -so the local fixes parakeet.cpp depends on live here as standalone -patches and are applied after the clone. - -Three patches ship today: - -1. [`ggml-backend-reg-filename-prefix.patch`](#ggml-backend-reg-filename-prefixpatch) - — teaches `ggml_backend_load_best()` to honour a compile-time - `GGML_BACKEND_DL_PROJECT_PREFIX` macro, so renaming the bundled - backend .so/.dll files (parakeet does this to avoid colliding with - another consumer's `libggml-*` files in the same host process) does - not break runtime backend discovery under `GGML_BACKEND_DL=ON`. - No-op when the macro is undefined. -2. [`ggml-opencl-allow-non-adreno.patch`](#ggml-opencl-allow-non-adrenopatch) - — lets the OpenCL backend bring up on commodity desktop GPUs - (NVIDIA, AMD, Apple) so `parakeet.cpp` can be built and parity- - tested with `-DGGML_OPENCL=ON` outside Adreno-only environments. - No-op on real Adreno targets (the patch only relaxes the rejection - of unknown GPU vendors and the assertion in - `ggml_backend_opencl_init()` when no devices were found). -3. [`ggml-opencl-program-binary-cache.patch`](#ggml-opencl-program-binary-cachepatch) - — adds a persistent on-disk cache for compiled OpenCL kernel - binaries, removing the multi-second `clBuildProgram` wave at every - cold start. Honours `$GGML_OPENCL_CACHE_DIR`, with - `$XDG_CACHE_HOME/ggml/opencl` → `$HOME/.cache/ggml/opencl` - fallbacks. Opt-out via `GGML_OPENCL_CACHE_DIR=""`. 
- -`scripts/setup-ggml.sh` applies every `patches/ggml-*.patch` in -lexicographic order; the script is idempotent and resets the ggml -worktree to the pinned commit before applying. - -## Apply - -The top-level [`scripts/setup-ggml.sh`](../scripts/setup-ggml.sh) does -everything for you: - -```bash -# From the repo root. Clones ggml if needed, checks out the pinned -# commit, and applies every patch under patches/. Idempotent -- -# re-running is a no-op. -./scripts/setup-ggml.sh -``` - -Then configure + build as usual. Pick the backend flags for your -platform; OpenCL pulls in the patch automatically: - -```bash -# Apple Silicon -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON - -# NVIDIA / desktop -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON - -# Vulkan (anything else) -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON - -# OpenCL: Adreno (Android) target -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_OPENCL=ON - -# OpenCL: NVIDIA / AMD / Apple desktop (dev / CI parity testing) -- -# Adreno-tuned matmul kernels OFF, generic OpenCL paths only: -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=OFF -``` - -If you'd rather run the steps by hand (e.g. to pin a different -upstream commit), the script is effectively: - -```bash -git clone https://github.com/ggml-org/ggml.git ggml -cd ggml && git checkout $GGML_COMMIT -git apply ../patches/ggml-backend-reg-filename-prefix.patch -git apply ../patches/ggml-opencl-allow-non-adreno.patch -git apply ../patches/ggml-opencl-program-binary-cache.patch -``` - -`GGML_COMMIT` lives at the top of `scripts/setup-ggml.sh` as the -single source of truth -- bump it when re-generating the patches -against a newer upstream ggml. 
To confirm everything applied -cleanly: - -```bash -(cd ggml && git status --short) -# Expected: 2 modified files -# ggml/src/ggml-backend-reg.cpp (filename-prefix patch) -# ggml/src/ggml-opencl/ggml-opencl.cpp (both OpenCL patches stack on this file) -``` - -CPU / CUDA / Metal / Vulkan builds get the pinned commit and the -filename-prefix patch (which is a strict no-op when the host -project does not define `GGML_BACKEND_DL_PROJECT_PREFIX`); the -OpenCL changes are no-op for every other backend. - -## `ggml-backend-reg-filename-prefix.patch` - -Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09). - -Adds a single compile-time switch -`GGML_BACKEND_DL_PROJECT_PREFIX` to `ggml_backend_load_best()` so -the runtime backend-discovery walk can be retargeted at the -filename prefix used by a host project that renames the bundled -`libggml-*` files to avoid colliding with another consumer's -`libggml-*` files in the same host process. - -Background: parakeet ships its bundled ggml backends as -`libspeech-ggml-*.{so,dll}` (CMake option -`PARAKEET_GGML_LIB_PREFIX=ON`, default) so a host process that -loads two consumers each vendoring its own ggml does not see a -name clash on `libggml-vulkan.so` / `libggml-cuda.so` / etc. The -`speech-` prefix is shared with the rest of the QVAC speech stack -(whisper, parakeet, chatterbox, supertonic, ...) so the family -co-vendors a single ggml file set. -Without this patch, the rename works at link time but -`ggml_backend_load_best()` still searches for `libggml-*.so` / -`ggml-*.dll`, so under `GGML_BACKEND_DL=ON` the renamed files are -on disk but never discovered and Vulkan/OpenCL/CUDA backends -silently fail to load. - -| Symptom | Root cause | What this patch does | -|---------|-----------|----------------------| -| `speech-ggml-vulkan.so` (etc.) 
is on disk but ggml's loader never picks it up under `GGML_BACKEND_DL=ON` | `backend_filename_prefix()` hard-codes `libggml-` / `ggml-` and `ggml_backend_load_best` filters directory entries by that fixed prefix | Honour an optional compile-time `GGML_BACKEND_DL_PROJECT_PREFIX` string literal (e.g. `"speech-"`); when defined, the loader searches for `lib<prefix>ggml-*` / `<prefix>ggml-*` instead. Macro undefined ⇒ behaviour byte-equal to upstream. | - -The CMake side wires the macro from `PARAKEET_GGML_LIB_PREFIX`: -when that option is on (the default), parakeet's top-level -`CMakeLists.txt` does -`target_compile_definitions(ggml PRIVATE GGML_BACKEND_DL_PROJECT_PREFIX="speech-")` -on the `ggml` target (which is what compiles -`ggml-backend-reg.cpp`). Consumers that prefer the upstream -filenames (system ggml, single-consumer hosts) configure with -`-DPARAKEET_GGML_LIB_PREFIX=OFF` and the macro stays undefined, -so the loader behaviour matches stock ggml exactly. - -## `ggml-opencl-allow-non-adreno.patch` - -Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09). 
- -Fixes two gaps in `ggml-opencl` that make `-DGGML_OPENCL=ON` builds of -`parakeet.cpp` impossible to bring up outside an Adreno-only -environment: - -| Symptom | Root cause in `ggml-opencl` | What this patch does | -|--------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Every NVIDIA / AMD / Apple OpenCL device is dropped at init with `Unsupported GPU: ` | `ggml_cl2_init()` whitelists `Adreno` / `Qualcomm` / `Intel` and returns `nullptr` for everything else. Even with `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`, a non-Adreno GPU never reaches the generic kernels. | Default behaviour is byte-equal to upstream (still returns `nullptr`). Set `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` to opt the device through with `GPU_FAMILY::UNKNOWN`; we additionally require `cl_intel_required_subgroup_size` *or* `cl_qcom_reqd_sub_group_size` (the matmul-vec kernels need one to define `N_DST`/`N_SIMDGROUP`/`N_SIMDWIDTH`), so AMD/NVIDIA still fall back to host instead of crashing in `clBuildProgram`. | -| `parakeet --n-gpu-layers 1` aborts with `GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg))` when zero usable devices were found | `ggml_backend_opencl_init()` calls `ggml_backend_reg_dev_get(reg, 0)` unconditionally. When the device discovery cleared the list (e.g. only an unsupported GPU was present), `dev_get(0)` asserts and the host process aborts. 
parakeet's `init_gpu_backend()` cascade expects a nullable result so it can fall back. | Check `ggml_backend_reg_dev_count(reg) == 0` before `dev_get` and return `nullptr` on empty. Also propagate `nullptr` when `ggml_cl2_init()` rejects the device, so the host-side fallback path actually runs. | - -The patch is **strictly additive** for real Adreno targets: -`gpu_family == ADRENO` is computed exactly as before, the Adreno -shuffle / large-buffer paths still trigger when (and only when) the -device is Adreno, and without `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` the -non-Adreno reject path is byte-equal to upstream so production Android -builds get the same compile-time guarantees as before. - -The intended audience for the patch is: - - * `parakeet.cpp` developers running CI on Intel iGPU desktop - hardware (the matmul-vec kernels gate on - `cl_intel_required_subgroup_size`, so Intel iGPU is the only - desktop class that can actually execute the OpenCL kernels; - AMD/NVIDIA users get a clean CPU fallback instead of crashing - inside `clBuildProgram`). - * Anyone who wants to reproduce the OpenCL backend's mel/encoder - parity numbers without an Adreno device. - -Opt-in is gated behind `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` so misconfigured -production builds still get the same explicit `Unsupported GPU` error -upstream returned, instead of a silent "running with an untested GPU". - -It is **not** intended to ship a fast OpenCL path on NVIDIA / AMD / -Apple desktops (CUDA / Vulkan / Metal are far better suited there); -its only purpose is bring-up + parity testing. - -## `ggml-opencl-program-binary-cache.patch` - -Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09). - -Adds a persistent on-disk cache for compiled OpenCL kernel binaries -to `ggml-opencl`. 
Upstream `build_program_from_source()` calls -`clCreateProgramWithSource` + `clBuildProgram` on every cold start, -re-paying the driver's shader-compile wave (multiple seconds on -Adreno / Mesa / Mali; tens of ms on most desktop drivers). This -patch drops the call to `clCreateProgramWithBinary` against a -device-specific cache blob whenever one exists, and persists every -freshly-compiled program back to disk on miss. - -| Symptom | Root cause | What this patch does | -|----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| Every cold-start `parakeet --n-gpu-layers 1` re-compiles all 88 OpenCL kernels | `build_program_from_source` always calls `clCreateProgramWithSource` + `clBuildProgram` | Look up `/.bin` first via `clCreateProgramWithBinary`; only fall through to source compile on miss | -| Hosts already `setenv` `GGML_OPENCL_CACHE_DIR` for the same goal, but ggml-opencl ignores it | The env var is read **nowhere** in upstream ggml-opencl at this commit | Resolves cache dir from `$GGML_OPENCL_CACHE_DIR` → `$XDG_CACHE_HOME/ggml/opencl` → `$HOME/.cache/ggml/opencl`, so the env-var contract takes effect. | - -### Cache key - -`____.bin`, -where each component is FNV-1a-64. Each kernel's `program_buffer` -hashes independently (88 different cache files per device); a -driver upgrade or moving to a different device silently invalidates -the cache because either `driver_hash` or `dev_*_hash` changes. -There is no manual invalidation step. - -### Atomic writes - -The cache writer dumps `getProgramInfo(CL_PROGRAM_BINARIES)` to -`.tmp` then `rename(2)`s into place. 
POSIX rename is atomic, -so concurrent processes can't read a half-written file; the -last-writer-wins result is fine because each blob is independently -valid for the same `(src, opts, driver, dev)` combination. - -### Footprint - -Each kernel binary lands at ~10-200 KB on Adreno (driver-dependent); -88 kernels × ~50 KB average ≈ 4-5 MB on disk per device per process -family. No size cap on disk today -- if it ever becomes a concern -on tightly-budgeted mobile installs, wrap the writer with a -ceiling. - -### Opt-out / disable - -`GGML_OPENCL_CACHE_DIR=""` (literal empty string) short-circuits -both the read and the write paths and runs the original -source-compile route. Useful for benchmarking the cold-start cost, -or in a CI runner that wants every run to re-compile. - -When the cache dir resolves but `mkdir -p` fails (read-only -filesystem, permissions, ...), the writer logs nothing and falls -through to source compile silently -- no behavioural difference -versus running with the patch absent. - -### Stale-cache handling - -`clCreateProgramWithBinary` can return `CL_INVALID_BINARY` (or the -subsequent `clBuildProgram` can fail) when the on-disk blob is -stale (driver upgrade, different shader IR version, mismatched -device). The patch handles every such failure by releasing the -program and falling through to source compile. The next run then -overwrites the bad blob. - -### Measured impact - -This patch is **not yet benchmarked on a real Adreno device**: the -benchmark hosts the patch was developed on are NVIDIA-only, and -NVIDIA's OpenCL driver lacks the fp16 / OpenCL C 2.0 features -ggml-opencl mandates -- the kernels never compile at all there, so -there is nothing to cache. Expected impact: - - * **Cold start (no cache)**: same as upstream -- multi-second - shader compile wave on Adreno. - * **Warm cache** (any subsequent invocation): saves the entire - `clBuildProgram` wave; typical Adreno saving is multiple - seconds per process. 
- -Once Adreno hardware is available for follow-up benchmarking, the -expected bench shape is the standard pipeline-cache curve: -cold ≫ ggml-warm ≈ both-warm. - -## Dropping the patches - -If upstream ggml-opencl decides to relax the GPU-vendor whitelist -itself, or ships its own kernel binary cache, delete the patch -file(s) and remove the corresponding entry from the `PATCHES=(…)` -glob in `scripts/setup-ggml.sh`. The C++ side of parakeet uses -only ops that ggml-opencl already supports natively (per the -op-coverage audit), so nothing else needs to change. diff --git a/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch b/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch deleted file mode 100644 index e5e824e592c..00000000000 --- a/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch +++ /dev/null @@ -1,35 +0,0 @@ -diff --git a/src/ggml-backend-reg.cpp b/src/ggml-backend-reg.cpp ---- a/src/ggml-backend-reg.cpp -+++ b/src/ggml-backend-reg.cpp -@@ -442,12 +442,31 @@ static std::string get_executable_path() { - #endif - } - -+// parakeet patch: allow consuming projects to override the backend -+// shared-library filename prefix at compile time. Without this, the -+// loader hard-codes "ggml-" (Windows) / "libggml-" (other), so two -+// addons that vendor different ggml versions and rename their bundled -+// backend .so/.dll files to avoid filename collisions still cannot be -+// loaded with `GGML_BACKEND_DL=ON`: the discovery walk in -+// `ggml_backend_load_best` only matches the unprefixed names. Define -+// `GGML_BACKEND_DL_PROJECT_PREFIX` (a string literal, e.g. -+// "speech-") at compile time and the loader will instead search for -+// "<prefix>ggml-*" / "lib<prefix>ggml-*". Default behaviour (macro -+// undefined) is byte-equal to upstream. 
- static fs::path backend_filename_prefix() { -+#if defined(GGML_BACKEND_DL_PROJECT_PREFIX) -+#ifdef _WIN32 -+ return fs::u8path(GGML_BACKEND_DL_PROJECT_PREFIX "ggml-"); -+#else -+ return fs::u8path("lib" GGML_BACKEND_DL_PROJECT_PREFIX "ggml-"); -+#endif -+#else - #ifdef _WIN32 - return fs::u8path("ggml-"); - #else - return fs::u8path("libggml-"); - #endif -+#endif - } - - static fs::path backend_filename_extension() { diff --git a/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch b/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch deleted file mode 100644 index 458c10f8768..00000000000 --- a/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch +++ /dev/null @@ -1,91 +0,0 @@ -diff --git a/src/ggml-opencl/ggml-opencl.cpp b/src/ggml-opencl/ggml-opencl.cpp -index 6f3fc588..96942915 100644 ---- a/src/ggml-opencl/ggml-opencl.cpp -+++ b/src/ggml-opencl/ggml-opencl.cpp -@@ -3020,9 +3020,57 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { - } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) { - backend_ctx->gpu_family = GPU_FAMILY::INTEL; - } else { -- GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); -+ // parakeet patch: upstream ggml-opencl rejects any GPU that is -+ // not Adreno/Qualcomm or Intel. Parakeet's real OpenCL deployment -+ // target is Adreno (Android); for desktop dev/CI parity on Intel -+ // iGPUs we let the device through with `gpu_family = UNKNOWN` -+ // when the host opts in via `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1`. -+ // -+ // Default (env var unset) preserves upstream behaviour byte-equal, -+ // so production Adreno builds get no behavioural change and a -+ // misconfigured non-Adreno consumer gets the same clear error as -+ // before instead of crashing later in kernel-compile. -+ // -+ // The matmul-vec kernels (mul_mv_q4_0_f32_v.cl etc.) auto-define -+ // INTEL_GPU / ADRENO_GPU based on `cl_intel_required_subgroup_size` -+ // / `cl_qcom_reqd_sub_group_size`. 
Without one of those extensions -+ // the kernel source has no way to define N_DST / N_SIMDGROUP / -+ // N_SIMDWIDTH and `clBuildProgram` aborts the host process. So we -+ // additionally require one of those two extensions before letting -+ // the device through; AMD/NVIDIA desktop drivers expose neither -+ // and now fall back cleanly to CPU instead of crashing. -+ const char * allow = getenv("GGML_OPENCL_ALLOW_UNKNOWN_GPU"); -+ if (!allow || allow[0] != '1') { -+ GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); -+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; -+ return nullptr; -+ } -+ -+ size_t ext_size = 0; -+ clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_size); -+ std::string ext; -+ if (ext_size > 0) { -+ ext.resize(ext_size); -+ clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_size, ext.data(), NULL); -+ } -+ const bool has_intel_sg = ext.find("cl_intel_required_subgroup_size") != std::string::npos; -+ const bool has_qcom_sg = ext.find("cl_qcom_reqd_sub_group_size") != std::string::npos; -+ if (!has_intel_sg && !has_qcom_sg) { -+ GGML_LOG_ERROR("ggml_opencl: GPU '%s' has neither cl_intel_required_subgroup_size " -+ "nor cl_qcom_reqd_sub_group_size; matmul-vec kernels cannot define " -+ "N_DST/N_SIMDGROUP/N_SIMDWIDTH and clBuildProgram would abort. " -+ "Falling back to host (parakeet patch).\n", -+ dev_ctx->device_name.c_str()); -+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; -+ return nullptr; -+ } -+ -+ GGML_LOG_WARN("ggml_opencl: GPU '%s' is not Adreno/Qualcomm or Intel; " -+ "running with generic OpenCL kernels (parakeet patch + " -+ "GGML_OPENCL_ALLOW_UNKNOWN_GPU=1). 
" -+ "Adreno-specific kernels and large-buffer paths stay off.\n", -+ dev_ctx->device_name.c_str()); - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; -- return nullptr; - } - - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS -@@ -4075,8 +4123,25 @@ static ggml_backend_i ggml_backend_opencl_i = { - }; - - ggml_backend_t ggml_backend_opencl_init(void) { -- ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0); -+ // parakeet patch: bail out cleanly when the OpenCL backend -+ // discovery saw zero usable devices. Upstream calls -+ // ggml_backend_reg_dev_get() unconditionally, which asserts on an -+ // empty device list. Parakeet's host code expects a nullable result -+ // from ggml_backend_opencl_init() (it falls back to CPU when the -+ // returned backend is null); the assertion makes that fallback path -+ // unreachable on hosts where ggml-opencl can't find any GPU it -+ // accepts (Adreno-only environments without an Adreno device, -+ // headless CI runners, etc.). -+ ggml_backend_reg_t reg = ggml_backend_opencl_reg(); -+ if (ggml_backend_reg_dev_count(reg) == 0) { -+ return nullptr; -+ } -+ -+ ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0); - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); -+ if (backend_ctx == nullptr) { -+ return nullptr; -+ } - - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_opencl_guid(), diff --git a/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch b/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch deleted file mode 100644 index bdf15bf2169..00000000000 --- a/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch +++ /dev/null @@ -1,269 +0,0 @@ -diff --git a/src/ggml-opencl/ggml-opencl.cpp b/src/ggml-opencl/ggml-opencl.cpp -index 96942915..7c2e4bc2 100644 ---- a/src/ggml-opencl/ggml-opencl.cpp -+++ b/src/ggml-opencl/ggml-opencl.cpp -@@ -20,6 +20,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -29,6 +30,32 @@ - #include 
- #include - -+// parakeet patch: persistent kernel binary cache support. The -+// helpers below sit on POSIX file primitives (mkdir/unlink/fsync) but -+// also need to build on MinGW / MSVC where those names map to the -+// `_`-prefixed Windows variants and mkdir takes a single argument. -+// Wrap them in parakeet_* macros so the rest of the patch stays -+// platform-agnostic. -+#include -+#include -+#include -+#ifdef _WIN32 -+# include -+# include -+# define parakeet_mkdir(path) _mkdir(path) -+# define parakeet_unlink(path) _unlink(path) -+# define parakeet_open_ro(path) _open((path), _O_RDONLY | _O_BINARY) -+# define parakeet_close(fd) _close(fd) -+# define parakeet_fsync(fd) _commit(fd) -+#else -+# include -+# define parakeet_mkdir(path) mkdir((path), 0755) -+# define parakeet_unlink(path) unlink(path) -+# define parakeet_open_ro(path) open((path), O_RDONLY) -+# define parakeet_close(fd) close(fd) -+# define parakeet_fsync(fd) fsync(fd) -+#endif -+ - #undef MIN - #undef MAX - #define MIN(a, b) ((a) < (b) ? (a) : (b)) -@@ -755,6 +782,193 @@ inline std::string read_file(const std::string &path) { - return text; - } - -+// parakeet patch: persistent OpenCL kernel-binary cache. -+// ggml-opencl as shipped at this commit JIT-compiles every embedded -+// kernel via `clBuildProgram(clCreateProgramWithSource)` on each cold -+// start. On Adreno that's tens of seconds of shader compile per -+// process invocation; on Mesa / Mali / iGPU drivers it's similar. -+// This patch caches the device-specific compiled binaries under -+// `$GGML_OPENCL_CACHE_DIR` (or `$XDG_CACHE_HOME/ggml/opencl` → -+// `$HOME/.cache/ggml/opencl` fallback) keyed on a 64-bit FNV-1a hash of -+// (source + compile_opts + driver_version + device_name + ggml_commit). -+// Cache hit -> `clCreateProgramWithBinary`; miss / corrupted blob -> -+// fall through to source compile and write the resulting binary back. 
-+// -+// The opt-out path is `GGML_OPENCL_CACHE_DIR=""` (empty string) which -+// short-circuits the cache and runs the original source path. With no -+// cache directory writable, the helper logs a warning and falls -+// through to source compile silently. -+// -+// Hosts that already `setenv("GGML_OPENCL_CACHE_DIR", ...)` to point -+// the runtime at a writable location (typical pattern on Android -+// Adreno deployments) get the cache for free; this patch makes that -+// env-var contract take effect rather than being ignored upstream. -+ -+static uint64_t fnv1a_hash64(const void * data, size_t n) { -+ const uint8_t * p = static_cast(data); -+ uint64_t h = 0xcbf29ce484222325ULL; -+ for (size_t i = 0; i < n; ++i) { -+ h ^= p[i]; -+ h *= 0x100000001b3ULL; -+ } -+ return h; -+} -+ -+static std::string opencl_cache_dir(cl_device_id dev) { -+ const char * env = getenv("GGML_OPENCL_CACHE_DIR"); -+ if (env && *env == '\0') return ""; // explicit opt-out: empty string -+ if (env && *env != '\0') return env; -+ if (const char * xdg = getenv("XDG_CACHE_HOME"); xdg && *xdg) { -+ return std::string(xdg) + "/ggml/opencl"; -+ } -+ if (const char * home = getenv("HOME"); home && *home) { -+ return std::string(home) + "/.cache/ggml/opencl"; -+ } -+ GGML_UNUSED(dev); -+ return ""; // no plausible default; opt out gracefully -+} -+ -+static bool opencl_mkdir_p(const std::string & path) { -+ // Lightweight `mkdir -p` without C++17 dep on the -+ // ggml-opencl side (some downstream consumers compile against -+ // libstdc++ versions where std::filesystem requires linking -+ // -lstdc++fs explicitly). Returns true if the directory exists -+ // afterwards. -+ if (path.empty()) return false; -+ std::string cur; -+ cur.reserve(path.size()); -+ for (size_t i = 0; i <= path.size(); ++i) { -+ const char c = i < path.size() ? 
path[i] : '/'; -+ if ((c == '/' || c == '\\') && !cur.empty()) { -+ if (parakeet_mkdir(cur.c_str()) != 0 && errno != EEXIST) { -+ return false; -+ } -+ } -+ if (i < path.size()) cur.push_back(c); -+ } -+ return true; -+} -+ -+static std::string opencl_cache_key(const char * program_buffer, -+ size_t program_size, -+ const std::string & compile_opts, -+ cl_device_id dev) { -+ // Combine source + opts + device + driver into the cache key so a -+ // driver bump or a different SoC reuses different blobs. We hash -+ // each component separately and combine to avoid pathological -+ // FNV behaviour on long buffers. -+ uint64_t h_src = fnv1a_hash64(program_buffer, program_size); -+ uint64_t h_opts = fnv1a_hash64(compile_opts.data(), compile_opts.size()); -+ -+ // Driver version + device name + OpenCL C version pinpoint the -+ // driver instance the binary was emitted by. Pinpointing too -+ // tightly is a feature: a driver bump silently invalidates the -+ // cache, exactly the policy you want. -+ char driver_buf[256] = {0}; -+ char devname_buf[256] = {0}; -+ char devver_buf[256] = {0}; -+ size_t n; -+ clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_buf) - 1, driver_buf, &n); -+ clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(devname_buf) - 1, devname_buf, &n); -+ clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(devver_buf) - 1, devver_buf, &n); -+ uint64_t h_drv = fnv1a_hash64(driver_buf, strlen(driver_buf)); -+ uint64_t h_dev = fnv1a_hash64(devname_buf, strlen(devname_buf)); -+ uint64_t h_devver = fnv1a_hash64(devver_buf, strlen(devver_buf)); -+ -+ // Five 16-char hex tokens + 4 underscores + ".bin" + NUL = 89 bytes. -+ // Use PRIx64 + (uint64_t) so the format-spec width is correct on -+ // both LP64 (Linux/Android) and LLP64 (Windows MinGW/MSVC) where -+ // `unsigned long` is 32 bits and `%016lx` would silently truncate -+ // the upper half of each FNV hash. 
-+ char buf[128]; -+ std::snprintf(buf, sizeof(buf), -+ "%016" PRIx64 "_%016" PRIx64 "_%016" PRIx64 -+ "_%016" PRIx64 "_%016" PRIx64 ".bin", -+ h_src, h_opts, h_drv, h_dev, h_devver); -+ return buf; -+} -+ -+static cl_program opencl_build_program_with_cache(cl_context ctx, -+ cl_device_id dev, -+ const char * program_buffer, -+ size_t program_size, -+ const std::string & compile_opts, -+ const std::string & cache_dir, -+ const std::string & key) { -+ if (cache_dir.empty() || key.empty()) return nullptr; -+ const std::string path = cache_dir + "/" + key; -+ std::ifstream ifs(path, std::ios::binary); -+ if (!ifs) return nullptr; -+ ifs.seekg(0, std::ios::end); -+ const std::streamsize n = ifs.tellg(); -+ if (n <= 0) return nullptr; -+ ifs.seekg(0, std::ios::beg); -+ std::vector blob((size_t) n); -+ if (!ifs.read(reinterpret_cast(blob.data()), n)) return nullptr; -+ -+ cl_int err1 = CL_SUCCESS, err2 = CL_SUCCESS; -+ const unsigned char * data = blob.data(); -+ const size_t len = blob.size(); -+ cl_program p = clCreateProgramWithBinary(ctx, 1, &dev, &len, &data, &err1, &err2); -+ if (err1 != CL_SUCCESS || err2 != CL_SUCCESS || !p) { -+ if (p) clReleaseProgram(p); -+ return nullptr; -+ } -+ if (clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL) != CL_SUCCESS) { -+ clReleaseProgram(p); -+ return nullptr; -+ } -+ GGML_UNUSED(program_buffer); -+ GGML_UNUSED(program_size); -+ return p; -+} -+ -+static void opencl_save_program_binary(cl_program p, cl_device_id /*dev*/, -+ const std::string & cache_dir, -+ const std::string & key) { -+ if (cache_dir.empty() || key.empty()) return; -+ if (!opencl_mkdir_p(cache_dir)) return; -+ -+ size_t bin_size = 0; -+ if (clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), -+ &bin_size, nullptr) != CL_SUCCESS || bin_size == 0) return; -+ std::vector blob(bin_size); -+ unsigned char * blob_ptr = blob.data(); -+ if (clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(unsigned char *), -+ &blob_ptr, nullptr) != CL_SUCCESS) 
return; -+ -+ // Atomic write: tmp + fsync + rename. Without the fsync the kernel -+ // can flush blocks out of order on power loss, leaving the renamed -+ // file pointing at zero/garbage data and forcing the next process -+ // into the source-compile fallback (and the bad blob lives forever -+ // unless explicitly invalidated). -+ const std::string final_path = cache_dir + "/" + key; -+ const std::string tmp_path = final_path + ".tmp"; -+ { -+ std::ofstream ofs(tmp_path, std::ios::binary); -+ if (!ofs) return; -+ ofs.write(reinterpret_cast(blob.data()), (std::streamsize) blob.size()); -+ ofs.close(); -+ if (!ofs) { parakeet_unlink(tmp_path.c_str()); return; } -+ } -+ { -+ int fd = parakeet_open_ro(tmp_path.c_str()); -+ if (fd >= 0) { -+ parakeet_fsync(fd); -+ parakeet_close(fd); -+ } -+ } -+ // Windows rename() refuses to overwrite an existing destination, so -+ // unlink it first. POSIX rename is atomic and replaces silently; -+ // the redundant unlink there is a no-op when the target is missing. -+#ifdef _WIN32 -+ parakeet_unlink(final_path.c_str()); -+#endif -+ if (rename(tmp_path.c_str(), final_path.c_str()) != 0) { -+ parakeet_unlink(tmp_path.c_str()); -+ } -+} -+ - static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { - cl_program p; - char *program_log; -@@ -764,6 +978,17 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co - - program_size = strlen(program_buffer); - -+ // parakeet patch: try the persistent cache first. -+ const std::string cache_dir = opencl_cache_dir(dev); -+ const std::string cache_key = cache_dir.empty() -+ ? 
std::string() -+ : opencl_cache_key(program_buffer, program_size, compile_opts, dev); -+ if (cl_program cached = opencl_build_program_with_cache( -+ ctx, dev, program_buffer, program_size, compile_opts, -+ cache_dir, cache_key)) { -+ return cached; -+ } -+ - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - if(err < 0) { - GGML_LOG_ERROR("OpenCL error creating program"); -@@ -781,6 +1006,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co - exit(1); - } - -+ // parakeet patch: save the freshly compiled binary. Fast path -+ // (cache hit) above avoids re-compiling next time. Failures here -+ // are non-fatal -- next process just re-pays the compile cost. -+ opencl_save_program_binary(p, dev, cache_dir, cache_key); -+ - return p; - } - diff --git a/parakeet-cpp/scripts/setup-ggml.sh b/parakeet-cpp/scripts/setup-ggml.sh index 69674f5e577..dccf0c36dec 100644 --- a/parakeet-cpp/scripts/setup-ggml.sh +++ b/parakeet-cpp/scripts/setup-ggml.sh @@ -1,113 +1,63 @@ #!/usr/bin/env bash -# Clone ggml into ./ggml at the commit this repo is pinned against, and -# apply every patch under patches/ in lexicographic order. Idempotent: -# safe to re-run. +# Clone qvac-ext-ggml into ./ggml on the branch this repo is pinned against. +# Idempotent: safe to re-run. # -# Update GGML_COMMIT here whenever the pin is bumped; this file is the -# single source of truth for which upstream ggml parakeet.cpp builds +# Update GGML_URL / GGML_BRANCH here whenever the pin is bumped; this file +# is the single source of truth for which ggml fork parakeet.cpp builds # against. 
# -# Patches we ship today: -# patches/ggml-backend-reg-filename-prefix.patch -# Teaches ggml_backend_load_best() to honour a compile-time -# GGML_BACKEND_DL_PROJECT_PREFIX macro so renaming the bundled -# backend .so/.dll files (PARAKEET_GGML_LIB_PREFIX=ON, the default, -# emits libspeech-ggml-*.so) does not break runtime backend -# discovery under GGML_BACKEND_DL=ON. No-op when the macro is -# undefined. -# patches/ggml-opencl-allow-non-adreno.patch -# Lets the ggml-opencl backend run on non-Adreno/Intel GPUs -# (NVIDIA, AMD, Apple) so the build can be parity-tested on -# commodity desktop hardware. Real Adreno deployments build with -# the patch applied as a no-op (Adreno path is unchanged). -# patches/ggml-opencl-program-binary-cache.patch -# Persistent OpenCL kernel binary cache via clCreateProgramWithBinary + -# CL_PROGRAM_BINARIES. Removes seconds of cold-start shader compile on -# every Adreno / Mesa / Mali / iGPU launch by serialising compiled -# kernels under $GGML_OPENCL_CACHE_DIR (or XDG/HOME fallback). -# See patches/README.md for the full rationale. +# qvac-ext-ggml's `speech` branch carries the equivalents of the patches +# that used to live under patches/ggml-*.patch (backend-reg filename +# prefix, opencl non-Adreno support, opencl program binary cache). The +# script therefore does not apply local patches anymore. set -euo pipefail -GGML_COMMIT="58c38058" -GGML_URL="https://github.com/ggml-org/ggml.git" +GGML_URL="https://github.com/tetherto/qvac-ext-ggml.git" +GGML_BRANCH="speech" REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" cd "$REPO_ROOT" -echo "parakeet.cpp: setting up ggml at pinned commit ${GGML_COMMIT}" +echo "parakeet.cpp: setting up ggml from ${GGML_URL} (branch: ${GGML_BRANCH})" -if [ ! -d ggml/.git ]; then - echo " -> cloning ${GGML_URL}" - git clone "$GGML_URL" ggml +if [ -e ggml ] && [ ! 
-d ggml/.git ]; then + if [ -L ggml ]; then + echo " -> ggml is a symlink to '$(readlink ggml)'; leaving it alone" + echo " (delete the symlink and re-run this script to clone fresh)" + exit 0 + fi + echo " ERROR: ./ggml exists but is not a git checkout." >&2 + echo " Remove it and re-run this script." >&2 + exit 1 fi -# Find every patch under patches/ matching ggml-*.patch, sorted. -shopt -s nullglob -PATCHES=( "$REPO_ROOT"/patches/ggml-*.patch ) -shopt -u nullglob +if [ ! -d ggml/.git ]; then + echo " -> cloning ${GGML_URL} (branch ${GGML_BRANCH})" + git clone --branch "$GGML_BRANCH" "$GGML_URL" ggml +fi cd ggml -CURRENT="$(git rev-parse --short=8 HEAD 2>/dev/null || echo '')" -NEED_CHECKOUT="0" -if [ "$CURRENT" != "$GGML_COMMIT" ]; then - NEED_CHECKOUT="1" +# Make sure the local checkout actually has the requested branch fetched +# (e.g. when the user previously cloned with a narrow refspec). +if ! git rev-parse --verify --quiet "refs/heads/${GGML_BRANCH}" >/dev/null; then + echo " -> fetching ${GGML_BRANCH}" + git fetch origin "${GGML_BRANCH}:${GGML_BRANCH}" fi -if [ "$NEED_CHECKOUT" = "1" ]; then - git checkout -- . 2>/dev/null || true - git checkout "$GGML_COMMIT" - echo " -> ok, at $(git rev-parse --short=8 HEAD)" -fi - -# Apply patches. We always reset to the pinned commit before applying so -# this is fully idempotent: re-running the script never stacks patches on -# top of patches. We bail loudly on a real failure (CRLF in working -# tree, conflict, ...) instead of silently linking against unpatched ggml. -if [ ${#PATCHES[@]} -gt 0 ]; then - if [ "$NEED_CHECKOUT" = "0" ]; then - # Same commit as last run, but patches may already be applied; - # reset to pristine before re-applying. - if ! git diff --quiet || ! git diff --cached --quiet; then - echo " -> resetting ggml worktree to pristine ${GGML_COMMIT}" - git checkout -- . 
- fi +CURRENT_BRANCH="$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo '')" +if [ "$CURRENT_BRANCH" != "$GGML_BRANCH" ]; then + if ! git diff --quiet || ! git diff --cached --quiet; then + echo " -> resetting ggml worktree to pristine before switching branches" + git checkout -- . fi - for patch in "${PATCHES[@]}"; do - name="$(basename "$patch")" - # Detect whether the patch has already been applied (idempotent - # re-run of the script). `git apply --reverse --check` succeeds - # iff every hunk reverses cleanly, which only happens when the - # patch is currently applied to the working tree. - if git apply --reverse --check "$patch" 2>/dev/null; then - echo " -> $name: already applied, skipping" - continue - fi - - # Strip CR line endings from the patch on the fly. Windows checkouts - # with `core.autocrlf=true` (git's default on Windows) leave the - # patch as CRLF in the working tree even though it is LF in the - # index, and `git apply` then refuses with a context-mismatch - # error. This converts on read instead of mutating the file. - sanitized="$(mktemp)" - # shellcheck disable=SC2064 - trap "rm -f '$sanitized'" EXIT - tr -d '\r' < "$patch" > "$sanitized" - - echo " -> applying $name" - if ! git apply --check "$sanitized" 2>/tmp/setup-ggml-apply.err; then - echo " ERROR: patch '$name' does not apply against ggml@${GGML_COMMIT}." >&2 - sed 's/^/ /' /tmp/setup-ggml-apply.err >&2 - echo " Aborting so the build does not silently link unpatched ggml." >&2 - rm -f /tmp/setup-ggml-apply.err - exit 1 - fi - rm -f /tmp/setup-ggml-apply.err - git apply "$sanitized" - done + echo " -> checking out ${GGML_BRANCH}" + git checkout "$GGML_BRANCH" fi +echo " -> ok, on ${GGML_BRANCH} at $(git rev-parse --short=8 HEAD)" + echo echo "ggml is ready. Next:" echo " cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release" diff --git a/parakeet-cpp/src/main.cpp b/parakeet-cpp/src/main.cpp index 7f7e3dbeee5..48b0538c7ac 100644 --- a/parakeet-cpp/src/main.cpp +++ b/parakeet-cpp/src/main.cpp @@ -68,6 +68,14 @@ void print_usage(const char * argv0) { " patch under patches/ relaxes the upstream Adreno-only\n" " device whitelist for dev/CI parity testing). Production\n" " Adreno deployments leave both at their defaults.\n" + " --backends-dir DIR directory to scan for dynamically-loaded\n" + " ggml backend .so/.dll/.dylib files\n" + " (e.g. libspeech-ggml-vulkan.so,\n" + " libspeech-ggml-opencl.so,\n" + " libspeech-ggml-cpu-android_armv8.2_1.so).\n" + " Forwarded to ggml_backend_load_all_from_path()\n" + " on first backend init. Empty => ggml's compile-\n" + " time default search path.\n" " --opencl-cache-dir DIR persistent OpenCL kernel binary cache directory\n" " (sets $GGML_OPENCL_CACHE_DIR; consumed by\n" " patches/ggml-opencl-program-binary-cache.patch).\n" @@ -272,6 +280,14 @@ struct ExtraCliOpts { std::string opencl_device; // GGML_OPENCL_DEVICE bool opencl_disable_fusion = false; // GGML_OPENCL_DISABLE_FUSION=1 bool opencl_adreno_use_large_buffer = false; // GGML_OPENCL_ADRENO_USE_LARGE_BUFFER=1 + + // Forwarded to `parakeet::set_backends_directory()` before any + // backend init so `ggml_backend_load_all_from_path()` finds the + // `libggml-{vulkan,opencl,cpu-*}.so` files in a custom + // location. Mirrors `--opencl-cache-dir`'s "applied before + // backend init" lifetime contract. Empty => use ggml's default + // search path (`$LD_LIBRARY_PATH`, exe dir, etc.). + std::string backends_dir; }; // Apply OpenCL runtime overrides from the CLI to the process env. 
@@ -377,6 +393,8 @@ extern "C" int parakeet_cli_main(int argc, char ** argv) { opts.n_gpu_layers = std::atoi(argv[++i]); } else if (a == "--opencl-cache-dir" && i + 1 < argc) { extra.opencl_cache_dir = argv[++i]; + } else if (a == "--backends-dir" && i + 1 < argc) { + extra.backends_dir = argv[++i]; } else if (a == "--opencl-platform" && i + 1 < argc) { extra.opencl_platform = argv[++i]; } else if (a == "--opencl-device" && i + 1 < argc) { @@ -466,6 +484,14 @@ extern "C" int parakeet_cli_main(int argc, char ** argv) { // just aren't read by anything). apply_opencl_cli_env(extra); + // Same lifetime contract as the OpenCL env overrides above: + // applied before any backend init so `ggml_backend_load_all_from_path` + // runs against the requested directory on first use. Empty => + // fall back to ggml's compile-time default search path. + if (!extra.backends_dir.empty()) { + set_backends_directory(extra.backends_dir); + } + const auto t_load = clock::now(); ParakeetCtcModel model; if (int rc = load_from_gguf(opts.model_gguf_path, model, opts.n_threads, opts.n_gpu_layers, opts.verbose); rc != 0) { diff --git a/parakeet-cpp/src/parakeet_ctc.cpp b/parakeet-cpp/src/parakeet_ctc.cpp index f9d1c9b1220..62a95cf1c63 100644 --- a/parakeet-cpp/src/parakeet_ctc.cpp +++ b/parakeet-cpp/src/parakeet_ctc.cpp @@ -10,17 +10,24 @@ #include "gguf.h" #include +#include #include #include #include +#include #include #include #include +#include #include #include #include #include +#if defined(__ANDROID__) || defined(__unix__) || defined(__APPLE__) +#include +#endif + namespace parakeet { struct EncoderGraph { @@ -128,6 +135,34 @@ ggml_context * ParakeetCtcModel::weights_ctx() const { namespace { +// Backends-dir / OpenCL-cache-dir override + warning state. 
The +// setters are intended to be called by the first Engine +// construction; both are consumed once and then frozen for the rest +// of the process lifetime (the ggml-backend registry and +// $GGML_OPENCL_CACHE_DIR are both process-singleton state -- see +// comment on `ensure_backends_loaded` and the analogous note in +// `set_opencl_cache_dir`). +// +// `g_backends_loaded` is the canonical "registry already populated" +// flag, set inside `ensure_backends_loaded()` *before* the load-all +// call returns AND under the mutex so concurrent `set_*` calls +// either land their write (and have it picked up by the in-flight +// load) or atomically observe the flag and warn. We track it +// separately from `g_recorded_backends_dir` because the first +// Engine may have legitimately constructed with an empty +// `backends_dir` (default ggml search path), in which case +// `g_recorded_backends_dir` stays empty and is no longer a reliable +// "have we loaded?" sentinel -- a subsequent setter would otherwise +// silently write to `g_backends_dir`, never get re-scanned, and +// surface zero diagnostic to the caller. +std::mutex g_backends_dir_mutex; +std::string g_backends_dir; +std::string g_recorded_backends_dir; +std::string g_recorded_opencl_cache_dir; +std::atomic g_backends_loaded{false}; +std::atomic g_backends_dir_warned{false}; +std::atomic g_opencl_cache_dir_warned{false}; + // Trigger one-time discovery + load of every available ggml backend. // Idempotent: repeated calls inside the same process are no-ops once // the registry is populated. Routed through a static guard so we don't @@ -145,23 +180,92 @@ namespace { // ggml_backend_load_all() is a cheap no-op. Both modes therefore // reach the same registry walk below, matching the convention used // by llama.cpp and other ggml-based libraries. +// +// The optional backends dir comes from `set_backends_directory()` +// (typically wired from `EngineOptions::backends_dir`). 
When set and +// non-empty, the loader walks that single directory instead of the +// compile-time defaults so embedded host apps can ship the +// `libggml-{vulkan,opencl,cpu-*}.so` files in their own +// per-module folder rather than relying on `LD_LIBRARY_PATH` / +// `dlopen()` heuristics. void ensure_backends_loaded() { static const bool loaded = []() { - ggml_backend_load_all(); + std::string dir; + { + std::lock_guard lock(g_backends_dir_mutex); + dir = g_backends_dir; + g_recorded_backends_dir = g_backends_dir; + // Flip the loaded sentinel under the mutex (and *before* + // we release it for the load-all call below) so any + // concurrent setter that's about to acquire the mutex + // sees the registry as already-claimed and falls into + // its warn-once branch. Without this, a setter racing + // a first Engine construction would land its value + // *after* we already captured `dir` into the local -- + // the registry would scan against the wrong directory + // (or the default), and the second Engine would have + // no idea its override was lost. + g_backends_loaded.store(true, std::memory_order_release); + } + if (!dir.empty()) { + ggml_backend_load_all_from_path(dir.c_str()); + } else { + ggml_backend_load_all(); + } return true; }(); (void) loaded; } +// Parse the Adreno generation number from a device name / +// description string. Returns: +// - a 3-or-4-digit generation number ("Adreno (TM) 750" -> 750, +// "Adreno 830" -> 830, "Adreno 660" -> 660) +// - a synthetic 800 for the "Adreno X" naming used by +// Snapdragon X Elite parts (X1-85 / X1-45 etc.). These are +// 7xx/8xx-tier silicon with kernels that ggml-opencl supports +// and outperform Vulkan on. Mapped to 800 here so they take +// the OpenCL branch in the tier policy. +// - -1 when no Adreno marker is present (Mali, desktop GPUs, ...) 
+// +// Used to drive the OpenCL vs Vulkan tier policy below: Adreno +// 7xx/8xx/X ship OpenCL kernels that outperform Vulkan on those +// parts, while Adreno 6xx ggml-opencl is known broken (incorrect +// results). Mirrors the equivalent helper in llm-llamacpp's +// BackendSelection.cpp::parseAdrenoVersion so the two stacks reach +// the same decision on the same hardware. +int parse_adreno_version(const char * s) { + if (!s) return -1; + const char * p = strstr(s, "Adreno"); + if (!p) p = strstr(s, "adreno"); + if (!p) return -1; + p += 6; // strlen("Adreno") == strlen("adreno") == 6 + // Skip whitespace, "(TM)", punctuation; stop at first letter or digit. + while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p; + if (!*p) return -1; + // X1 / X2 ... naming for Snapdragon X Elite -> treat as 800-tier. + if (*p == 'X' || *p == 'x') { + ++p; + if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. is not Adreno-X + return 800; + } + int v = 0; + while (*p >= '0' && *p <= '9') { + v = v * 10 + (*p - '0'); + ++p; + if (v > 100000) return -1; + } + return v; +} + bool is_adreno_6xx(const char * s) { - if (!s) return false; - if (!strstr(s, "Adreno")) return false; - for (const char * q = s; *q; ++q) { - if (*q == '6' && q[1] >= '0' && q[1] <= '9' && q[2] >= '0' && q[2] <= '9') { - return true; - } - } - return false; + const int v = parse_adreno_version(s); + return v >= 600 && v < 700; +} + +bool is_adreno_700plus(const char * s) { + const int v = parse_adreno_version(s); + return v >= 700; } const char * dev_reg_name(ggml_backend_dev_t dev) { @@ -171,19 +275,42 @@ const char * dev_reg_name(ggml_backend_dev_t dev) { } +// Pick a GPU backend using the same tier policy as llm-llamacpp's +// BackendSelection: ggml-opencl is only used when an Adreno 700+ +// device is present (where its kernels are validated and faster than +// Vulkan); every other GPU (Vulkan, Metal, CUDA, Mali, Intel iGPU, +// ...) goes through the non-OpenCL preference. 
Adreno 6xx OpenCL is +// known broken (incorrect outputs) and is force-skipped unless the +// caller opts in via `PARAKEET_ALLOW_ADRENO_6XX=1`. +// +// Routed exclusively through the ggml-backend registry +// (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls +// to `ggml_backend_vulkan_init` / `ggml_backend_opencl_init` / +// `ggml_backend_metal_init` are made anywhere in parakeet — under +// the GGML_BACKEND_DL=ON build mode embedded host applications ship +// with, those entry points live in separate shared libraries that +// are dlopen()'d at runtime and are not linkable from libparakeet. +// The registry walk reaches the same backends in both modes. ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) { if (n_gpu_layers <= 0) return nullptr; ensure_backends_loaded(); - // Walk the registry in registration order and pick the first - // GPU/IGPU device. Registry order is defined by the ggml-backend - // registry's static init list (CUDA -> Metal -> Vulkan -> OpenCL - // -> ...), so this preserves the priority of the legacy direct- - // init cascade. The Adreno-6xx fallback policy stays on top: - // ggml-opencl produces incorrect results on Adreno 6xx; force- - // skip and continue the walk (or fall through to CPU) unless - // `PARAKEET_ALLOW_ADRENO_6XX=1` is set. + // Collect GPU/IGPU devices into three buckets so we can apply the + // tier policy after the walk. We keep the device handles + their + // human-readable names for both the policy decision and the final + // log line. + struct Cand { + ggml_backend_dev_t dev; + const char * name; + const char * desc; + const char * reg_name; + }; + std::vector opencl_adreno_700plus; + std::vector other_gpu; // Vulkan / Metal / CUDA / Mali / Intel / ... + std::vector opencl_other; // Non-Adreno OpenCL (e.g. 
desktop) + int max_adreno_version = -1; + const size_t n_dev = ggml_backend_dev_count(); for (size_t i = 0; i < n_dev; ++i) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); @@ -198,32 +325,72 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) { const char * reg_name = dev_reg_name(dev); const bool is_opencl = std::strcmp(reg_name, "OpenCL") == 0; - if (is_opencl && (is_adreno_6xx(name) || is_adreno_6xx(desc))) { - const char * reported = name ? name : (desc ? desc : "unknown"); - const char * override_env = getenv("PARAKEET_ALLOW_ADRENO_6XX"); - if (!override_env || override_env[0] != '1') { - if (verbose) PARAKEET_LOG_WARN( - "parakeet: OpenCL device '%s' is Adreno 6xx; " - "skipping (7xx/8xx/X1E supported, set " - "PARAKEET_ALLOW_ADRENO_6XX=1 to override)\n", - reported); - continue; + const int adreno_v = std::max(parse_adreno_version(name), + parse_adreno_version(desc)); + if (adreno_v > max_adreno_version) max_adreno_version = adreno_v; + + if (is_opencl) { + if (adreno_v >= 700) { + opencl_adreno_700plus.push_back({dev, name, desc, reg_name}); + } else if (adreno_v >= 600 && adreno_v < 700) { + const char * reported = name ? name : (desc ? desc : "unknown"); + const char * override_env = getenv("PARAKEET_ALLOW_ADRENO_6XX"); + if (!override_env || override_env[0] != '1') { + if (verbose) PARAKEET_LOG_WARN( + "parakeet: OpenCL device '%s' is Adreno 6xx; " + "skipping (7xx/8xx/X1E supported, set " + "PARAKEET_ALLOW_ADRENO_6XX=1 to override)\n", + reported); + continue; + } + if (verbose) PARAKEET_LOG_INFO( + "parakeet: PARAKEET_ALLOW_ADRENO_6XX=1 set; " + "keeping OpenCL backend on '%s' anyway\n", reported); + opencl_other.push_back({dev, name, desc, reg_name}); + } else { + opencl_other.push_back({dev, name, desc, reg_name}); } + } else { + other_gpu.push_back({dev, name, desc, reg_name}); + } + } + + // Tier policy: + // 1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan + // on Snapdragon 8 Gen 2/3/4 etc.). + // 2. 
Anything else with a non-OpenCL GPU: prefer that + // (Vulkan on all non-Adreno Android, Metal on Apple, CUDA + // on Linux/Windows desktop, Mali iGPU via Vulkan, ...). + // 3. Last resort: any other OpenCL device (e.g. desktop OpenCL + // or non-Adreno mobile when no Vulkan is registered). + auto try_init = [&](const std::vector & bucket) -> ggml_backend_t { + for (const Cand & c : bucket) { + ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr); + if (!b) continue; if (verbose) PARAKEET_LOG_INFO( - "parakeet: PARAKEET_ALLOW_ADRENO_6XX=1 set; " - "keeping OpenCL backend on '%s' anyway\n", reported); + "parakeet: using %s backend (%s)\n", + c.reg_name && *c.reg_name ? c.reg_name : "GPU", + c.name ? c.name : (c.desc ? c.desc : "unknown")); + return b; } + return nullptr; + }; - ggml_backend_t b = ggml_backend_dev_init(dev, nullptr); - if (!b) continue; - if (verbose) PARAKEET_LOG_INFO( - "parakeet: using %s backend (%s)\n", - reg_name && *reg_name ? reg_name : "GPU", - name ? name : (desc ? desc : "unknown")); - return b; + if (!opencl_adreno_700plus.empty()) { + if (ggml_backend_t b = try_init(opencl_adreno_700plus)) return b; } + if (ggml_backend_t b = try_init(other_gpu)) return b; + if (ggml_backend_t b = try_init(opencl_other)) return b; - if (verbose) PARAKEET_LOG_INFO("parakeet: no GPU backend available, falling back to CPU\n"); + if (verbose) { + if (max_adreno_version >= 600 && max_adreno_version < 700) { + PARAKEET_LOG_INFO( + "parakeet: only Adreno 6xx OpenCL detected (broken); " + "falling back to CPU\n"); + } else { + PARAKEET_LOG_INFO("parakeet: no GPU backend available, falling back to CPU\n"); + } + } return nullptr; } @@ -300,6 +467,91 @@ std::vector read_filterbank_to_vector(ggml_tensor * t) { } +void set_backends_directory(const std::string & dir) { + std::lock_guard lock(g_backends_dir_mutex); + if (g_backends_loaded.load(std::memory_order_acquire)) { + // Registry already populated for this process. 
We can't + // re-scan a different directory mid-flight (ggml's registry + // is a process-wide singleton), so log the conflict at most + // once and otherwise stay silent on subsequent identical + // sets (the common case when a host instantiates several + // Engines back-to-back from the same backends folder, or + // when the second value happens to match the recorded one). + if (dir != g_recorded_backends_dir && + !g_backends_dir_warned.exchange(true)) { + if (g_recorded_backends_dir.empty()) { + // First Engine constructed without an explicit + // backends_dir, so ggml's compile-time default + // search path was used. The current caller wanted + // a specific dir but missed the window. + PARAKEET_LOG_WARN( + "parakeet: set_backends_directory('%s') ignored -- the " + "ggml-backend registry was already populated against " + "ggml's default search path (no explicit backends_dir on " + "the first Engine). Call set_backends_directory() (or " + "construct an Engine with backends_dir set) before the " + "first Engine to influence which directory is scanned.\n", + dir.c_str()); + } else { + PARAKEET_LOG_WARN( + "parakeet: set_backends_directory('%s') ignored -- backends " + "already loaded from '%s' earlier in this process.\n", + dir.c_str(), g_recorded_backends_dir.c_str()); + } + } + return; + } + g_backends_dir = dir; +} + +void set_opencl_cache_dir(const std::string & dir) { +#if defined(__ANDROID__) + // Same "first Engine wins" contract as set_backends_directory: + // ggml-opencl reads $GGML_OPENCL_CACHE_DIR once per process at + // backend init (before the first kernel build), so a setenv + // after init is effectively a no-op on the cache binding. 
Gate + // on the shared g_backends_loaded flag because the OpenCL + // backend is registered at the same `ggml_backend_load_all*` + // call that flips the flag -- conservative because it might + // still take effect when the host hasn't yet instantiated a + // GPU device, but matches what the engine-ctor documentation + // promises and avoids the same silent-failure mode as + // set_backends_directory's previous gate. + std::lock_guard lock(g_backends_dir_mutex); + if (g_backends_loaded.load(std::memory_order_acquire)) { + if (!dir.empty() && dir != g_recorded_opencl_cache_dir && + !g_opencl_cache_dir_warned.exchange(true)) { + if (g_recorded_opencl_cache_dir.empty()) { + PARAKEET_LOG_WARN( + "parakeet: set_opencl_cache_dir('%s') ignored -- backends " + "were already loaded with no explicit OpenCL cache dir " + "earlier in this process ($GGML_OPENCL_CACHE_DIR either " + "unset or set by another consumer). Call " + "set_opencl_cache_dir() before the first Engine to take " + "effect.\n", + dir.c_str()); + } else { + PARAKEET_LOG_WARN( + "parakeet: set_opencl_cache_dir('%s') ignored -- " + "$GGML_OPENCL_CACHE_DIR already pinned to '%s' earlier in " + "this process.\n", + dir.c_str(), g_recorded_opencl_cache_dir.c_str()); + } + } + return; + } + if (dir.empty()) return; + // ggml-opencl's program-binary-cache patch reads this once per + // process at backend init (before the first kernel build). Set + // it before constructing the first Engine; later calls don't + // re-bind the cache but cost nothing. 
+ ::setenv("GGML_OPENCL_CACHE_DIR", dir.c_str(), /*overwrite=*/1); + g_recorded_opencl_cache_dir = dir; +#else + (void) dir; +#endif +} + int load_from_gguf(const std::string & gguf_path, ParakeetCtcModel & out_model, int n_threads, diff --git a/parakeet-cpp/src/parakeet_ctc.h b/parakeet-cpp/src/parakeet_ctc.h index be0f3ad42b8..32fefe2947d 100644 --- a/parakeet-cpp/src/parakeet_ctc.h +++ b/parakeet-cpp/src/parakeet_ctc.h @@ -305,6 +305,14 @@ struct ParakeetCtcModel { // `TdtConfig` + `SortformerConfig`. using ParakeetModel = ParakeetCtcModel; +// Backend init configuration. Call before the first `load_from_gguf` +// (or Engine construction) in the process. Both are no-ops once the +// ggml-backend registry has been populated (the registry is a +// process-wide singleton); see implementation comments for the +// detailed lifetime contract. +void set_backends_directory(const std::string & dir); +void set_opencl_cache_dir(const std::string & dir); + int load_from_gguf(const std::string & gguf_path, ParakeetCtcModel & out_model, int n_threads, diff --git a/parakeet-cpp/src/parakeet_engine.cpp b/parakeet-cpp/src/parakeet_engine.cpp index 50ec54baa4d..e78e19e333f 100644 --- a/parakeet-cpp/src/parakeet_engine.cpp +++ b/parakeet-cpp/src/parakeet_engine.cpp @@ -141,6 +141,21 @@ static void prewarm_encoder(ParakeetCtcModel & model, float audio_seconds) { Engine::Engine(const EngineOptions & opts) : pimpl_(std::make_unique()) { pimpl_->opts = opts; + // Apply backend-init knobs before the first ggml call. Both are + // process-singleton-scoped (the ggml-backend registry only ever + // gets populated once per process; `$GGML_OPENCL_CACHE_DIR` is + // read once by ggml-opencl at first init), so this is effectively + // a "first Engine wins" race -- a second Engine with a different + // backends_dir is logged + ignored by set_backends_directory(). + // Hosts that need per-Engine isolation should run each Engine in + // its own subprocess. 
+ if (!opts.backends_dir.empty()) { + set_backends_directory(opts.backends_dir); + } + if (!opts.opencl_cache_dir.empty()) { + set_opencl_cache_dir(opts.opencl_cache_dir); + } + const int rc = load_from_gguf(opts.model_gguf_path, pimpl_->model, opts.n_threads,