diff --git a/parakeet-cpp/.gitignore b/parakeet-cpp/.gitignore
index fa06a7c0cb4..ae20949e54e 100644
--- a/parakeet-cpp/.gitignore
+++ b/parakeet-cpp/.gitignore
@@ -1,5 +1,6 @@
 # ggml is cloned via scripts/setup-ggml.sh at a pinned commit; don't track it.
 ggml/
+ggml
 
 # Python virtualenv for the converter + reference-dump scripts.
 venv/
diff --git a/parakeet-cpp/CMakeLists.txt b/parakeet-cpp/CMakeLists.txt
index b1ae5c25fd2..eac64cc6957 100644
--- a/parakeet-cpp/CMakeLists.txt
+++ b/parakeet-cpp/CMakeLists.txt
@@ -63,6 +63,53 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android")
     endif()
 endif()
 
+# Android default backend stack: dynamic loading of Vulkan + OpenCL +
+# per-arch CPU variants. Mirrors the qvac llm-llamacpp Android config
+# (see qvac-registry-vcpkg/ports/llama-cpp/portfile.cmake) so the
+# parakeet prebuilds drop into the same `qvac__transcription-parakeet/`
+# folder shape as the llamacpp ones: a `.bare` module + sibling
+# `lib<prefix>ggml-{vulkan,opencl,cpu-android_armv*_*}.so` files that
+# `ggml_backend_load_all_from_path()` discovers at runtime.
+#
+# Selection at runtime is centralised in `init_gpu_backend()`
+# (src/parakeet_ctc.cpp): OpenCL when an Adreno 700+ device is
+# present, Vulkan for every other GPU (non-Adreno, Adreno < 700,
+# Mali, Xclipse, ...). No static GPU backend entry points are linked
+# anywhere in libparakeet; the registry walk reaches the right
+# backend in both GGML_BACKEND_DL=ON (Android prebuild) and
+# GGML_BACKEND_DL=OFF (desktop dev) modes.
+#
+# Callers with specific reasons to deviate (e.g. a bring-up build that
+# wants Vulkan only) can override any of these with -D on the cmake
+# command line or with set() in a parent project: a default is applied
+# only when the variable is not already defined (cache or normal).
+if (CMAKE_SYSTEM_NAME STREQUAL "Android")
+    if (NOT DEFINED GGML_BACKEND_DL)
+        set(GGML_BACKEND_DL ON CACHE BOOL "parakeet: Android default for ggml's GGML_BACKEND_DL")
+    endif()
+    if (NOT DEFINED GGML_CPU_ALL_VARIANTS)
+        set(GGML_CPU_ALL_VARIANTS ON CACHE BOOL "parakeet: Android default for ggml's GGML_CPU_ALL_VARIANTS")
+    endif()
+    if (NOT DEFINED GGML_CPU_REPACK)
+        set(GGML_CPU_REPACK ON CACHE BOOL "parakeet: Android default for ggml's GGML_CPU_REPACK")
+    endif()
+    if (NOT DEFINED GGML_VULKAN)
+        set(GGML_VULKAN ON CACHE BOOL "parakeet: Android default for ggml's GGML_VULKAN")
+    endif()
+    if (NOT DEFINED GGML_OPENCL)
+        set(GGML_OPENCL ON CACHE BOOL "parakeet: Android default for ggml's GGML_OPENCL")
+    endif()
+    # ggml-vulkan's coopmat / coopmat2 shader compile pulls in extensions
+    # that most Android Vulkan drivers don't expose; the upstream llama
+    # Android build disables both for the same reason.
+    if (NOT DEFINED GGML_VULKAN_DISABLE_COOPMAT)
+        set(GGML_VULKAN_DISABLE_COOPMAT ON CACHE BOOL "parakeet: Android default for ggml's GGML_VULKAN_DISABLE_COOPMAT")
+    endif()
+    if (NOT DEFINED GGML_VULKAN_DISABLE_COOPMAT2)
+        set(GGML_VULKAN_DISABLE_COOPMAT2 ON CACHE BOOL "parakeet: Android default for ggml's GGML_VULKAN_DISABLE_COOPMAT2")
+    endif()
+endif()
+
 # Two related workarounds for clang-cl / MSVC builds on Windows. Both
 # come from msys2 sneaking GCC-flavoured libraries onto CMake's search
 # paths and then being mismatched against clang-cl-compiled translation
@@ -108,6 +155,30 @@ if (WIN32 AND NOT MINGW)
     endif()
 endif()
 
+# Bundled-ggml library filename prefix. qvac-ext-ggml's `speech` branch
+# exposes `GGML_LIB_OUTPUT_PREFIX` (commit 4cec2d3a) which handles both
+# the OUTPUT_NAME rename for every ggml target (core + per-backend
+# .so/.dll/.a) AND the runtime loader's filename prefix
+# (`GGML_BACKEND_DL_PROJECT_PREFIX` compile define on ggml-base), so
+# the renamed `libspeech-ggml-{vulkan,opencl,cpu-*}.so` files are
+# actually discovered by `ggml_backend_load_all_from_path()` at
+# runtime.
+#
+# Setting `GGML_LIB_OUTPUT_PREFIX` here (as a cache variable, before
+# `add_subdirectory(ggml)`) is the supported way to override the
+# branch default (`qvac-speech-`) on a per-consumer basis without
+# editing the ggml subtree. The `speech-` prefix is shared across the
+# QVAC speech stack (whisper, parakeet, chatterbox, supertonic, ...)
+# so they can vendor a single ggml file set side-by-side without
+# colliding with the `qvac-` prefix used by the llm fork.
+if (NOT PARAKEET_USE_SYSTEM_GGML AND PARAKEET_GGML_LIB_PREFIX)
+    if (NOT DEFINED CACHE{GGML_LIB_OUTPUT_PREFIX})
+        set(GGML_LIB_OUTPUT_PREFIX "speech-"
+            CACHE STRING "ggml: prefix for built ggml library filenames (parakeet default)")
+    endif()
+    message(STATUS "parakeet: bundled ggml libraries will be emitted with prefix '${GGML_LIB_OUTPUT_PREFIX}' (set PARAKEET_GGML_LIB_PREFIX=OFF to use the qvac-ext-ggml@speech default, or override -DGGML_LIB_OUTPUT_PREFIX=<other>)")
+endif()
+
 if (NOT TARGET ggml)
     if (PARAKEET_USE_SYSTEM_GGML)
         find_package(ggml CONFIG REQUIRED)
@@ -120,50 +191,6 @@ if (NOT TARGET ggml)
     endif()
 endif()
 
-function(parakeet_apply_ggml_prefix target)
-    if (NOT TARGET ${target})
-        return()
-    endif()
-    get_target_property(_qpgp_type ${target} TYPE)
-    if (_qpgp_type STREQUAL "INTERFACE_LIBRARY" OR _qpgp_type STREQUAL "OBJECT_LIBRARY")
-        return()
-    endif()
-    get_target_property(_qpgp_old_name ${target} OUTPUT_NAME)
-    if (NOT _qpgp_old_name OR _qpgp_old_name STREQUAL "_qpgp_old_name-NOTFOUND")
-        set(_qpgp_old_name ${target})
-    endif()
-    set_target_properties(${target} PROPERTIES
-        OUTPUT_NAME "speech-${_qpgp_old_name}"
-    )
-endfunction()
-
-if (PARAKEET_GGML_LIB_PREFIX AND NOT PARAKEET_USE_SYSTEM_GGML)
-    foreach (_qpgp_target ggml ggml-base)
-        parakeet_apply_ggml_prefix(${_qpgp_target})
-    endforeach()
-    if (DEFINED GGML_AVAILABLE_BACKENDS)
-        foreach (_qpgp_target ${GGML_AVAILABLE_BACKENDS})
-            parakeet_apply_ggml_prefix(${_qpgp_target})
-        endforeach()
-    endif()
-    # Renaming the bundled backend .so/.dll files alone is not enough:
-    # ggml's runtime loader (`ggml_backend_load_best`) hard-codes the
-    # `libggml-` / `ggml-` filename prefix when scanning for backends
-    # under `GGML_BACKEND_DL=ON`. The companion patch
-    # `patches/ggml-backend-reg-filename-prefix.patch` adds a
-    # `GGML_BACKEND_DL_PROJECT_PREFIX` macro to that loader; defining
-    # it here teaches the runtime to look for our prefixed filenames
-    # instead. Otherwise the renamed .so/.dll files exist on disk but
-    # are never discovered, and Vulkan/OpenCL/CUDA backends silently
-    # fail to load.
-    if (TARGET ggml)
-        target_compile_definitions(ggml PRIVATE
-            GGML_BACKEND_DL_PROJECT_PREFIX="speech-"
-        )
-    endif()
-    message(STATUS "parakeet: bundled ggml libraries will be emitted as libspeech-ggml-* (set PARAKEET_GGML_LIB_PREFIX=OFF to keep upstream filenames)")
-endif()
-
 # Same OpenMP avoidance as for ggml above: on Windows non-MinGW builds
 # CMake's FindOpenMP picks LLVM's `-fopenmp=libomp` compile flag but
 # resolves OpenMP_*_LIBRARIES to msys2 libgomp -> link-time mismatch.
@@ -180,29 +207,25 @@ if (PARAKEET_OPENMP)
     find_package(OpenMP)
 endif()
 
-# Centralised GGML_USE_* backend defines. Anything that compiles
-# parakeet_ctc.cpp (the library target plus the standalone test
-# executables that recompile it) must link against this so the
-# `init_gpu_backend` / BLAS / CUDA / Metal / Vulkan code paths get
-# selected consistently. Without this, e.g. test-encoder would silently
-# build with the GPU branch compiled out and `--n-gpu-layers 1` would
-# be a no-op.
+# Legacy interface library kept for export-set compatibility (it is
+# still part of `install(EXPORT parakeet-cpp-targets)` below and
+# downstream `find_package(parakeet-cpp)` consumers list it as a link
+# dep). Body intentionally empty: parakeet routes every backend
+# decision through the ggml-backend registry
+# (`ggml_backend_load_all` + `ggml_backend_dev_*`, see
+# `init_gpu_backend()` / `init_cpu_backend()` / `init_blas_backend()`
+# in src/parakeet_ctc.cpp) and does NOT call any
+# `ggml_backend_<backend>_init` / `ggml_backend_is_<backend>` entry
+# point directly. The `GGML_USE_VULKAN` / `GGML_USE_OPENCL` /
+# `GGML_USE_METAL` / `GGML_USE_CUDA` / `GGML_USE_BLAS` compile defines
+# that used to live here were only consumed by `#ifdef` cascades that
+# called those static entry points; with the registry-only design
+# they're dead, and shipping them would falsely advertise a static
+# backend dependency that the GGML_BACKEND_DL=ON Android/Linux builds
+# explicitly do not have (their backends live in separately-loadable
+# `.so` files that are dlopen()'d by `ggml_backend_load_all_from_path`
+# at runtime).
 add_library(parakeet-backend-defs INTERFACE)
-if (GGML_CUDA)
-    target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_CUDA)
-endif()
-if (GGML_METAL)
-    target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_METAL)
-endif()
-if (GGML_VULKAN)
-    target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_VULKAN)
-endif()
-if (GGML_BLAS)
-    target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_BLAS)
-endif()
-if (GGML_OPENCL)
-    target_compile_definitions(parakeet-backend-defs INTERFACE GGML_USE_OPENCL)
-endif()
 
 set(PARAKEET_LIB_SOURCES
     src/parakeet_ctc.cpp
@@ -421,28 +444,14 @@ if (PARAKEET_BUILD_TESTS)
         endif()
     endfunction()
 
-    # Helper: keep the parakeet_ctc.cpp #ifdefs (BLAS / CUDA / Metal / Vulkan /
-    # OpenCL backend init) consistent across the parakeet library and any
-    # test executable that compiles parakeet_ctc.cpp from source. Without this,
-    # tests that don't link the library would always evaluate the #ifdefs as
-    # "no backend defined", producing link errors against ggml-blas / ggml-vk
-    # / ggml-opencl when the parent build did enable them.
+    # Helper: keep PARAKEET_EXPERIMENTAL_FLASH_ATTN consistent across the
+    # parakeet library and any test executable that recompiles
+    # parakeet_ctc.cpp from source. Backend selection itself goes
+    # through the ggml-backend registry (no per-backend `GGML_USE_*`
+    # #ifdef cascade in parakeet_ctc.cpp anymore -- see the comment on
+    # `parakeet-backend-defs` above), so this helper only carries the
+    # flash-attn gate plus the shared ccache launcher.
     function(parakeet_apply_backend_defs target)
-        if (GGML_BLAS)
-            target_compile_definitions(${target} PRIVATE GGML_USE_BLAS)
-        endif()
-        if (GGML_CUDA)
-            target_compile_definitions(${target} PRIVATE GGML_USE_CUDA)
-        endif()
-        if (GGML_METAL)
-            target_compile_definitions(${target} PRIVATE GGML_USE_METAL)
-        endif()
-        if (GGML_VULKAN)
-            target_compile_definitions(${target} PRIVATE GGML_USE_VULKAN)
-        endif()
-        if (GGML_OPENCL)
-            target_compile_definitions(${target} PRIVATE GGML_USE_OPENCL)
-        endif()
         if (PARAKEET_FLASH_ATTN)
             target_compile_definitions(${target} PRIVATE PARAKEET_EXPERIMENTAL_FLASH_ATTN)
         endif()
diff --git a/parakeet-cpp/include/parakeet/engine.h b/parakeet-cpp/include/parakeet/engine.h
index 9aa9f9616c6..236157e1746 100644
--- a/parakeet-cpp/include/parakeet/engine.h
+++ b/parakeet-cpp/include/parakeet/engine.h
@@ -81,6 +81,46 @@ struct EngineOptions {
 
     bool verbose     = false;
 
+    // Directory to scan for dynamically-loaded ggml backends
+    // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`,
+    // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to
+    // `ggml_backend_load_all_from_path()` on the first Engine
+    // construction in the process; subsequent constructions reuse the
+    // already-populated registry.
+    //
+    // Leave empty to fall back to ggml's default search path
+    // (`ggml_backend_load_all()`), which walks compile-time defaults
+    // (`$EXE_DIR`, `LD_LIBRARY_PATH`, ...). Embedded host applications
+    // built with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple
+    // default; see CMakeLists.txt) should pass an explicit dir
+    // because the .so files ship next to the host's binary in a
+    // platform-specific subfolder rather than on the system loader's
+    // path.
+    //
+    // No-op on builds where ggml is statically linked
+    // (`GGML_BACKEND_DL=OFF`, e.g. desktop dev cmake builds and the
+    // Apple xcframework). On those, every backend is registered at
+    // constructor time from inside libggml and no filesystem scan
+    // takes place.
+    std::string backends_dir;
+
+    // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so
+    // ggml-opencl persists compiled program binaries (reloaded through
+    // `clCreateProgramWithBinary`) across process restarts (see the
+    // program-binary-cache patch on qvac-ext-ggml@speech). Strongly
+    // recommended on Android where the cold `clBuildProgram` cost
+    // dominates first-utterance latency; pass a writable per-app
+    // directory (typically the app's `cacheDir` from the host platform).
+    //
+    // Honoured only on `__ANDROID__` builds; ignored elsewhere
+    // (desktop OpenCL platforms don't ship the binary-cache patch
+    // and would otherwise pollute the user's tmpdir).
+    //
+    // Leave empty to keep the existing `$GGML_OPENCL_CACHE_DIR` env
+    // value (or no cache at all). Wrapper scripts that already
+    // export the env take precedence.
+    std::string opencl_cache_dir;
+
     // Opt-in cold-start mitigation.
     //
     // When `prewarm == true`, the Engine constructor runs one
diff --git a/parakeet-cpp/patches/README.md b/parakeet-cpp/patches/README.md
deleted file mode 100644
index d55e53e27a4..00000000000
--- a/parakeet-cpp/patches/README.md
+++ /dev/null
@@ -1,264 +0,0 @@
-# ggml patches for parakeet.cpp
-
-`ggml` is vendored as a pristine upstream clone (see the top-level
-[`README.md`](../README.md) and [`scripts/setup-ggml.sh`](../scripts/setup-ggml.sh)),
-so the local fixes parakeet.cpp depends on live here as standalone
-patches and are applied after the clone.
-
-Three patches ship today:
-
-1. [`ggml-backend-reg-filename-prefix.patch`](#ggml-backend-reg-filename-prefixpatch)
-   — teaches `ggml_backend_load_best()` to honour a compile-time
-   `GGML_BACKEND_DL_PROJECT_PREFIX` macro, so renaming the bundled
-   backend .so/.dll files (parakeet does this to avoid colliding with
-   another consumer's `libggml-*` files in the same host process) does
-   not break runtime backend discovery under `GGML_BACKEND_DL=ON`.
-   No-op when the macro is undefined.
-2. [`ggml-opencl-allow-non-adreno.patch`](#ggml-opencl-allow-non-adrenopatch)
-   — lets the OpenCL backend bring up on commodity desktop GPUs
-   (NVIDIA, AMD, Apple) so `parakeet.cpp` can be built and parity-
-   tested with `-DGGML_OPENCL=ON` outside Adreno-only environments.
-   No-op on real Adreno targets (the patch only relaxes the rejection
-   of unknown GPU vendors and the assertion in
-   `ggml_backend_opencl_init()` when no devices were found).
-3. [`ggml-opencl-program-binary-cache.patch`](#ggml-opencl-program-binary-cachepatch)
-   — adds a persistent on-disk cache for compiled OpenCL kernel
-   binaries, removing the multi-second `clBuildProgram` wave at every
-   cold start. Honours `$GGML_OPENCL_CACHE_DIR`, with
-   `$XDG_CACHE_HOME/ggml/opencl` → `$HOME/.cache/ggml/opencl`
-   fallbacks. Opt-out via `GGML_OPENCL_CACHE_DIR=""`.
-
-`scripts/setup-ggml.sh` applies every `patches/ggml-*.patch` in
-lexicographic order; the script is idempotent and resets the ggml
-worktree to the pinned commit before applying.
-
-## Apply
-
-The top-level [`scripts/setup-ggml.sh`](../scripts/setup-ggml.sh) does
-everything for you:
-
-```bash
-# From the repo root.  Clones ggml if needed, checks out the pinned
-# commit, and applies every patch under patches/.  Idempotent --
-# re-running is a no-op.
-./scripts/setup-ggml.sh
-```
-
-Then configure + build as usual. Pick the backend flags for your
-platform; OpenCL pulls in the patch automatically:
-
-```bash
-# Apple Silicon
-cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON
-
-# NVIDIA / desktop
-cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
-
-# Vulkan (anything else)
-cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON
-
-# OpenCL: Adreno (Android) target
-cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_OPENCL=ON
-
-# OpenCL: NVIDIA / AMD / Apple desktop (dev / CI parity testing) --
-# Adreno-tuned matmul kernels OFF, generic OpenCL paths only:
-cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \
-    -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=OFF
-```
-
-If you'd rather run the steps by hand (e.g. to pin a different
-upstream commit), the script is effectively:
-
-```bash
-git clone https://github.com/ggml-org/ggml.git ggml
-cd ggml && git checkout $GGML_COMMIT
-git apply ../patches/ggml-backend-reg-filename-prefix.patch
-git apply ../patches/ggml-opencl-allow-non-adreno.patch
-git apply ../patches/ggml-opencl-program-binary-cache.patch
-```
-
-`GGML_COMMIT` lives at the top of `scripts/setup-ggml.sh` as the
-single source of truth -- bump it when re-generating the patches
-against a newer upstream ggml. To confirm everything applied
-cleanly:
-
-```bash
-(cd ggml && git status --short)
-# Expected: 2 modified files
-#   ggml/src/ggml-backend-reg.cpp     (filename-prefix patch)
-#   ggml/src/ggml-opencl/ggml-opencl.cpp  (both OpenCL patches stack on this file)
-```
-
-CPU / CUDA / Metal / Vulkan builds get the pinned commit and the
-filename-prefix patch (which is a strict no-op when the host
-project does not define `GGML_BACKEND_DL_PROJECT_PREFIX`); the
-OpenCL changes are no-op for every other backend.
-
-## `ggml-backend-reg-filename-prefix.patch`
-
-Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09).
-
-Adds a single compile-time switch
-`GGML_BACKEND_DL_PROJECT_PREFIX` to `ggml_backend_load_best()` so
-the runtime backend-discovery walk can be retargeted at the
-filename prefix used by a host project that renames the bundled
-`libggml-*` files to avoid colliding with another consumer's
-`libggml-*` files in the same host process.
-
-Background: parakeet ships its bundled ggml backends as
-`libspeech-ggml-*.{so,dll}` (CMake option
-`PARAKEET_GGML_LIB_PREFIX=ON`, default) so a host process that
-loads two consumers each vendoring its own ggml does not see a
-name clash on `libggml-vulkan.so` / `libggml-cuda.so` / etc. The
-`speech-` prefix is shared with the rest of the QVAC speech stack
-(whisper, parakeet, chatterbox, supertonic, ...) so the family
-co-vendors a single ggml file set.
-Without this patch, the rename works at link time but
-`ggml_backend_load_best()` still searches for `libggml-*.so` /
-`ggml-*.dll`, so under `GGML_BACKEND_DL=ON` the renamed files are
-on disk but never discovered and Vulkan/OpenCL/CUDA backends
-silently fail to load.
-
-| Symptom | Root cause | What this patch does |
-|---------|-----------|----------------------|
-| `speech-ggml-vulkan.so` (etc.) is on disk but ggml's loader never picks it up under `GGML_BACKEND_DL=ON` | `backend_filename_prefix()` hard-codes `libggml-` / `ggml-` and `ggml_backend_load_best` filters directory entries by that fixed prefix | Honour an optional compile-time `GGML_BACKEND_DL_PROJECT_PREFIX` string literal (e.g. `"speech-"`); when defined, the loader searches for `lib<prefix>ggml-*` / `<prefix>ggml-*` instead. Macro undefined ⇒ behaviour byte-equal to upstream. |
-
-The CMake side wires the macro from `PARAKEET_GGML_LIB_PREFIX`:
-when that option is on (the default), parakeet's top-level
-`CMakeLists.txt` does
-`target_compile_definitions(ggml PRIVATE GGML_BACKEND_DL_PROJECT_PREFIX="speech-")`
-on the `ggml` target (which is what compiles
-`ggml-backend-reg.cpp`). Consumers that prefer the upstream
-filenames (system ggml, single-consumer hosts) configure with
-`-DPARAKEET_GGML_LIB_PREFIX=OFF` and the macro stays undefined,
-so the loader behaviour matches stock ggml exactly.
-
-## `ggml-opencl-allow-non-adreno.patch`
-
-Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09).
-
-Fixes two gaps in `ggml-opencl` that make `-DGGML_OPENCL=ON` builds of
-`parakeet.cpp` impossible to bring up outside an Adreno-only
-environment:
-
-| Symptom                                                                                                | Root cause in `ggml-opencl`                                                                                                                                                                                                                                                                                            | What this patch does                                                                                                                                                                                                          |
-|--------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Every NVIDIA / AMD / Apple OpenCL device is dropped at init with `Unsupported GPU: <device-name>`      | `ggml_cl2_init()` whitelists `Adreno` / `Qualcomm` / `Intel` and returns `nullptr` for everything else. Even with `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`, a non-Adreno GPU never reaches the generic kernels.                                                                                                           | Default behaviour is byte-equal to upstream (still returns `nullptr`). Set `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` to opt the device through with `GPU_FAMILY::UNKNOWN`; we additionally require `cl_intel_required_subgroup_size` *or* `cl_qcom_reqd_sub_group_size` (the matmul-vec kernels need one to define `N_DST`/`N_SIMDGROUP`/`N_SIMDWIDTH`), so AMD/NVIDIA still fall back to host instead of crashing in `clBuildProgram`. |
-| `parakeet --n-gpu-layers 1` aborts with `GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg))` when zero usable devices were found | `ggml_backend_opencl_init()` calls `ggml_backend_reg_dev_get(reg, 0)` unconditionally. When the device discovery cleared the list (e.g. only an unsupported GPU was present), `dev_get(0)` asserts and the host process aborts. parakeet's `init_gpu_backend()` cascade expects a nullable result so it can fall back. | Check `ggml_backend_reg_dev_count(reg) == 0` before `dev_get` and return `nullptr` on empty. Also propagate `nullptr` when `ggml_cl2_init()` rejects the device, so the host-side fallback path actually runs.                |
-
-The patch is **strictly additive** for real Adreno targets:
-`gpu_family == ADRENO` is computed exactly as before, the Adreno
-shuffle / large-buffer paths still trigger when (and only when) the
-device is Adreno, and without `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` the
-non-Adreno reject path is byte-equal to upstream so production Android
-builds get the same compile-time guarantees as before.
-
-The intended audience for the patch is:
-
-  * `parakeet.cpp` developers running CI on Intel iGPU desktop
-    hardware (the matmul-vec kernels gate on
-    `cl_intel_required_subgroup_size`, so Intel iGPU is the only
-    desktop class that can actually execute the OpenCL kernels;
-    AMD/NVIDIA users get a clean CPU fallback instead of crashing
-    inside `clBuildProgram`).
-  * Anyone who wants to reproduce the OpenCL backend's mel/encoder
-    parity numbers without an Adreno device.
-
-Opt-in is gated behind `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1` so misconfigured
-production builds still get the same explicit `Unsupported GPU` error
-upstream returned, instead of a silent "running with an untested GPU".
-
-It is **not** intended to ship a fast OpenCL path on NVIDIA / AMD /
-Apple desktops (CUDA / Vulkan / Metal are far better suited there);
-its only purpose is bring-up + parity testing.
-
-## `ggml-opencl-program-binary-cache.patch`
-
-Base commit: `58c38058` (`sync : llama.cpp`, 2026-04-09).
-
-Adds a persistent on-disk cache for compiled OpenCL kernel binaries
-to `ggml-opencl`. Upstream `build_program_from_source()` calls
-`clCreateProgramWithSource` + `clBuildProgram` on every cold start,
-re-paying the driver's shader-compile wave (multiple seconds on
-Adreno / Mesa / Mali; tens of ms on most desktop drivers). This
-patch drops the call to `clCreateProgramWithBinary` against a
-device-specific cache blob whenever one exists, and persists every
-freshly-compiled program back to disk on miss.
-
-| Symptom                                                                                | Root cause                                                                              | What this patch does                                                                                              |
-|----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|
-| Every cold-start `parakeet --n-gpu-layers 1` re-compiles all 88 OpenCL kernels    | `build_program_from_source` always calls `clCreateProgramWithSource` + `clBuildProgram` | Look up `<cache_dir>/<key>.bin` first via `clCreateProgramWithBinary`; only fall through to source compile on miss |
-| Hosts already `setenv` `GGML_OPENCL_CACHE_DIR` for the same goal, but ggml-opencl ignores it | The env var is read **nowhere** in upstream ggml-opencl at this commit  | Resolves cache dir from `$GGML_OPENCL_CACHE_DIR` → `$XDG_CACHE_HOME/ggml/opencl` → `$HOME/.cache/ggml/opencl`, so the env-var contract takes effect. |
-
-### Cache key
-
-`<src_hash>_<opts_hash>_<driver_hash>_<dev_name_hash>_<dev_ver_hash>.bin`,
-where each component is FNV-1a-64. Each kernel's `program_buffer`
-hashes independently (88 different cache files per device); a
-driver upgrade or moving to a different device silently invalidates
-the cache because either `driver_hash` or `dev_*_hash` changes.
-There is no manual invalidation step.
-
-### Atomic writes
-
-The cache writer dumps `getProgramInfo(CL_PROGRAM_BINARIES)` to
-`<path>.tmp` then `rename(2)`s into place. POSIX rename is atomic,
-so concurrent processes can't read a half-written file; the
-last-writer-wins result is fine because each blob is independently
-valid for the same `(src, opts, driver, dev)` combination.
-
-### Footprint
-
-Each kernel binary lands at ~10-200 KB on Adreno (driver-dependent);
-88 kernels × ~50 KB average ≈ 4-5 MB on disk per device per process
-family. No size cap on disk today -- if it ever becomes a concern
-on tightly-budgeted mobile installs, wrap the writer with a
-ceiling.
-
-### Opt-out / disable
-
-`GGML_OPENCL_CACHE_DIR=""` (literal empty string) short-circuits
-both the read and the write paths and runs the original
-source-compile route. Useful for benchmarking the cold-start cost,
-or in a CI runner that wants every run to re-compile.
-
-When the cache dir resolves but `mkdir -p` fails (read-only
-filesystem, permissions, ...), the writer logs nothing and falls
-through to source compile silently -- no behavioural difference
-versus running with the patch absent.
-
-### Stale-cache handling
-
-`clCreateProgramWithBinary` can return `CL_INVALID_BINARY` (or the
-subsequent `clBuildProgram` can fail) when the on-disk blob is
-stale (driver upgrade, different shader IR version, mismatched
-device). The patch handles every such failure by releasing the
-program and falling through to source compile. The next run then
-overwrites the bad blob.
-
-### Measured impact
-
-This patch is **not yet benchmarked on a real Adreno device**: the
-benchmark hosts the patch was developed on are NVIDIA-only, and
-NVIDIA's OpenCL driver lacks the fp16 / OpenCL C 2.0 features
-ggml-opencl mandates -- the kernels never compile at all there, so
-there is nothing to cache. Expected impact:
-
-  * **Cold start (no cache)**: same as upstream -- multi-second
-    shader compile wave on Adreno.
-  * **Warm cache** (any subsequent invocation): saves the entire
-    `clBuildProgram` wave; typical Adreno saving is multiple
-    seconds per process.
-
-Once Adreno hardware is available for follow-up benchmarking, the
-expected bench shape is the standard pipeline-cache curve:
-cold ≫ ggml-warm ≈ both-warm.
-
-## Dropping the patches
-
-If upstream ggml-opencl decides to relax the GPU-vendor whitelist
-itself, or ships its own kernel binary cache, delete the patch
-file(s) and remove the corresponding entry from the `PATCHES=(…)`
-glob in `scripts/setup-ggml.sh`. The C++ side of parakeet uses
-only ops that ggml-opencl already supports natively (per the
-op-coverage audit), so nothing else needs to change.
diff --git a/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch b/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch
deleted file mode 100644
index e5e824e592c..00000000000
--- a/parakeet-cpp/patches/ggml-backend-reg-filename-prefix.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-diff --git a/src/ggml-backend-reg.cpp b/src/ggml-backend-reg.cpp
---- a/src/ggml-backend-reg.cpp
-+++ b/src/ggml-backend-reg.cpp
-@@ -442,12 +442,31 @@ static std::string get_executable_path() {
- #endif
- }
- 
-+// parakeet patch: allow consuming projects to override the backend
-+// shared-library filename prefix at compile time. Without this, the
-+// loader hard-codes "ggml-" (Windows) / "libggml-" (other), so two
-+// addons that vendor different ggml versions and rename their bundled
-+// backend .so/.dll files to avoid filename collisions still cannot be
-+// loaded with `GGML_BACKEND_DL=ON`: the discovery walk in
-+// `ggml_backend_load_best` only matches the unprefixed names. Define
-+// `GGML_BACKEND_DL_PROJECT_PREFIX` (a string literal, e.g.
-+// "speech-") at compile time and the loader will instead search for
-+// "<prefix>ggml-*" / "lib<prefix>ggml-*". Default behaviour (macro
-+// undefined) is byte-equal to upstream.
- static fs::path backend_filename_prefix() {
-+#if defined(GGML_BACKEND_DL_PROJECT_PREFIX)
-+#ifdef _WIN32
-+    return fs::u8path(GGML_BACKEND_DL_PROJECT_PREFIX "ggml-");
-+#else
-+    return fs::u8path("lib" GGML_BACKEND_DL_PROJECT_PREFIX "ggml-");
-+#endif
-+#else
- #ifdef _WIN32
-     return fs::u8path("ggml-");
- #else
-     return fs::u8path("libggml-");
- #endif
-+#endif
- }
- 
- static fs::path backend_filename_extension() {
diff --git a/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch b/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch
deleted file mode 100644
index 458c10f8768..00000000000
--- a/parakeet-cpp/patches/ggml-opencl-allow-non-adreno.patch
+++ /dev/null
@@ -1,91 +0,0 @@
-diff --git a/src/ggml-opencl/ggml-opencl.cpp b/src/ggml-opencl/ggml-opencl.cpp
-index 6f3fc588..96942915 100644
---- a/src/ggml-opencl/ggml-opencl.cpp
-+++ b/src/ggml-opencl/ggml-opencl.cpp
-@@ -3020,9 +3020,57 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
-     } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
-         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
-     } else {
--        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
-+        // parakeet patch: upstream ggml-opencl rejects any GPU that is
-+        // not Adreno/Qualcomm or Intel. Parakeet's real OpenCL deployment
-+        // target is Adreno (Android); for desktop dev/CI parity on Intel
-+        // iGPUs we let the device through with `gpu_family = UNKNOWN`
-+        // when the host opts in via `GGML_OPENCL_ALLOW_UNKNOWN_GPU=1`.
-+        //
-+        // Default (env var unset) preserves upstream behaviour byte-equal,
-+        // so production Adreno builds get no behavioural change and a
-+        // misconfigured non-Adreno consumer gets the same clear error as
-+        // before instead of crashing later in kernel-compile.
-+        //
-+        // The matmul-vec kernels (mul_mv_q4_0_f32_v.cl etc.) auto-define
-+        // INTEL_GPU / ADRENO_GPU based on `cl_intel_required_subgroup_size`
-+        // / `cl_qcom_reqd_sub_group_size`. Without one of those extensions
-+        // the kernel source has no way to define N_DST / N_SIMDGROUP /
-+        // N_SIMDWIDTH and `clBuildProgram` aborts the host process. So we
-+        // additionally require one of those two extensions before letting
-+        // the device through; AMD/NVIDIA desktop drivers expose neither
-+        // and now fall back cleanly to CPU instead of crashing.
-+        const char * allow = getenv("GGML_OPENCL_ALLOW_UNKNOWN_GPU");
-+        if (!allow || allow[0] != '1') {
-+            GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
-+            backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-+            return nullptr;
-+        }
-+
-+        size_t ext_size = 0;
-+        clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_size);
-+        std::string ext;
-+        if (ext_size > 0) {
-+            ext.resize(ext_size);
-+            clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_size, ext.data(), NULL);
-+        }
-+        const bool has_intel_sg = ext.find("cl_intel_required_subgroup_size") != std::string::npos;
-+        const bool has_qcom_sg  = ext.find("cl_qcom_reqd_sub_group_size")     != std::string::npos;
-+        if (!has_intel_sg && !has_qcom_sg) {
-+            GGML_LOG_ERROR("ggml_opencl: GPU '%s' has neither cl_intel_required_subgroup_size "
-+                "nor cl_qcom_reqd_sub_group_size; matmul-vec kernels cannot define "
-+                "N_DST/N_SIMDGROUP/N_SIMDWIDTH and clBuildProgram would abort. "
-+                "Falling back to host (parakeet patch).\n",
-+                dev_ctx->device_name.c_str());
-+            backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-+            return nullptr;
-+        }
-+
-+        GGML_LOG_WARN("ggml_opencl: GPU '%s' is not Adreno/Qualcomm or Intel; "
-+                      "running with generic OpenCL kernels (parakeet patch + "
-+                      "GGML_OPENCL_ALLOW_UNKNOWN_GPU=1). "
-+                      "Adreno-specific kernels and large-buffer paths stay off.\n",
-+                      dev_ctx->device_name.c_str());
-         backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
--        return nullptr;
-     }
- 
- #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-@@ -4075,8 +4123,25 @@ static ggml_backend_i ggml_backend_opencl_i = {
- };
- 
- ggml_backend_t ggml_backend_opencl_init(void) {
--    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
-+    // parakeet patch: bail out cleanly when the OpenCL backend
-+    // discovery saw zero usable devices. Upstream calls
-+    // ggml_backend_reg_dev_get() unconditionally, which asserts on an
-+    // empty device list. Parakeet's host code expects a nullable result
-+    // from ggml_backend_opencl_init() (it falls back to CPU when the
-+    // returned backend is null); the assertion makes that fallback path
-+    // unreachable on hosts where ggml-opencl can't find any GPU it
-+    // accepts (Adreno-only environments without an Adreno device,
-+    // headless CI runners, etc.).
-+    ggml_backend_reg_t reg = ggml_backend_opencl_reg();
-+    if (ggml_backend_reg_dev_count(reg) == 0) {
-+        return nullptr;
-+    }
-+
-+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
-     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
-+    if (backend_ctx == nullptr) {
-+        return nullptr;
-+    }
- 
-     ggml_backend_t backend = new ggml_backend {
-         /* .guid    = */ ggml_backend_opencl_guid(),
diff --git a/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch b/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch
deleted file mode 100644
index bdf15bf2169..00000000000
--- a/parakeet-cpp/patches/ggml-opencl-program-binary-cache.patch
+++ /dev/null
@@ -1,269 +0,0 @@
-diff --git a/src/ggml-opencl/ggml-opencl.cpp b/src/ggml-opencl/ggml-opencl.cpp
-index 96942915..7c2e4bc2 100644
---- a/src/ggml-opencl/ggml-opencl.cpp
-+++ b/src/ggml-opencl/ggml-opencl.cpp
-@@ -20,6 +20,7 @@
- 
- #include <cstddef>
- #include <cstdint>
-+#include <cstdio>
- #include <fstream>
- #include <vector>
- #include <string>
-@@ -29,6 +30,32 @@
- #include <charconv>
- #include <mutex>
- 
-+// parakeet patch: persistent kernel binary cache support. The
-+// helpers below sit on POSIX file primitives (mkdir/unlink/fsync) but
-+// also need to build on MinGW / MSVC where those names map to the
-+// `_`-prefixed Windows variants and mkdir takes a single argument.
-+// Wrap them in parakeet_* macros so the rest of the patch stays
-+// platform-agnostic.
-+#include <cerrno>
-+#include <fcntl.h>
-+#include <sys/stat.h>
-+#ifdef _WIN32
-+#  include <direct.h>
-+#  include <io.h>
-+#  define parakeet_mkdir(path)   _mkdir(path)
-+#  define parakeet_unlink(path)  _unlink(path)
-+#  define parakeet_open_ro(path) _open((path), _O_RDONLY | _O_BINARY)
-+#  define parakeet_close(fd)     _close(fd)
-+#  define parakeet_fsync(fd)     _commit(fd)
-+#else
-+#  include <unistd.h>
-+#  define parakeet_mkdir(path)   mkdir((path), 0755)
-+#  define parakeet_unlink(path)  unlink(path)
-+#  define parakeet_open_ro(path) open((path), O_RDONLY)
-+#  define parakeet_close(fd)     close(fd)
-+#  define parakeet_fsync(fd)     fsync(fd)
-+#endif
-+
- #undef MIN
- #undef MAX
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
-@@ -755,6 +782,193 @@ inline std::string read_file(const std::string &path) {
-   return text;
- }
- 
-+// parakeet patch: persistent OpenCL kernel-binary cache.
-+// ggml-opencl as shipped at this commit JIT-compiles every embedded
-+// kernel via `clBuildProgram(clCreateProgramWithSource)` on each cold
-+// start. On Adreno that's tens of seconds of shader compile per
-+// process invocation; on Mesa / Mali / iGPU drivers it's similar.
-+// This patch caches the device-specific compiled binaries under
-+// `$GGML_OPENCL_CACHE_DIR` (or `$XDG_CACHE_HOME/ggml/opencl` →
-+// `$HOME/.cache/ggml/opencl` fallback) keyed on a 64-bit FNV-1a hash of
-+// (source + compile_opts + driver_version + device_name + ggml_commit).
-+// Cache hit -> `clCreateProgramWithBinary`; miss / corrupted blob ->
-+// fall through to source compile and write the resulting binary back.
-+//
-+// The opt-out path is `GGML_OPENCL_CACHE_DIR=""` (empty string) which
-+// short-circuits the cache and runs the original source path. With no
-+// cache directory writable, the helper logs a warning and falls
-+// through to source compile silently.
-+//
-+// Hosts that already `setenv("GGML_OPENCL_CACHE_DIR", ...)` to point
-+// the runtime at a writable location (typical pattern on Android
-+// Adreno deployments) get the cache for free; this patch makes that
-+// env-var contract take effect rather than being ignored upstream.
-+
-+static uint64_t fnv1a_hash64(const void * data, size_t n) {
-+    const uint8_t * p = static_cast<const uint8_t *>(data);
-+    uint64_t h = 0xcbf29ce484222325ULL;
-+    for (size_t i = 0; i < n; ++i) {
-+        h ^= p[i];
-+        h *= 0x100000001b3ULL;
-+    }
-+    return h;
-+}
-+
-+static std::string opencl_cache_dir(cl_device_id dev) {
-+    const char * env = getenv("GGML_OPENCL_CACHE_DIR");
-+    if (env && *env == '\0') return ""; // explicit opt-out: empty string
-+    if (env && *env != '\0') return env;
-+    if (const char * xdg = getenv("XDG_CACHE_HOME"); xdg && *xdg) {
-+        return std::string(xdg) + "/ggml/opencl";
-+    }
-+    if (const char * home = getenv("HOME"); home && *home) {
-+        return std::string(home) + "/.cache/ggml/opencl";
-+    }
-+    GGML_UNUSED(dev);
-+    return ""; // no plausible default; opt out gracefully
-+}
-+
-+static bool opencl_mkdir_p(const std::string & path) {
-+    // Lightweight `mkdir -p` without C++17 <filesystem> dep on the
-+    // ggml-opencl side (some downstream consumers compile against
-+    // libstdc++ versions where std::filesystem requires linking
-+    // -lstdc++fs explicitly). Returns true if the directory exists
-+    // afterwards.
-+    if (path.empty()) return false;
-+    std::string cur;
-+    cur.reserve(path.size());
-+    for (size_t i = 0; i <= path.size(); ++i) {
-+        const char c = i < path.size() ? path[i] : '/';
-+        if ((c == '/' || c == '\\') && !cur.empty()) {
-+            if (parakeet_mkdir(cur.c_str()) != 0 && errno != EEXIST) {
-+                return false;
-+            }
-+        }
-+        if (i < path.size()) cur.push_back(c);
-+    }
-+    return true;
-+}
-+
-+static std::string opencl_cache_key(const char * program_buffer,
-+                                    size_t program_size,
-+                                    const std::string & compile_opts,
-+                                    cl_device_id dev) {
-+    // Combine source + opts + device + driver into the cache key so a
-+    // driver bump or a different SoC reuses different blobs. We hash
-+    // each component separately and combine to avoid pathological
-+    // FNV behaviour on long buffers.
-+    uint64_t h_src    = fnv1a_hash64(program_buffer, program_size);
-+    uint64_t h_opts   = fnv1a_hash64(compile_opts.data(), compile_opts.size());
-+
-+    // Driver version + device name + OpenCL C version pinpoint the
-+    // driver instance the binary was emitted by. Pinpointing too
-+    // tightly is a feature: a driver bump silently invalidates the
-+    // cache, exactly the policy you want.
-+    char driver_buf[256] = {0};
-+    char devname_buf[256] = {0};
-+    char devver_buf[256]  = {0};
-+    size_t n;
-+    clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_buf) - 1, driver_buf, &n);
-+    clGetDeviceInfo(dev, CL_DEVICE_NAME,    sizeof(devname_buf) - 1, devname_buf, &n);
-+    clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(devver_buf) - 1,  devver_buf, &n);
-+    uint64_t h_drv    = fnv1a_hash64(driver_buf,  strlen(driver_buf));
-+    uint64_t h_dev    = fnv1a_hash64(devname_buf, strlen(devname_buf));
-+    uint64_t h_devver = fnv1a_hash64(devver_buf,  strlen(devver_buf));
-+
-+    // Five 16-char hex tokens + 4 underscores + ".bin" + NUL = 89 bytes.
-+    // Use PRIx64 + (uint64_t) so the format-spec width is correct on
-+    // both LP64 (Linux/Android) and LLP64 (Windows MinGW/MSVC) where
-+    // `unsigned long` is 32 bits and `%016lx` would silently truncate
-+    // the upper half of each FNV hash.
-+    char buf[128];
-+    std::snprintf(buf, sizeof(buf),
-+                  "%016" PRIx64 "_%016" PRIx64 "_%016" PRIx64
-+                  "_%016" PRIx64 "_%016" PRIx64 ".bin",
-+                  h_src, h_opts, h_drv, h_dev, h_devver);
-+    return buf;
-+}
-+
-+static cl_program opencl_build_program_with_cache(cl_context ctx,
-+                                                  cl_device_id dev,
-+                                                  const char * program_buffer,
-+                                                  size_t program_size,
-+                                                  const std::string & compile_opts,
-+                                                  const std::string & cache_dir,
-+                                                  const std::string & key) {
-+    if (cache_dir.empty() || key.empty()) return nullptr;
-+    const std::string path = cache_dir + "/" + key;
-+    std::ifstream ifs(path, std::ios::binary);
-+    if (!ifs) return nullptr;
-+    ifs.seekg(0, std::ios::end);
-+    const std::streamsize n = ifs.tellg();
-+    if (n <= 0) return nullptr;
-+    ifs.seekg(0, std::ios::beg);
-+    std::vector<unsigned char> blob((size_t) n);
-+    if (!ifs.read(reinterpret_cast<char*>(blob.data()), n)) return nullptr;
-+
-+    cl_int err1 = CL_SUCCESS, err2 = CL_SUCCESS;
-+    const unsigned char * data = blob.data();
-+    const size_t len = blob.size();
-+    cl_program p = clCreateProgramWithBinary(ctx, 1, &dev, &len, &data, &err1, &err2);
-+    if (err1 != CL_SUCCESS || err2 != CL_SUCCESS || !p) {
-+        if (p) clReleaseProgram(p);
-+        return nullptr;
-+    }
-+    if (clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL) != CL_SUCCESS) {
-+        clReleaseProgram(p);
-+        return nullptr;
-+    }
-+    GGML_UNUSED(program_buffer);
-+    GGML_UNUSED(program_size);
-+    return p;
-+}
-+
-+static void opencl_save_program_binary(cl_program p, cl_device_id /*dev*/,
-+                                       const std::string & cache_dir,
-+                                       const std::string & key) {
-+    if (cache_dir.empty() || key.empty()) return;
-+    if (!opencl_mkdir_p(cache_dir)) return;
-+
-+    size_t bin_size = 0;
-+    if (clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(size_t),
-+                         &bin_size, nullptr) != CL_SUCCESS || bin_size == 0) return;
-+    std::vector<unsigned char> blob(bin_size);
-+    unsigned char * blob_ptr = blob.data();
-+    if (clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(unsigned char *),
-+                         &blob_ptr, nullptr) != CL_SUCCESS) return;
-+
-+    // Atomic write: tmp + fsync + rename. Without the fsync the kernel
-+    // can flush blocks out of order on power loss, leaving the renamed
-+    // file pointing at zero/garbage data and forcing the next process
-+    // into the source-compile fallback (and the bad blob lives forever
-+    // unless explicitly invalidated).
-+    const std::string final_path = cache_dir + "/" + key;
-+    const std::string tmp_path   = final_path + ".tmp";
-+    {
-+        std::ofstream ofs(tmp_path, std::ios::binary);
-+        if (!ofs) return;
-+        ofs.write(reinterpret_cast<const char*>(blob.data()), (std::streamsize) blob.size());
-+        ofs.close();
-+        if (!ofs) { parakeet_unlink(tmp_path.c_str()); return; }
-+    }
-+    {
-+        int fd = parakeet_open_ro(tmp_path.c_str());
-+        if (fd >= 0) {
-+            parakeet_fsync(fd);
-+            parakeet_close(fd);
-+        }
-+    }
-+    // Windows rename() refuses to overwrite an existing destination, so
-+    // unlink it first. POSIX rename is atomic and replaces silently;
-+    // the redundant unlink there is a no-op when the target is missing.
-+#ifdef _WIN32
-+    parakeet_unlink(final_path.c_str());
-+#endif
-+    if (rename(tmp_path.c_str(), final_path.c_str()) != 0) {
-+        parakeet_unlink(tmp_path.c_str());
-+    }
-+}
-+
- static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
-     cl_program p;
-     char *program_log;
-@@ -764,6 +978,17 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
- 
-     program_size = strlen(program_buffer);
- 
-+    // parakeet patch: try the persistent cache first.
-+    const std::string cache_dir = opencl_cache_dir(dev);
-+    const std::string cache_key = cache_dir.empty()
-+        ? std::string()
-+        : opencl_cache_key(program_buffer, program_size, compile_opts, dev);
-+    if (cl_program cached = opencl_build_program_with_cache(
-+            ctx, dev, program_buffer, program_size, compile_opts,
-+            cache_dir, cache_key)) {
-+        return cached;
-+    }
-+
-     p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
-     if(err < 0) {
-         GGML_LOG_ERROR("OpenCL error creating program");
-@@ -781,6 +1006,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
-         exit(1);
-     }
- 
-+    // parakeet patch: save the freshly compiled binary. Fast path
-+    // (cache hit) above avoids re-compiling next time. Failures here
-+    // are non-fatal -- next process just re-pays the compile cost.
-+    opencl_save_program_binary(p, dev, cache_dir, cache_key);
-+
-     return p;
- }
- 
diff --git a/parakeet-cpp/scripts/setup-ggml.sh b/parakeet-cpp/scripts/setup-ggml.sh
index 69674f5e577..dccf0c36dec 100644
--- a/parakeet-cpp/scripts/setup-ggml.sh
+++ b/parakeet-cpp/scripts/setup-ggml.sh
@@ -1,113 +1,63 @@
 #!/usr/bin/env bash
-# Clone ggml into ./ggml at the commit this repo is pinned against, and
-# apply every patch under patches/ in lexicographic order.  Idempotent:
-# safe to re-run.
+# Clone qvac-ext-ggml into ./ggml on the branch this repo is pinned against.
+# Idempotent: safe to re-run.
 #
-# Update GGML_COMMIT here whenever the pin is bumped; this file is the
-# single source of truth for which upstream ggml parakeet.cpp builds
+# Update GGML_URL / GGML_BRANCH here whenever the pin is bumped; this file
+# is the single source of truth for which ggml fork parakeet.cpp builds
 # against.
 #
-# Patches we ship today:
-#   patches/ggml-backend-reg-filename-prefix.patch
-#       Teaches ggml_backend_load_best() to honour a compile-time
-#       GGML_BACKEND_DL_PROJECT_PREFIX macro so renaming the bundled
-#       backend .so/.dll files (PARAKEET_GGML_LIB_PREFIX=ON, the default,
-#       emits libspeech-ggml-*.so) does not break runtime backend
-#       discovery under GGML_BACKEND_DL=ON. No-op when the macro is
-#       undefined.
-#   patches/ggml-opencl-allow-non-adreno.patch
-#       Lets the ggml-opencl backend run on non-Adreno/Intel GPUs
-#       (NVIDIA, AMD, Apple) so the build can be parity-tested on
-#       commodity desktop hardware. Real Adreno deployments build with
-#       the patch applied as a no-op (Adreno path is unchanged).
-#   patches/ggml-opencl-program-binary-cache.patch
-#       Persistent OpenCL kernel binary cache via clCreateProgramWithBinary +
-#       CL_PROGRAM_BINARIES. Removes seconds of cold-start shader compile on
-#       every Adreno / Mesa / Mali / iGPU launch by serialising compiled
-#       kernels under $GGML_OPENCL_CACHE_DIR (or XDG/HOME fallback).
-#       See patches/README.md for the full rationale.
+# qvac-ext-ggml's `speech` branch carries the equivalents of the patches
+# that used to live under patches/ggml-*.patch (backend-reg filename
+# prefix, opencl non-Adreno support, opencl program binary cache). The
+# script therefore does not apply local patches anymore.
 
 set -euo pipefail
 
-GGML_COMMIT="58c38058"
-GGML_URL="https://github.com/ggml-org/ggml.git"
+GGML_URL="https://github.com/tetherto/qvac-ext-ggml.git"
+GGML_BRANCH="speech"
 
 REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 cd "$REPO_ROOT"
 
-echo "parakeet.cpp: setting up ggml at pinned commit ${GGML_COMMIT}"
+echo "parakeet.cpp: setting up ggml from ${GGML_URL} (branch: ${GGML_BRANCH})"
 
-if [ ! -d ggml/.git ]; then
-    echo "  -> cloning ${GGML_URL}"
-    git clone "$GGML_URL" ggml
+if [ -e ggml ] && [ ! -d ggml/.git ]; then
+    if [ -L ggml ]; then
+        echo "  -> ggml is a symlink to '$(readlink ggml)'; leaving it alone"
+        echo "     (delete the symlink and re-run this script to clone fresh)"
+        exit 0
+    fi
+    echo "  ERROR: ./ggml exists but is not a git checkout." >&2
+    echo "         Remove it and re-run this script." >&2
+    exit 1
 fi
 
-# Find every patch under patches/ matching ggml-*.patch, sorted.
-shopt -s nullglob
-PATCHES=( "$REPO_ROOT"/patches/ggml-*.patch )
-shopt -u nullglob
+if [ ! -d ggml/.git ]; then
+    echo "  -> cloning ${GGML_URL} (branch ${GGML_BRANCH})"
+    git clone --branch "$GGML_BRANCH" "$GGML_URL" ggml
+fi
 
 cd ggml
 
-CURRENT="$(git rev-parse --short=8 HEAD 2>/dev/null || echo '')"
-NEED_CHECKOUT="0"
-if [ "$CURRENT" != "$GGML_COMMIT" ]; then
-    NEED_CHECKOUT="1"
+# Make sure the local checkout has the requested branch. Fetch by URL, not
+# `origin`: an older ./ggml was cloned from upstream ggml (no such branch).
+if ! git rev-parse --verify --quiet "refs/heads/${GGML_BRANCH}" >/dev/null; then
+    echo "  -> fetching ${GGML_BRANCH}"
+    git fetch "$GGML_URL" "${GGML_BRANCH}:${GGML_BRANCH}"
 fi
 
-if [ "$NEED_CHECKOUT" = "1" ]; then
-    git checkout -- . 2>/dev/null || true
-    git checkout "$GGML_COMMIT"
-    echo "  -> ok, at $(git rev-parse --short=8 HEAD)"
-fi
-
-# Apply patches.  We always reset to the pinned commit before applying so
-# this is fully idempotent: re-running the script never stacks patches on
-# top of patches.  We bail loudly on a real failure (CRLF in working
-# tree, conflict, ...) instead of silently linking against unpatched ggml.
-if [ ${#PATCHES[@]} -gt 0 ]; then
-    if [ "$NEED_CHECKOUT" = "0" ]; then
-        # Same commit as last run, but patches may already be applied;
-        # reset to pristine before re-applying.
-        if ! git diff --quiet || ! git diff --cached --quiet; then
-            echo "  -> resetting ggml worktree to pristine ${GGML_COMMIT}"
-            git checkout -- .
-        fi
+CURRENT_BRANCH="$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo '')"
+if [ "$CURRENT_BRANCH" != "$GGML_BRANCH" ]; then
+    if ! git diff --quiet || ! git diff --cached --quiet; then
+        echo "  -> resetting ggml worktree to pristine before switching branches"
+        git reset --hard HEAD
     fi
-    for patch in "${PATCHES[@]}"; do
-        name="$(basename "$patch")"
-        # Detect whether the patch has already been applied (idempotent
-        # re-run of the script). `git apply --reverse --check` succeeds
-        # iff every hunk reverses cleanly, which only happens when the
-        # patch is currently applied to the working tree.
-        if git apply --reverse --check "$patch" 2>/dev/null; then
-            echo "  -> $name: already applied, skipping"
-            continue
-        fi
-
-        # Strip CR line endings from the patch on the fly. Windows checkouts
-        # with `core.autocrlf=true` (git's default on Windows) leave the
-        # patch as CRLF in the working tree even though it is LF in the
-        # index, and `git apply` then refuses with a context-mismatch
-        # error.  This converts on read instead of mutating the file.
-        sanitized="$(mktemp)"
-        # shellcheck disable=SC2064
-        trap "rm -f '$sanitized'" EXIT
-        tr -d '\r' < "$patch" > "$sanitized"
-
-        echo "  -> applying $name"
-        if ! git apply --check "$sanitized" 2>/tmp/setup-ggml-apply.err; then
-            echo "    ERROR: patch '$name' does not apply against ggml@${GGML_COMMIT}." >&2
-            sed 's/^/    /' /tmp/setup-ggml-apply.err >&2
-            echo "    Aborting so the build does not silently link unpatched ggml." >&2
-            rm -f /tmp/setup-ggml-apply.err
-            exit 1
-        fi
-        rm -f /tmp/setup-ggml-apply.err
-        git apply "$sanitized"
-    done
+    echo "  -> checking out ${GGML_BRANCH}"
+    git checkout "$GGML_BRANCH"
 fi
 
+echo "  -> ok, on ${GGML_BRANCH} at $(git rev-parse --short=8 HEAD)"
+
 echo
 echo "ggml is ready. Next:"
 echo "    cmake -S . -B build -DCMAKE_BUILD_TYPE=Release"
diff --git a/parakeet-cpp/src/main.cpp b/parakeet-cpp/src/main.cpp
index 7f7e3dbeee5..48b0538c7ac 100644
--- a/parakeet-cpp/src/main.cpp
+++ b/parakeet-cpp/src/main.cpp
@@ -68,6 +68,14 @@ void print_usage(const char * argv0) {
         "                       patch under patches/ relaxes the upstream Adreno-only\n"
         "                       device whitelist for dev/CI parity testing). Production\n"
         "                       Adreno deployments leave both at their defaults.\n"
+        "  --backends-dir DIR                 directory to scan for dynamically-loaded\n"
+        "                                     ggml backend .so/.dll/.dylib files\n"
+        "                                     (e.g. libspeech-ggml-vulkan.so,\n"
+        "                                     libspeech-ggml-opencl.so,\n"
+        "                                     libspeech-ggml-cpu-android_armv8.2_1.so).\n"
+        "                                     Forwarded to ggml_backend_load_all_from_path()\n"
+        "                                     on first backend init. Empty => ggml's compile-\n"
+        "                                     time default search path.\n"
         "  --opencl-cache-dir DIR             persistent OpenCL kernel binary cache directory\n"
         "                                     (sets $GGML_OPENCL_CACHE_DIR; consumed by\n"
         "                                     patches/ggml-opencl-program-binary-cache.patch).\n"
@@ -272,6 +280,14 @@ struct ExtraCliOpts {
     std::string opencl_device;       // GGML_OPENCL_DEVICE
     bool        opencl_disable_fusion = false; // GGML_OPENCL_DISABLE_FUSION=1
     bool        opencl_adreno_use_large_buffer = false; // GGML_OPENCL_ADRENO_USE_LARGE_BUFFER=1
+
+    // Forwarded to `parakeet::set_backends_directory()` before any
+    // backend init so `ggml_backend_load_all_from_path()` finds the
+    // `lib<prefix>ggml-{vulkan,opencl,cpu-*}.so` files in a custom
+    // location. Mirrors `--opencl-cache-dir`'s "applied before
+    // backend init" lifetime contract. Empty => `ggml_backend_load_all()`
+    // runs with ggml's built-in default search path instead.
+    std::string backends_dir;
 };
 
 // Apply OpenCL runtime overrides from the CLI to the process env.
@@ -377,6 +393,8 @@ extern "C" int parakeet_cli_main(int argc, char ** argv) {
             opts.n_gpu_layers = std::atoi(argv[++i]);
         } else if (a == "--opencl-cache-dir" && i + 1 < argc) {
             extra.opencl_cache_dir = argv[++i];
+        } else if (a == "--backends-dir" && i + 1 < argc) {
+            extra.backends_dir = argv[++i];
         } else if (a == "--opencl-platform" && i + 1 < argc) {
             extra.opencl_platform = argv[++i];
         } else if (a == "--opencl-device" && i + 1 < argc) {
@@ -466,6 +484,14 @@ extern "C" int parakeet_cli_main(int argc, char ** argv) {
     // just aren't read by anything).
     apply_opencl_cli_env(extra);
 
+    // Same lifetime contract as the OpenCL env overrides above:
+    // applied before any backend init so `ggml_backend_load_all_from_path`
+    // runs against the requested directory on first use. Empty =>
+    // fall back to ggml's compile-time default search path.
+    if (!extra.backends_dir.empty()) {
+        set_backends_directory(extra.backends_dir);
+    }
+
     const auto t_load = clock::now();
     ParakeetCtcModel model;
     if (int rc = load_from_gguf(opts.model_gguf_path, model, opts.n_threads, opts.n_gpu_layers, opts.verbose); rc != 0) {
diff --git a/parakeet-cpp/src/parakeet_ctc.cpp b/parakeet-cpp/src/parakeet_ctc.cpp
index f9d1c9b1220..62a95cf1c63 100644
--- a/parakeet-cpp/src/parakeet_ctc.cpp
+++ b/parakeet-cpp/src/parakeet_ctc.cpp
@@ -10,17 +10,24 @@
 #include "gguf.h"
 
 #include <algorithm>
+#include <atomic>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <limits>
+#include <mutex>
 #include <stdexcept>
 #include <string>
 #include <thread>
 #include <vector>
 
+#if defined(__ANDROID__) || defined(__unix__) || defined(__APPLE__)
+#include <unistd.h>
+#endif
+
 namespace parakeet {
 
 struct EncoderGraph {
@@ -128,6 +135,34 @@ ggml_context * ParakeetCtcModel::weights_ctx() const {
 
 namespace {
 
+// Backends-dir / OpenCL-cache-dir override + warning state. The
+// setters are intended to be called by the first Engine
+// construction; both are consumed once and then frozen for the rest
+// of the process lifetime (the ggml-backend registry and
+// $GGML_OPENCL_CACHE_DIR are both process-singleton state -- see
+// comment on `ensure_backends_loaded` and the analogous note in
+// `set_opencl_cache_dir`).
+//
+// `g_backends_loaded` is the canonical "registry already populated"
+// flag, set inside `ensure_backends_loaded()` *before* the load-all
+// call is issued AND under the mutex so concurrent `set_*` calls
+// either land their write (and have it picked up by the in-flight
+// load) or atomically observe the flag and warn. We track it
+// separately from `g_recorded_backends_dir` because the first
+// Engine may have legitimately constructed with an empty
+// `backends_dir` (default ggml search path), in which case
+// `g_recorded_backends_dir` stays empty and is no longer a reliable
+// "have we loaded?" sentinel -- a subsequent setter would otherwise
+// silently write to `g_backends_dir`, never get re-scanned, and
+// surface zero diagnostic to the caller.
+std::mutex     g_backends_dir_mutex;
+std::string    g_backends_dir;
+std::string    g_recorded_backends_dir;
+std::string    g_recorded_opencl_cache_dir;
+std::atomic<bool> g_backends_loaded{false};
+std::atomic<bool> g_backends_dir_warned{false};
+std::atomic<bool> g_opencl_cache_dir_warned{false};
+
 // Trigger one-time discovery + load of every available ggml backend.
 // Idempotent: repeated calls inside the same process are no-ops once
 // the registry is populated. Routed through a static guard so we don't
@@ -145,23 +180,92 @@ namespace {
 // ggml_backend_load_all() is a cheap no-op. Both modes therefore
 // reach the same registry walk below, matching the convention used
 // by llama.cpp and other ggml-based libraries.
+//
+// The optional backends dir comes from `set_backends_directory()`
+// (typically wired from `EngineOptions::backends_dir`). When set and
+// non-empty, the loader walks that single directory instead of the
+// compile-time defaults so embedded host apps can ship the
+// `lib<prefix>ggml-{vulkan,opencl,cpu-*}.so` files in their own
+// per-module folder rather than relying on `LD_LIBRARY_PATH` /
+// `dlopen()` heuristics.
 void ensure_backends_loaded() {
     static const bool loaded = []() {
-        ggml_backend_load_all();
+        std::string dir;  // local copy so the load below runs without the mutex held
+        {
+            std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+            dir = g_backends_dir;                      // snapshot the pending override
+            g_recorded_backends_dir = g_backends_dir;  // remembered for later conflict warnings
+            // Flip the loaded sentinel under the mutex (and *before*
+            // we release it for the load-all call below) so any
+            // concurrent setter that's about to acquire the mutex
+            // sees the registry as already-claimed and falls into
+            // its warn-once branch. Without this, a setter racing
+            // a first Engine construction would land its value
+            // *after* we already captured `dir` into the local --
+            // the registry would scan against the wrong directory
+            // (or the default), and the second Engine would have
+            // no idea its override was lost.
+            g_backends_loaded.store(true, std::memory_order_release);
+        }
+        if (!dir.empty()) {  // explicit override supplied via set_backends_directory()
+            ggml_backend_load_all_from_path(dir.c_str());
+        } else {             // no override: use ggml's default discovery
+            ggml_backend_load_all();
+        }
         return true;
     }();
     (void) loaded;
 }
 
+// Parse the Adreno generation number from a device name /
+// description string. Returns:
+//   - the decimal generation number ("Adreno (TM) 750" -> 750,
+//     "Adreno 830" -> 830, "Adreno 660" -> 660)
+//   - a synthetic 800 for the "Adreno X<n>" naming used by
+//     Snapdragon X Elite parts (X1-85 / X1-45 etc.). These are
+//     7xx/8xx-tier silicon with kernels that ggml-opencl supports
+//     and outperform Vulkan on. Mapped to 800 here so they take
+//     the OpenCL branch in the tier policy.
+//   - -1 on null input, no "Adreno"/"adreno" marker, or no usable number
+//
+// Used to drive the OpenCL vs Vulkan tier policy below: Adreno
+// 7xx/8xx/X<n> ship OpenCL kernels that outperform Vulkan on those
+// parts, while Adreno 6xx ggml-opencl is known broken (incorrect
+// results). Mirrors the equivalent helper in llm-llamacpp's
+// BackendSelection.cpp::parseAdrenoVersion so the two stacks reach
+// the same decision on the same hardware.
+int parse_adreno_version(const char * s) {
+    if (!s) return -1;
+    const char * p = strstr(s, "Adreno");
+    if (!p) p = strstr(s, "adreno");  // only these two spellings are matched
+    if (!p) return -1;
+    p += 6; // strlen("Adreno") == strlen("adreno") == 6
+    // Skip everything (spaces, "(TM)", punctuation) up to the first digit or X/x.
+    while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p;
+    if (!*p) return -1;
+    // X1 / X2 ... naming for Snapdragon X Elite -> treat as 800-tier.
+    if (*p == 'X' || *p == 'x') {
+        ++p;
+        if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. is not Adreno-X
+        return 800;
+    }
+    int v = 0;
+    while (*p >= '0' && *p <= '9') {
+        v = v * 10 + (*p - '0');
+        ++p;
+        if (v > 100000) return -1;  // implausibly long digit run; also keeps v far from int overflow
+    }
+    return v;
+}
+
 bool is_adreno_6xx(const char * s) {
-    if (!s) return false;
-    if (!strstr(s, "Adreno")) return false;
-    for (const char * q = s; *q; ++q) {
-        if (*q == '6' && q[1] >= '0' && q[1] <= '9' && q[2] >= '0' && q[2] <= '9') {
-            return true;
-        }
-    }
-    return false;
+    const int v = parse_adreno_version(s);  // -1 when s is null or has no Adreno marker
+    return v >= 600 && v < 700;             // true only for parsed versions 600..699
+}
+
+bool is_adreno_700plus(const char * s) {
+    const int v = parse_adreno_version(s);  // -1 when s is null or has no Adreno marker
+    return v >= 700;  // also true for the synthetic 800 returned for "Adreno X<n>" names
 }
 
 const char * dev_reg_name(ggml_backend_dev_t dev) {
@@ -171,19 +275,42 @@ const char * dev_reg_name(ggml_backend_dev_t dev) {
 }
 
 
+// Pick a GPU backend using the same tier policy as llm-llamacpp's
+// BackendSelection: ggml-opencl is only used when an Adreno 700+
+// device is present (where its kernels are validated and faster than
+// Vulkan); every other GPU (Vulkan, Metal, CUDA, Mali, Intel iGPU,
+// ...) goes through the non-OpenCL preference. Adreno 6xx OpenCL is
+// known broken (incorrect outputs) and is force-skipped unless the
+// caller opts in via `PARAKEET_ALLOW_ADRENO_6XX=1`.
+//
+// Routed exclusively through the ggml-backend registry
+// (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls
+// to `ggml_backend_vulkan_init` / `ggml_backend_opencl_init` /
+// `ggml_backend_metal_init` are made anywhere in parakeet — under
+// the GGML_BACKEND_DL=ON build mode embedded host applications ship
+// with, those entry points live in separate shared libraries that
+// are dlopen()'d at runtime and are not linkable from libparakeet.
+// The registry walk reaches the same backends in both modes.
 ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
     if (n_gpu_layers <= 0) return nullptr;
 
     ensure_backends_loaded();
 
-    // Walk the registry in registration order and pick the first
-    // GPU/IGPU device. Registry order is defined by the ggml-backend
-    // registry's static init list (CUDA -> Metal -> Vulkan -> OpenCL
-    // -> ...), so this preserves the priority of the legacy direct-
-    // init cascade. The Adreno-6xx fallback policy stays on top:
-    // ggml-opencl produces incorrect results on Adreno 6xx; force-
-    // skip and continue the walk (or fall through to CPU) unless
-    // `PARAKEET_ALLOW_ADRENO_6XX=1` is set.
+    // Collect GPU/IGPU devices into three buckets so we can apply the
+    // tier policy after the walk. We keep the device handles + their
+    // human-readable names for both the policy decision and the final
+    // log line.
+    struct Cand {
+        ggml_backend_dev_t dev;
+        const char *       name;
+        const char *       desc;
+        const char *       reg_name;
+    };
+    std::vector<Cand> opencl_adreno_700plus;  // OpenCL devices whose parsed Adreno version is >= 700
+    std::vector<Cand> other_gpu;   // Vulkan / Metal / CUDA / Mali / Intel / ...
+    std::vector<Cand> opencl_other; // Remaining OpenCL: no Adreno marker, Adreno < 600, opted-in 6xx
+    int max_adreno_version = -1;    // highest parsed version over all bucketed devices (OpenCL or not)
+
     const size_t n_dev = ggml_backend_dev_count();
     for (size_t i = 0; i < n_dev; ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -198,32 +325,72 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
         const char * reg_name = dev_reg_name(dev);
         const bool   is_opencl = std::strcmp(reg_name, "OpenCL") == 0;
 
-        if (is_opencl && (is_adreno_6xx(name) || is_adreno_6xx(desc))) {
-            const char * reported = name ? name : (desc ? desc : "unknown");
-            const char * override_env = getenv("PARAKEET_ALLOW_ADRENO_6XX");
-            if (!override_env || override_env[0] != '1') {
-                if (verbose) PARAKEET_LOG_WARN(
-                    "parakeet: OpenCL device '%s' is Adreno 6xx; "
-                    "skipping (7xx/8xx/X1E supported, set "
-                    "PARAKEET_ALLOW_ADRENO_6XX=1 to override)\n",
-                    reported);
-                continue;
+        const int adreno_v = std::max(parse_adreno_version(name),
+                                      parse_adreno_version(desc));  // larger of the two parses; -1 if neither yields one
+        if (adreno_v > max_adreno_version) max_adreno_version = adreno_v;
+
+        if (is_opencl) {
+            if (adreno_v >= 700) {
+                opencl_adreno_700plus.push_back({dev, name, desc, reg_name});
+            } else if (adreno_v >= 600 && adreno_v < 700) {
+                const char * reported = name ? name : (desc ? desc : "unknown");
+                const char * override_env = getenv("PARAKEET_ALLOW_ADRENO_6XX");
+                if (!override_env || override_env[0] != '1') {  // only a value starting with '1' opts in
+                    if (verbose) PARAKEET_LOG_WARN(
+                        "parakeet: OpenCL device '%s' is Adreno 6xx; "
+                        "skipping (7xx/8xx/X1E supported, set "
+                        "PARAKEET_ALLOW_ADRENO_6XX=1 to override)\n",
+                        reported);
+                    continue;
+                }
+                if (verbose) PARAKEET_LOG_INFO(
+                    "parakeet: PARAKEET_ALLOW_ADRENO_6XX=1 set; "
+                    "keeping OpenCL backend on '%s' anyway\n", reported);
+                opencl_other.push_back({dev, name, desc, reg_name});  // opted-in 6xx joins the last-resort tier
+            } else {
+                opencl_other.push_back({dev, name, desc, reg_name});
             }
+        } else {
+            other_gpu.push_back({dev, name, desc, reg_name});
+        }
+    }
+
+    // Tier policy:
+    //   1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan
+    //      on Snapdragon 8 Gen 2/3/4 etc.).
+    //   2. Anything else with a non-OpenCL GPU: prefer that
+    //      (Vulkan on all non-Adreno Android, Metal on Apple, CUDA
+    //      on Linux/Windows desktop, Mali iGPU via Vulkan, ...).
+    //   3. Last resort: any other OpenCL device (e.g. desktop OpenCL,
+    //      or mobile OpenCL when no earlier tier could be initialised).
+    auto try_init = [&](const std::vector<Cand> & bucket) -> ggml_backend_t {
+        for (const Cand & c : bucket) {
+            ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr);
+            if (!b) continue;  // init failed: try the next candidate in this bucket
             if (verbose) PARAKEET_LOG_INFO(
-                "parakeet: PARAKEET_ALLOW_ADRENO_6XX=1 set; "
-                "keeping OpenCL backend on '%s' anyway\n", reported);
+                "parakeet: using %s backend (%s)\n",
+                c.reg_name && *c.reg_name ? c.reg_name : "GPU",
+                c.name ? c.name : (c.desc ? c.desc : "unknown"));
+            return b;
         }
+        return nullptr;  // bucket empty or every candidate failed to init
+    };
 
-        ggml_backend_t b = ggml_backend_dev_init(dev, nullptr);
-        if (!b) continue;
-        if (verbose) PARAKEET_LOG_INFO(
-            "parakeet: using %s backend (%s)\n",
-            reg_name && *reg_name ? reg_name : "GPU",
-            name ? name : (desc ? desc : "unknown"));
-        return b;
+    if (!opencl_adreno_700plus.empty()) {  // try_init on an empty bucket would also return nullptr
+        if (ggml_backend_t b = try_init(opencl_adreno_700plus)) return b;
     }
+    if (ggml_backend_t b = try_init(other_gpu)) return b;
+    if (ggml_backend_t b = try_init(opencl_other)) return b;
 
-    if (verbose) PARAKEET_LOG_INFO("parakeet: no GPU backend available, falling back to CPU\n");
+    if (verbose) {
+        if (max_adreno_version >= 600 && max_adreno_version < 700) {  // NOTE(review): max also counts non-OpenCL devices
+            PARAKEET_LOG_INFO(
+                "parakeet: only Adreno 6xx OpenCL detected (broken); "
+                "falling back to CPU\n");
+        } else {
+            PARAKEET_LOG_INFO("parakeet: no GPU backend available, falling back to CPU\n");
+        }
+    }
     return nullptr;
 }
 
@@ -300,6 +467,91 @@ std::vector<float> read_filterbank_to_vector(ggml_tensor * t) {
 
 }
 
+void set_backends_directory(const std::string & dir) {
+    std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+    if (g_backends_loaded.load(std::memory_order_acquire)) {
+        // Registry already claimed by the loader for this process.
+        // We can't re-scan a different directory mid-flight (ggml's
+        // registry is a process-wide singleton), so the call is
+        // dropped. A value equal to the recorded one is silent (the
+        // common case when a host instantiates several Engines from
+        // the same backends folder); the first mismatch warns, and
+        // later mismatches are silent too (warn-once latch).
+        if (dir != g_recorded_backends_dir &&
+            !g_backends_dir_warned.exchange(true)) {
+            if (g_recorded_backends_dir.empty()) {
+                // First Engine constructed without an explicit
+                // backends_dir, so ggml's compile-time default
+                // search path was used. The current caller wanted
+                // a specific dir but missed the window.
+                PARAKEET_LOG_WARN(
+                    "parakeet: set_backends_directory('%s') ignored -- the "
+                    "ggml-backend registry was already populated against "
+                    "ggml's default search path (no explicit backends_dir on "
+                    "the first Engine). Call set_backends_directory() (or "
+                    "construct an Engine with backends_dir set) before the "
+                    "first Engine to influence which directory is scanned.\n",
+                    dir.c_str());
+            } else {
+                PARAKEET_LOG_WARN(
+                    "parakeet: set_backends_directory('%s') ignored -- backends "
+                    "already loaded from '%s' earlier in this process.\n",
+                    dir.c_str(), g_recorded_backends_dir.c_str());
+            }
+        }
+        return;
+    }
+    g_backends_dir = dir;  // not loaded yet: stored for ensure_backends_loaded() to pick up
+}
+
+void set_opencl_cache_dir(const std::string & dir) {
+#if defined(__ANDROID__)
+    // Same "first Engine wins" contract as set_backends_directory:
+    // ggml-opencl reads $GGML_OPENCL_CACHE_DIR once per process at
+    // backend init (before the first kernel build), so a setenv
+    // after init is effectively a no-op on the cache binding. Gate
+    // on the shared g_backends_loaded flag because the OpenCL
+    // backend is registered at the same `ggml_backend_load_all*`
+    // call that flips the flag -- conservative because it might
+    // still take effect when the host hasn't yet instantiated a
+    // GPU device, but matches what the engine-ctor documentation
+    // promises and avoids the same silent-failure mode as
+    // set_backends_directory's previous gate.
+    std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+    if (g_backends_loaded.load(std::memory_order_acquire)) {
+        if (!dir.empty() && dir != g_recorded_opencl_cache_dir &&
+            !g_opencl_cache_dir_warned.exchange(true)) {  // warn once per process, on the first mismatch only
+            if (g_recorded_opencl_cache_dir.empty()) {
+                PARAKEET_LOG_WARN(
+                    "parakeet: set_opencl_cache_dir('%s') ignored -- backends "
+                    "were already loaded with no explicit OpenCL cache dir "
+                    "earlier in this process ($GGML_OPENCL_CACHE_DIR either "
+                    "unset or set by another consumer). Call "
+                    "set_opencl_cache_dir() before the first Engine to take "
+                    "effect.\n",
+                    dir.c_str());
+            } else {
+                PARAKEET_LOG_WARN(
+                    "parakeet: set_opencl_cache_dir('%s') ignored -- "
+                    "$GGML_OPENCL_CACHE_DIR already pinned to '%s' earlier in "
+                    "this process.\n",
+                    dir.c_str(), g_recorded_opencl_cache_dir.c_str());
+            }
+        }
+        return;
+    }
+    if (dir.empty()) return;  // nothing to export; leave the environment untouched
+    // Backends are not loaded yet, so the export is still in time:
+    // ggml-opencl's program-binary-cache patch reads this once per
+    // process at backend init (before the first kernel build). A
+    // repeated call before that point overwrites it (last one wins).
+    ::setenv("GGML_OPENCL_CACHE_DIR", dir.c_str(), /*overwrite=*/1);
+    g_recorded_opencl_cache_dir = dir;
+#else
+    (void) dir;  // no-op on non-Android builds
+#endif
+}
+
 int load_from_gguf(const std::string & gguf_path,
                    ParakeetCtcModel  & out_model,
                    int                 n_threads,
diff --git a/parakeet-cpp/src/parakeet_ctc.h b/parakeet-cpp/src/parakeet_ctc.h
index be0f3ad42b8..32fefe2947d 100644
--- a/parakeet-cpp/src/parakeet_ctc.h
+++ b/parakeet-cpp/src/parakeet_ctc.h
@@ -305,6 +305,14 @@ struct ParakeetCtcModel {
 // `TdtConfig` + `SortformerConfig`.
 using ParakeetModel = ParakeetCtcModel;
 
+// Backend init configuration. Call before the first `load_from_gguf`
+// (or Engine construction) in the process. Once the ggml-backend
+// registry (a process-wide singleton) is populated, both are ignored
+// with at most one warning each; `set_opencl_cache_dir` is a no-op
+// outside Android. See implementation comments for the full contract.
+void set_backends_directory(const std::string & dir);
+void set_opencl_cache_dir(const std::string & dir);
+
 int load_from_gguf(const std::string & gguf_path,
                    ParakeetCtcModel  & out_model,
                    int                 n_threads,
diff --git a/parakeet-cpp/src/parakeet_engine.cpp b/parakeet-cpp/src/parakeet_engine.cpp
index 50ec54baa4d..e78e19e333f 100644
--- a/parakeet-cpp/src/parakeet_engine.cpp
+++ b/parakeet-cpp/src/parakeet_engine.cpp
@@ -141,6 +141,21 @@ static void prewarm_encoder(ParakeetCtcModel & model, float audio_seconds) {
 Engine::Engine(const EngineOptions & opts) : pimpl_(std::make_unique<Impl>()) {
     pimpl_->opts = opts;
 
+    // Apply backend-init knobs before the first ggml call. Both are
+    // process-singleton-scoped (the ggml-backend registry only ever
+    // gets populated once per process; `$GGML_OPENCL_CACHE_DIR` is
+    // read once by ggml-opencl at first init), so this is effectively
+    // a "first Engine wins" race -- a second Engine with a different
+    // backends_dir is logged + ignored by set_backends_directory().
+    // Hosts that need per-Engine isolation should run each Engine in
+    // its own subprocess.
+    if (!opts.backends_dir.empty()) {
+        set_backends_directory(opts.backends_dir);
+    }
+    if (!opts.opencl_cache_dir.empty()) {
+        set_opencl_cache_dir(opts.opencl_cache_dir);
+    }
+
     const int rc = load_from_gguf(opts.model_gguf_path,
                                   pimpl_->model,
                                   opts.n_threads,