From 907f315158343e70ae847344355c88bd40d8808e Mon Sep 17 00:00:00 2001
From: GustavoA1604 <gustavogefa@hotmail.com>
Date: Wed, 20 May 2026 14:36:33 -0300
Subject: [PATCH] tts-cpp: Add dynamic backend selection for android

---
 tts-cpp/CMakeLists.txt                      | 100 +++--
 tts-cpp/include/tts-cpp/chatterbox/engine.h |  49 ++-
 tts-cpp/include/tts-cpp/supertonic/engine.h |  27 ++
 tts-cpp/src/backend_selection.cpp           | 394 ++++++++++++++++++++
 tts-cpp/src/backend_selection.h             |  90 +++++
 tts-cpp/src/backend_util.h                  |  53 +++
 tts-cpp/src/campplus.cpp                    |   1 +
 tts-cpp/src/campplus_forward.inc            |   4 +-
 tts-cpp/src/chatterbox_cli.cpp              |  26 +-
 tts-cpp/src/chatterbox_engine.cpp           |  16 +
 tts-cpp/src/chatterbox_tts.cpp              |  83 ++---
 tts-cpp/src/main.cpp                        |  93 ++---
 tts-cpp/src/mel_extract_stft.cpp            |   8 +-
 tts-cpp/src/s3tokenizer.cpp                 |  13 +-
 tts-cpp/src/supertonic_engine.cpp           |  13 +
 tts-cpp/src/supertonic_gguf.cpp             |  68 ++--
 tts-cpp/src/t3_mtl.cpp                      |  43 ++-
 tts-cpp/src/voice_encoder.cpp               |  11 +-
 18 files changed, 856 insertions(+), 236 deletions(-)
 create mode 100644 tts-cpp/src/backend_selection.cpp
 create mode 100644 tts-cpp/src/backend_selection.h
 create mode 100644 tts-cpp/src/backend_util.h
diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt
index 225897554ea..21cadbc8ae6 100644
--- a/tts-cpp/CMakeLists.txt
+++ b/tts-cpp/CMakeLists.txt
@@ -70,6 +70,55 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android")
     endif()
 endif()
 
+# Android default backend stack: dynamic loading of Vulkan + OpenCL +
+# per-arch CPU variants. Mirrors parakeet-cpp's same-repo sibling and
+# the qvac llm-llamacpp Android config (see
+# qvac-registry-vcpkg/ports/llama-cpp/portfile.cmake) so the tts-cpp
+# Android prebuilds drop into the same `qvac__tts-ggml/` folder shape
+# as the parakeet / llamacpp ones: a `.bare` module + sibling
+# `lib<prefix>ggml-{vulkan,opencl,cpu-android_armv*_*}.so` files that
+# `ggml_backend_load_all_from_path()` discovers at runtime.
+#
+# Selection at runtime is centralised in
+# `tts_cpp::detail::init_gpu_backend()` (src/backend_selection.cpp):
+# OpenCL when an Adreno 700+ device is present, Vulkan for every
+# other GPU (non-Adreno, Adreno < 700, Mali, Xclipse, ...). No
+# static GPU backend entry points are linked anywhere in libtts-cpp;
+# the registry walk reaches the right backend in both
+# GGML_BACKEND_DL=ON (Android prebuild) and GGML_BACKEND_DL=OFF
+# (desktop dev) modes.
+#
+# Callers that have specific reasons to deviate (e.g. a desktop
+# bring-up build that wants Vulkan only) can still override any of
+# these at the cmake command line; we only set defaults that haven't
+# already been provided.
+if (CMAKE_SYSTEM_NAME STREQUAL "Android")  # defaults below apply to Android targets only
+    if (NOT DEFINED CACHE{GGML_BACKEND_DL})  # tests the cache only (e.g. -DGGML_BACKEND_DL=OFF); a plain set() variable does not count
+        set(GGML_BACKEND_DL ON CACHE BOOL "" FORCE)  # backends built as separately loadable libraries
+    endif()
+    if (NOT DEFINED CACHE{GGML_CPU_ALL_VARIANTS})
+        set(GGML_CPU_ALL_VARIANTS ON CACHE BOOL "" FORCE)  # per-arch CPU variants, picked at load time
+    endif()
+    if (NOT DEFINED CACHE{GGML_CPU_REPACK})
+        set(GGML_CPU_REPACK ON CACHE BOOL "" FORCE)
+    endif()
+    if (NOT DEFINED CACHE{GGML_VULKAN})
+        set(GGML_VULKAN ON CACHE BOOL "" FORCE)  # runtime choice for every GPU that is not Adreno 700+
+    endif()
+    if (NOT DEFINED CACHE{GGML_OPENCL})
+        set(GGML_OPENCL ON CACHE BOOL "" FORCE)  # runtime choice for Adreno 700+ (see src/backend_selection.cpp)
+    endif()
+    # ggml-vulkan's coopmat / coopmat2 shader compile pulls in
+    # extensions that most Android Vulkan drivers don't expose; the
+    # upstream llama Android build disables both for the same reason.
+    if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT})
+        set(GGML_VULKAN_DISABLE_COOPMAT ON CACHE BOOL "" FORCE)
+    endif()
+    if (NOT DEFINED CACHE{GGML_VULKAN_DISABLE_COOPMAT2})
+        set(GGML_VULKAN_DISABLE_COOPMAT2 ON CACHE BOOL "" FORCE)
+    endif()
+endif()
+
 # Two related workarounds for clang-cl / MSVC builds on Windows.  Both
 # come from msys2 sneaking GCC-flavoured libraries onto CMake's search
 # paths and being mismatched against MSVC-compiled translation units.
@@ -161,33 +210,28 @@ if (MSVC)
     add_compile_definitions(_USE_MATH_DEFINES _CRT_SECURE_NO_WARNINGS)
 endif()
 
-# INTERFACE library that holds the GGML_USE_<BACKEND> compile defines
-# every TU that includes ggml.h needs to dispatch correctly on the
-# enabled backend.  The tts-cpp library AND any test executable that
-# recompiles src/chatterbox_tts.cpp / src/main.cpp from source (i.e.
-# bypasses the tts-cpp link) must link against this; otherwise the
-# #ifdef GGML_USE_<BACKEND> branches inside those TUs evaluate as
-# undefined and the GPU code paths get silently compiled out of the
-# test executable, even when the parent build did enable the backend.
-# Mirrors parakeet-cpp's parakeet-backend-defs INTERFACE lib.
+# Legacy interface library kept for export-set compatibility (it is
+# still part of `install(EXPORT tts-cppTargets)` below and downstream
+# `find_package(tts-cpp)` consumers list it as a link dep). Body
+# intentionally empty: tts-cpp now routes every backend decision
+# through the ggml-backend registry
+# (`ggml_backend_load_all` + `ggml_backend_dev_*`, see
+# `init_gpu_backend()` / `init_cpu_backend()` / `init_blas_backend()`
+# in src/backend_selection.cpp) and does NOT call any
+# `ggml_backend_<backend>_init` / `ggml_backend_is_<backend>` entry
+# point directly. The `GGML_USE_VULKAN` / `GGML_USE_OPENCL` /
+# `GGML_USE_METAL` / `GGML_USE_CUDA` / `GGML_USE_BLAS` compile defines
+# that used to live here were only consumed by `#ifdef` cascades that
+# called those static entry points; with the registry-only design
+# they're dead, and shipping them would falsely advertise a static
+# backend dependency that the GGML_BACKEND_DL=ON Android/Linux builds
+# explicitly do not have (their backends live in separately-loadable
+# `.so` files that are dlopen()'d by `ggml_backend_load_all_from_path`
+# at runtime). Mirrors parakeet-cpp's `parakeet-backend-defs`.
 add_library(tts-cpp-backend-defs INTERFACE)
-if (GGML_CUDA)
-    target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_CUDA)
-endif()
-if (GGML_METAL)
-    target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_METAL)
-endif()
-if (GGML_VULKAN)
-    target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_VULKAN)
-endif()
-if (GGML_BLAS)
-    target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_BLAS)
-endif()
-if (GGML_OPENCL)
-    target_compile_definitions(tts-cpp-backend-defs INTERFACE GGML_USE_OPENCL)
-endif()
 
 set(TTS_CPP_LIB_SOURCES
+    src/backend_selection.cpp
     src/main.cpp
     src/chatterbox_cli.cpp
     src/gpt2_bpe.cpp
@@ -594,7 +638,15 @@ if (TTS_CPP_BUILD_TESTS)
     tts_cpp_apply_ccache(test-metal-ops)
     # Metal-only kernel parity check.  Useful only when built with
     # -DGGML_METAL=ON; skipped on CI fleets without Metal via `ctest -LE gpu`.
+    # GGML_USE_METAL is supplied locally here (rather than via
+    # tts-cpp-backend-defs) because the library itself no longer
+    # consumes the macro -- every #ifdef GGML_USE_<X> in src/ was
+    # removed alongside the registry-only refactor. The test still
+    # uses the macro to gate its direct ggml_backend_metal_init()
+    # call site (it's exercising the Metal-backend implementation
+    # directly, not going through tts-cpp's backend selection).
     if (GGML_METAL)
+        target_compile_definitions(test-metal-ops PRIVATE GGML_USE_METAL)
         tts_cpp_register_test(test-metal-ops LABEL "gpu")
     endif()
 
diff --git a/tts-cpp/include/tts-cpp/chatterbox/engine.h b/tts-cpp/include/tts-cpp/chatterbox/engine.h
index e60ef1db3b8..daef5e97c9e 100644
--- a/tts-cpp/include/tts-cpp/chatterbox/engine.h
+++ b/tts-cpp/include/tts-cpp/chatterbox/engine.h
@@ -75,12 +75,57 @@ struct EngineOptions {
     std::string voice_dir;
 
     // Backend selection.  n_gpu_layers > 0 enables the first available
-    // GPU backend (CUDA → Metal → Vulkan → OpenCL in build order), falling
-    // back to the CPU backend when none is compiled in or initialisation fails.
+    // GPU backend via the Adreno-tier policy: Adreno 700+ -> OpenCL,
+    // every other GPU (Vulkan on non-Adreno Android, Metal on Apple,
+    // CUDA on Linux/Windows desktop, Mali iGPU via Vulkan, ...) -> the
+    // non-OpenCL preference. Adreno 6xx OpenCL is force-skipped (broken
+    // kernels) unless `TTS_CPP_ALLOW_ADRENO_6XX=1` is set in the env.
+    // Falls back to the CPU backend when no GPU was requested, none is
+    // registered, or every candidate refused init.
     // The exact per-layer split is not used today; any positive value
     // moves the whole model to the GPU.
     int n_gpu_layers = 0;
 
+    // Directory to scan for dynamically-loaded ggml backends
+    // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`,
+    // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to
+    // `ggml_backend_load_all_from_path()` on the first Engine
+    // construction in the process; subsequent constructions reuse the
+    // already-populated registry.
+    //
+    // Leave empty to fall back to ggml's default search path
+    // (`ggml_backend_load_all()`), which walks compile-time defaults
+    // (`$EXE_DIR`, `LD_LIBRARY_PATH`, ...). Embedded host applications
+    // built with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple
+    // default; see CMakeLists.txt) should pass an explicit dir
+    // because the .so files ship next to the host's binary in a
+    // platform-specific subfolder rather than on the system loader's
+    // path.
+    //
+    // No-op on builds where ggml is statically linked
+    // (`GGML_BACKEND_DL=OFF`, e.g. desktop dev cmake builds and the
+    // Apple xcframework). On those, every backend is registered at
+    // constructor time from inside libggml and no filesystem scan
+    // takes place.
+    std::string backends_dir;
+
+    // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so
+    // ggml-opencl persists `clCreateProgramWithBinary` blobs across
+    // process restarts (see the program-binary-cache patch on
+    // qvac-ext-ggml@speech). Strongly recommended on Android where
+    // the cold `clBuildProgram` cost dominates first-utterance
+    // latency; pass a writable per-app directory (typically the
+    // app's `cacheDir` from the host platform).
+    //
+    // Honoured only on `__ANDROID__` builds; ignored elsewhere
+    // (desktop OpenCL platforms don't ship the binary-cache patch
+    // and would otherwise pollute the user's tmpdir).
+    //
+    // Leave empty to keep the existing `$GGML_OPENCL_CACHE_DIR` env
+    // value, e.g. one exported by a wrapper script (or no cache at
+    // all). A non-empty value here overwrites that env value.
+    std::string opencl_cache_dir;
+
     // 0 = std::thread::hardware_concurrency() (capped at 4 by default).
     int n_threads = 0;
 
diff --git a/tts-cpp/include/tts-cpp/supertonic/engine.h b/tts-cpp/include/tts-cpp/supertonic/engine.h
index b32e51fefc5..76bd692e516 100644
--- a/tts-cpp/include/tts-cpp/supertonic/engine.h
+++ b/tts-cpp/include/tts-cpp/supertonic/engine.h
@@ -56,6 +56,33 @@ struct EngineOptions {
     int   n_threads     = 0;
     int   n_gpu_layers  = 0;
 
+    // Directory to scan for dynamically-loaded ggml backends
+    // (`libspeech-ggml-vulkan.so`, `libspeech-ggml-opencl.so`,
+    // `libspeech-ggml-cpu-android_armv8.2_1.so`, ...). Forwarded to
+    // `ggml_backend_load_all_from_path()` on the first Engine
+    // construction in the process; subsequent constructions reuse the
+    // already-populated registry.
+    //
+    // Leave empty to fall back to ggml's default search path
+    // (`ggml_backend_load_all()`). Embedded host applications built
+    // with `GGML_BACKEND_DL=ON` (the Android / Linux non-Apple
+    // default; see CMakeLists.txt) should pass an explicit dir: the
+    // .so files ship next to the host's binary in a per-module
+    // folder, so the loader has to be told where they are instead of
+    // relying on `LD_LIBRARY_PATH` / `dlopen()` heuristics. No-op on
+    // `GGML_BACKEND_DL=OFF` (static-link) builds.
+    std::string backends_dir;
+
+    // Sets `$GGML_OPENCL_CACHE_DIR` before the first backend init so
+    // ggml-opencl persists `clCreateProgramWithBinary` blobs across
+    // process restarts. Strongly recommended on Android where the
+    // cold `clBuildProgram` cost dominates first-utterance latency;
+    // pass a writable per-app directory (typically the app's
+    // `cacheDir` from the host platform).
+    //
+    // Honoured only on `__ANDROID__` builds; ignored elsewhere.
+    std::string opencl_cache_dir;
+
     // Optional path to a .npy file containing the initial noise tensor of
     // shape [1, latent_channels, latent_len] (float32).  When provided,
     // latent_len is taken from the npy file (overriding the duration-
diff --git a/tts-cpp/src/backend_selection.cpp b/tts-cpp/src/backend_selection.cpp
new file mode 100644
index 00000000000..2c36287827c
--- /dev/null
+++ b/tts-cpp/src/backend_selection.cpp
@@ -0,0 +1,394 @@
+#include "backend_selection.h"
+
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace tts_cpp::detail {
+namespace {
+
+// Backends-dir / OpenCL-cache-dir override + warning state. The
+// setters are intended to be called by the first Engine
+// construction; both are consumed once and then frozen for the rest
+// of the process lifetime (the ggml-backend registry and
+// $GGML_OPENCL_CACHE_DIR are both process-singleton state).
+//
+// `g_backends_loaded` is the canonical "registry already claimed"
+// flag, set inside `ensure_backends_loaded()` under the mutex and
+// *before* the load-all call is issued, so concurrent `set_*` calls
+// either land their write first (and have it picked up by that
+// load) or observe the flag under the same mutex and warn. We track it
+// separately from `g_recorded_backends_dir` because the first
+// Engine may have legitimately constructed with an empty
+// `backends_dir` (default ggml search path), in which case
+// `g_recorded_backends_dir` stays empty and is no longer a reliable
+// "have we loaded?" sentinel -- a subsequent setter would otherwise
+// silently write to `g_backends_dir`, never get re-scanned, and
+// surface zero diagnostic to the caller.
+//
+// Mirrors parakeet-cpp/src/parakeet_ctc.cpp 1:1 (same Engine ctor /
+// process-singleton-registry interaction). Kept in a tts-cpp-local
+// anon namespace so the two libraries can be vendored side-by-side
+// without ODR collisions on the static state.
+std::mutex     g_backends_dir_mutex;
+std::string    g_backends_dir;
+std::string    g_recorded_backends_dir;
+std::string    g_recorded_opencl_cache_dir;
+std::atomic<bool> g_backends_loaded{false};
+std::atomic<bool> g_backends_dir_warned{false};
+std::atomic<bool> g_opencl_cache_dir_warned{false};
+
+const char * dev_reg_name(ggml_backend_dev_t dev) {  // name of the backend registry that owns `dev`
+    if (!dev) return "";  // null device -> "" (not nullptr), so callers can strcmp() the result directly
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    return reg ? ggml_backend_reg_name(reg) : "";  // device without a registry -> ""
+}
+
+} // namespace
+
+void set_backends_directory(const std::string & dir) {  // stores `dir` for the first ensure_backends_loaded(); later calls are ignored
+    std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+    if (g_backends_loaded.load(std::memory_order_acquire)) {
+        // Registry already populated for this process. We can't
+        // re-scan a different directory mid-flight (ggml's registry
+        // is a process-wide singleton), so log the conflict at most
+        // once and otherwise stay silent on subsequent identical
+        // sets (the common case when a host instantiates several
+        // Engines back-to-back from the same backends folder, or
+        // when the second value happens to match the recorded one).
+        if (dir != g_recorded_backends_dir &&  // NOTE(review): unlike set_opencl_cache_dir, an empty `dir` is not exempt here
+            !g_backends_dir_warned.exchange(true)) {  // exchange() makes the warning one-shot per process
+            if (g_recorded_backends_dir.empty()) {
+                // First Engine constructed without an explicit
+                // backends_dir, so ggml's compile-time default
+                // search path was used. The current caller wanted
+                // a specific dir but missed the window.
+                fprintf(stderr,
+                    "tts-cpp: set_backends_directory('%s') ignored -- the "
+                    "ggml-backend registry was already populated against "
+                    "ggml's default search path (no explicit backends_dir on "
+                    "the first Engine). Call set_backends_directory() (or "
+                    "construct an Engine with backends_dir set) before the "
+                    "first Engine to influence which directory is scanned.\n",
+                    dir.c_str());
+            } else {
+                fprintf(stderr,
+                    "tts-cpp: set_backends_directory('%s') ignored -- backends "
+                    "already loaded from '%s' earlier in this process.\n",
+                    dir.c_str(), g_recorded_backends_dir.c_str());
+            }
+        }
+        return;  // too late to matter: g_backends_dir is left untouched
+    }
+    g_backends_dir = dir;  // last write before the first load wins; an empty dir selects ggml's default search
+}
+
+void set_opencl_cache_dir(const std::string & dir) {
+#if defined(__ANDROID__)
+    // Same "first Engine wins" contract as set_backends_directory:
+    // ggml-opencl reads $GGML_OPENCL_CACHE_DIR at backend init (before
+    // the first kernel build), so a setenv after init does not re-bind
+    // the cache. Gate on the shared g_backends_loaded flag because the
+    // OpenCL backend is registered by the same `ggml_backend_load_all*`
+    // call that flips it. Conservative (a late set could still work if
+    // no GPU device was instantiated yet), but it matches the Engine
+    // documentation and never fails silently.
+    std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+    if (g_backends_loaded.load(std::memory_order_acquire)) {
+        if (!dir.empty() && dir != g_recorded_opencl_cache_dir &&
+            !g_opencl_cache_dir_warned.exchange(true)) {
+            if (g_recorded_opencl_cache_dir.empty()) {
+                fprintf(stderr,
+                    "tts-cpp: set_opencl_cache_dir('%s') ignored -- backends "
+                    "were already loaded with no explicit OpenCL cache dir "
+                    "earlier in this process ($GGML_OPENCL_CACHE_DIR either "
+                    "unset or set by another consumer). Call "
+                    "set_opencl_cache_dir() before the first Engine to take "
+                    "effect.\n",
+                    dir.c_str());
+            } else {
+                fprintf(stderr,
+                    "tts-cpp: set_opencl_cache_dir('%s') ignored -- "
+                    "$GGML_OPENCL_CACHE_DIR already pinned to '%s' earlier in "
+                    "this process.\n",
+                    dir.c_str(), g_recorded_opencl_cache_dir.c_str());
+            }
+        }
+        return;
+    }
+    if (dir.empty()) return;  // empty = keep whatever the environment already holds
+    // Export the dir before the first backend load reads it. Record it
+    // only if setenv() succeeded, so a later "already pinned to" warning
+    // never names a directory that was not actually exported.
+    if (::setenv("GGML_OPENCL_CACHE_DIR", dir.c_str(), /*overwrite=*/1) != 0) {
+        fprintf(stderr,
+            "tts-cpp: setenv(GGML_OPENCL_CACHE_DIR='%s') failed; "
+            "OpenCL program cache dir not set\n", dir.c_str());
+        return;
+    }
+    g_recorded_opencl_cache_dir = dir;
+#else
+    (void) dir;
+#endif
+}
+
+// Trigger one-time discovery + load of every available ggml backend.
+// Idempotent: repeated calls inside the same process are no-ops once
+// the registry is populated. Routed through a static guard so we
+// don't pay the directory-walk cost on every model load.
+//
+// Why this instead of the per-backend ggml_backend_<x>_init() entry
+// points the cascade used to call directly: with GGML_BACKEND_DL=ON
+// (the dynamic-loader mode embedded host applications typically
+// ship with) the CUDA / Metal / Vulkan / OpenCL / BLAS / ggml-cpu
+// backends live in separate shared libraries that are dlopened at
+// runtime; their concrete init symbols are not linkable from
+// libtts-cpp, and the only supported entry point is the registry.
+// With GGML_BACKEND_DL=OFF the backends are statically linked into
+// libggml, registered at constructor time, and
+// ggml_backend_load_all() is a cheap no-op. Both modes therefore
+// reach the same registry walk below, matching the convention used
+// by llama.cpp / parakeet-cpp / other ggml-based libraries.
+//
+// The optional backends dir comes from `set_backends_directory()`
+// (typically wired from `EngineOptions::backends_dir`). When set and
+// non-empty, the loader walks that single directory instead of the
+// compile-time defaults so embedded host apps can ship the
+// `lib<prefix>ggml-{vulkan,opencl,cpu-*}.so` files in their own
+// per-module folder rather than relying on `LD_LIBRARY_PATH` /
+// `dlopen()` heuristics.
+void ensure_backends_loaded() {
+    static const bool loaded = []() {  // function-local static: the lambda body runs once per process
+        std::string dir;
+        {
+            std::lock_guard<std::mutex> lock(g_backends_dir_mutex);
+            dir = g_backends_dir;  // snapshot taken under the lock; the load below runs without it
+            g_recorded_backends_dir = g_backends_dir;  // kept for the setter's "already loaded from" warning
+            // Flip the loaded sentinel under the mutex (and *before*
+            // we release it for the load-all call below) so any
+            // concurrent setter that's about to acquire the mutex
+            // sees the registry as already-claimed and falls into
+            // its warn-once branch. Without this, a setter racing
+            // a first Engine construction would land its value
+            // *after* we already captured `dir` into the local --
+            // the registry would scan against the wrong directory
+            // (or the default), and the second Engine would have
+            // no idea its override was lost.
+            g_backends_loaded.store(true, std::memory_order_release);
+        }
+        if (!dir.empty()) {
+            ggml_backend_load_all_from_path(dir.c_str());  // explicit override: scan this directory
+        } else {
+            ggml_backend_load_all();  // no override: ggml's default search
+        }
+        return true;
+    }();
+    (void) loaded;  // the value is unused; only the one-time initialisation matters
+}
+
+// Parse the Adreno generation number from a device name /
+// description string. Returns:
+//   - a 3-or-4-digit generation number ("Adreno (TM) 750" -> 750,
+//     "Adreno 830" -> 830, "Adreno 660" -> 660)
+//   - a synthetic 800 for the "Adreno X<n>" naming used by
+//     Snapdragon X Elite parts (X1-85 / X1-45 etc.). These are
+//     7xx/8xx-tier silicon with kernels that ggml-opencl supports
+//     and outperform Vulkan on. Mapped to 800 here so they take
+//     the OpenCL branch in the tier policy.
+//   - -1 when no Adreno marker is present (Mali, desktop GPUs, ...)
+//
+// Used to drive the OpenCL vs Vulkan tier policy below: Adreno
+// 7xx/8xx/X<n> ship OpenCL kernels that outperform Vulkan on those
+// parts, while Adreno 6xx ggml-opencl is known broken (incorrect
+// results). Mirrors parakeet-cpp's `parse_adreno_version` and the
+// equivalent helper in llm-llamacpp's
+// BackendSelection.cpp::parseAdrenoVersion so the three stacks
+// reach the same decision on the same hardware.
+int parse_adreno_version(const char * s) {
+    if (!s) return -1;  // null name/description: treated as "no Adreno marker"
+    const char * p = std::strstr(s, "Adreno");
+    if (!p) p = std::strstr(s, "adreno");  // only these two casings are matched ("ADRENO" is not)
+    if (!p) return -1;
+    p += 6; // strlen("Adreno") == strlen("adreno") == 6
+    while (*p && !(*p >= '0' && *p <= '9') && *p != 'X' && *p != 'x') ++p;  // skip "(TM) " etc. up to the first digit or X/x
+    if (!*p) return -1;  // marker present but nothing that looks like a generation follows
+    if (*p == 'X' || *p == 'x') {  // NOTE: any x/X met before the first digit ends the scan in this branch
+        ++p;
+        if (*p < '0' || *p > '9') return -1; // "Xclipse" etc. is not Adreno-X
+        return 800;  // synthetic generation for "Adreno X<n>"; the digits after X are not parsed
+    }
+    int v = 0;
+    while (*p >= '0' && *p <= '9') {
+        v = v * 10 + (*p - '0');
+        ++p;
+        if (v > 100000) return -1;  // implausibly long digit run; bail out long before int can overflow
+    }
+    return v;
+}
+
+bool is_adreno_6xx(const char * s) {  // true when `s` names an Adreno generation in [600, 700)
+    const int v = parse_adreno_version(s);
+    return v >= 600 && v < 700;  // -1 (no marker / null) is false
+}
+
+bool is_adreno_700plus(const char * s) {  // true when `s` names an Adreno generation >= 700
+    const int v = parse_adreno_version(s);
+    return v >= 700;  // includes the synthetic 800 returned for "Adreno X<n>"; -1 (no marker) is false
+}
+
+// Pick a GPU backend using the same tier policy as parakeet-cpp's
+// `init_gpu_backend` / llm-llamacpp's BackendSelection: ggml-opencl
+// is only used when an Adreno 700+ device is present (where its
+// kernels are validated and faster than Vulkan); every other GPU
+// (Vulkan, Metal, CUDA, Mali, Intel iGPU, ...) goes through the
+// non-OpenCL preference. Adreno 6xx OpenCL is known broken
+// (incorrect outputs) and is force-skipped unless the caller opts
+// in via `TTS_CPP_ALLOW_ADRENO_6XX=1`.
+//
+// Routed exclusively through the ggml-backend registry
+// (`ggml_backend_load_all` + `ggml_backend_dev_*`). No direct calls
+// to `ggml_backend_vulkan_init` / `ggml_backend_opencl_init` /
+// `ggml_backend_metal_init` are made anywhere in tts-cpp -- under
+// the GGML_BACKEND_DL=ON build mode embedded host applications ship
+// with, those entry points live in separate shared libraries that
+// are dlopen()'d at runtime and are not linkable from libtts-cpp.
+// The registry walk reaches the same backends in both modes.
+ggml_backend_t init_gpu_backend(int n_gpu_layers,  // returns nullptr when the caller should fall back to CPU
+                                bool verbose,  // when true, the selection / skip decisions are logged to stderr
+                                const char * log_prefix) {  // prefix for those log lines; nullptr -> "tts-cpp"
+    if (n_gpu_layers <= 0) return nullptr;  // GPU not requested: return before touching the registry
+    if (!log_prefix) log_prefix = "tts-cpp";
+
+    ensure_backends_loaded();
+
+    struct Cand {  // one GPU/IGPU device plus the strings used for bucketing and logging
+        ggml_backend_dev_t dev;
+        const char *       name;
+        const char *       desc;
+        const char *       reg_name;
+    };
+    std::vector<Cand> opencl_adreno_700plus;
+    std::vector<Cand> other_gpu;    // Vulkan / Metal / CUDA / Mali / Intel / ...
+    std::vector<Cand> opencl_other; // Non-Adreno OpenCL (e.g. desktop)
+    int max_adreno_version = -1;  // highest generation seen on any GPU/IGPU device; only used for the final log line
+
+    const size_t n_dev = ggml_backend_dev_count();
+    for (size_t i = 0; i < n_dev; ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (!dev) continue;
+        const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
+        if (type != GGML_BACKEND_DEVICE_TYPE_GPU &&
+            type != GGML_BACKEND_DEVICE_TYPE_IGPU) {
+            continue;  // CPU / ACCEL devices are never GPU candidates
+        }
+        const char * name     = ggml_backend_dev_name(dev);
+        const char * desc     = ggml_backend_dev_description(dev);
+        const char * reg_name = dev_reg_name(dev);
+        const bool   is_opencl = reg_name && std::strcmp(reg_name, "OpenCL") == 0;
+
+        const int adreno_v = std::max(parse_adreno_version(name),  // either string may carry the marker
+                                      parse_adreno_version(desc));
+        if (adreno_v > max_adreno_version) max_adreno_version = adreno_v;  // tracked for non-OpenCL devices too
+
+        if (is_opencl) {
+            if (adreno_v >= 700) {
+                opencl_adreno_700plus.push_back({dev, name, desc, reg_name});
+            } else if (adreno_v >= 600 && adreno_v < 700) {
+                const char * reported = name ? name : (desc ? desc : "unknown");
+                const char * override_env = std::getenv("TTS_CPP_ALLOW_ADRENO_6XX");
+                if (!override_env || override_env[0] != '1') {  // NOTE(review): only the first character is checked
+                    if (verbose) {
+                        fprintf(stderr,
+                            "%s: OpenCL device '%s' is Adreno 6xx; "
+                            "skipping (7xx/8xx/X1E supported, set "
+                            "TTS_CPP_ALLOW_ADRENO_6XX=1 to override)\n",
+                            log_prefix, reported);
+                    }
+                    continue;  // device is dropped from every bucket
+                }
+                if (verbose) {
+                    fprintf(stderr,
+                        "%s: TTS_CPP_ALLOW_ADRENO_6XX=1 set; "
+                        "keeping OpenCL backend on '%s' anyway\n",
+                        log_prefix, reported);
+                }
+                opencl_other.push_back({dev, name, desc, reg_name});  // opted-in 6xx joins the last-resort bucket
+            } else {
+                opencl_other.push_back({dev, name, desc, reg_name});
+            }
+        } else {
+            other_gpu.push_back({dev, name, desc, reg_name});
+        }
+    }
+
+    // Tier policy:
+    //   1. Adreno 700+: prefer OpenCL (validated, faster than Vulkan
+    //      on Snapdragon 8 Gen 2/3/4 etc.).
+    //   2. Anything else with a non-OpenCL GPU: prefer that
+    //      (Vulkan on all non-Adreno Android, Metal on Apple, CUDA
+    //      on Linux/Windows desktop, Mali iGPU via Vulkan, ...).
+    //   3. Last resort: any other OpenCL device (e.g. desktop OpenCL
+    //      or non-Adreno mobile when no Vulkan is registered).
+    auto try_init = [&](const std::vector<Cand> & bucket) -> ggml_backend_t {
+        for (const Cand & c : bucket) {  // registry order within a bucket; first successful init wins
+            ggml_backend_t b = ggml_backend_dev_init(c.dev, nullptr);
+            if (!b) continue;  // init refused: try the next candidate in this bucket
+            if (verbose) {
+                fprintf(stderr,
+                    "%s: using %s backend (%s)\n",
+                    log_prefix,
+                    c.reg_name && *c.reg_name ? c.reg_name : "GPU",
+                    c.name ? c.name : (c.desc ? c.desc : "unknown"));
+            }
+            return b;
+        }
+        return nullptr;
+    };
+
+    if (!opencl_adreno_700plus.empty()) {
+        if (ggml_backend_t b = try_init(opencl_adreno_700plus)) return b;
+    }
+    if (ggml_backend_t b = try_init(other_gpu)) return b;
+    if (ggml_backend_t b = try_init(opencl_other)) return b;
+
+    if (verbose) {
+        if (max_adreno_version >= 600 && max_adreno_version < 700) {  // NOTE(review): also true for a 6xx device seen via a non-OpenCL backend
+            fprintf(stderr,
+                "%s: only Adreno 6xx OpenCL detected (broken); "
+                "falling back to CPU\n",
+                log_prefix);
+        } else {
+            fprintf(stderr,
+                "%s: no GPU backend available, falling back to CPU\n",
+                log_prefix);
+        }
+    }
+    return nullptr;
+}
+
+ggml_backend_t init_cpu_backend() {  // CPU backend obtained through the registry
+    ensure_backends_loaded();  // make sure the registry is populated before it is queried
+    return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);  // result is returned as-is
+}
+
+ggml_backend_t init_blas_backend() {  // BLAS accelerator backend through the registry, or nullptr
+    ensure_backends_loaded();
+    const size_t n_dev = ggml_backend_dev_count();
+    for (size_t i = 0; i < n_dev; ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (!dev) continue;
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_ACCEL) continue;  // only ACCEL-type devices qualify
+        const char * reg_name = dev_reg_name(dev);
+        if (!reg_name || std::strcmp(reg_name, "BLAS") != 0) continue;  // ...and only from the registry named "BLAS"
+        return ggml_backend_dev_init(dev, nullptr);  // first match decides: a failed init is returned as nullptr, no retry
+    }
+    return nullptr;  // no BLAS device registered
+}
+
+} // namespace tts_cpp::detail
diff --git a/tts-cpp/src/backend_selection.h b/tts-cpp/src/backend_selection.h
new file mode 100644
index 00000000000..60c99104e9f
--- /dev/null
+++ b/tts-cpp/src/backend_selection.h
@@ -0,0 +1,90 @@
+#pragma once
+
+// Registry-only GPU backend selection for tts-cpp.
+//
+// Replaces the three legacy `init_backend` / `s3gen_init_backend` /
+// `init_supertonic_backend` `#ifdef GGML_USE_<X>` ladders that called
+// `ggml_backend_{cuda,metal,vk,opencl}_init` directly. Under the
+// dynamic-loader build mode embedded host applications ship with
+// (`GGML_BACKEND_DL=ON`) those static entry points live in separate
+// `.so` files that are dlopen()'d at runtime and are not linkable
+// from libtts-cpp; the ggml-backend registry walk reaches the same
+// backends in both `GGML_BACKEND_DL=ON` and `=OFF` modes, mirroring
+// parakeet-cpp's design.
+//
+// Selection follows the same Adreno tier policy as parakeet-cpp's
+// `init_gpu_backend` and the qvac llm-llamacpp `BackendSelection.cpp::
+// chooseBackend`: Adreno 700+ devices take the OpenCL branch
+// (validated, faster than Vulkan on Snapdragon 8 Gen 2/3/4 and on the
+// Snapdragon X Elite parts that report as `Adreno X<n>`); every other
+// GPU (Vulkan on all non-Adreno Android, Metal on Apple, CUDA on
+// Linux/Windows desktop, Mali iGPU via Vulkan, ...) goes through the
+// non-OpenCL preference. Adreno 6xx OpenCL is force-skipped (known
+// broken kernels) unless the caller opts in via
+// `TTS_CPP_ALLOW_ADRENO_6XX=1`.
+
+#include "ggml-backend.h"
+
+#include <string>
+
+namespace tts_cpp::detail {
+
+// First-Engine-wins override for the directory `ggml_backend_load_all*()`
+// scans on the first `ensure_backends_loaded()` call. Call before
+// constructing the first Engine; later calls log a one-shot warning and
+// are ignored (the ggml-backend registry is a process-wide singleton).
+void set_backends_directory(const std::string & dir);
+
+// First-Engine-wins override for `$GGML_OPENCL_CACHE_DIR`. Honoured
+// only on `__ANDROID__` builds; ignored elsewhere (desktop OpenCL
+// platforms don't ship the program-binary-cache patch that reads this
+// env var). Call before constructing the first Engine.
+void set_opencl_cache_dir(const std::string & dir);
+
+// Idempotent process-wide load of every registered ggml backend.
+// Routed through a function-static guard so callers can invoke it
+// from every init helper without paying the directory walk cost
+// more than once.
+void ensure_backends_loaded();
+
+// Pick a GPU backend using the Adreno tier policy described above.
+// Returns nullptr when no GPU was requested (`n_gpu_layers <= 0`),
+// when no GPU device is registered, or when every candidate device
+// refused `ggml_backend_dev_init`. `log_prefix` controls the
+// per-call log line tag (e.g. "s3gen", "supertonic", "chatterbox")
+// so the existing user-visible logs in the three init sites stay
+// distinguishable; verbose=false suppresses everything except hard
+// errors.
+ggml_backend_t init_gpu_backend(int n_gpu_layers,
+                                bool verbose,
+                                const char * log_prefix);
+
+// Convenience wrapper that picks up the registered CPU device and
+// returns its init handle. Mirrors parakeet-cpp's
+// `init_cpu_backend()`. Never throws; returns nullptr when the
+// ggml-cpu backend isn't available (no .so on disk and not
+// statically linked).
+ggml_backend_t init_cpu_backend();
+
+// Returns the first registered BLAS accel backend (if any) or
+// nullptr. Mirrors parakeet-cpp's `init_blas_backend()`. No tts-cpp
+// call site uses this today; it is exposed for parity with the
+// parakeet helper API, so that callers wanting parakeet's
+// (cpu + blas accel + gpu) cascade can build it.
+ggml_backend_t init_blas_backend();
+
+// Adreno-generation parser. Returns:
+//   - a 3-or-4-digit generation number ("Adreno (TM) 750" -> 750,
+//     "Adreno 830" -> 830, "Adreno 660" -> 660)
+//   - a synthetic 800 for the "Adreno X<n>" naming used by
+//     Snapdragon X Elite parts (X1-85 / X1-45 etc.)
+//   - -1 when no Adreno marker is present (Mali, desktop GPUs, ...)
+//
+// Exposed for the tier-policy implementation; safe to call on
+// nullptr / empty strings.
+int parse_adreno_version(const char * s);
+
+bool is_adreno_6xx(const char * s);      // presumably true when parse_adreno_version(s) is in [600, 700) — TODO confirm in backend_selection.cpp
+bool is_adreno_700plus(const char * s);  // presumably true when parse_adreno_version(s) >= 700 — TODO confirm in backend_selection.cpp
+
+} // namespace tts_cpp::detail
diff --git a/tts-cpp/src/backend_util.h b/tts-cpp/src/backend_util.h
new file mode 100644
index 00000000000..2eb8a966ac3
--- /dev/null
+++ b/tts-cpp/src/backend_util.h
@@ -0,0 +1,53 @@
+#pragma once
+
+// Backend-introspection helpers that work uniformly under both
+// GGML_BACKEND_DL=ON and GGML_BACKEND_DL=OFF. The legacy
+// ggml_backend_is_cpu / ggml_backend_is_metal entry points live in
+// the per-backend shared libraries (libggml-cpu.* / libggml-metal.*),
+// so they are unlinkable from libtts-cpp under the dynamic-loader
+// build mode embedded host applications typically ship with. Routing
+// through the registry (ggml_backend_get_device + ggml_backend_dev_*)
+// reaches the same answer in both modes.
+//
+// Mirrors parakeet-cpp/src/backend_util.h 1:1 (same QVAC speech-stack
+// pattern); kept in a tts-cpp namespace so the two libraries can be
+// vendored side-by-side without ODR collisions on the helpers.
+
+#include "ggml-backend.h"
+
+#include <cstring>
+
+namespace tts_cpp::detail {
+
+inline const char * backend_reg_name(ggml_backend_t b) {
+    // Follow backend -> device -> registry; a missing link at any step
+    // (or a null registry name) yields "" so callers can strcmp safely.
+    ggml_backend_dev_t device = b ? ggml_backend_get_device(b) : nullptr;
+    ggml_backend_reg_t owner  = device ? ggml_backend_dev_backend_reg(device) : nullptr;
+    if (owner == nullptr) return "";
+    const char * label = ggml_backend_reg_name(owner);
+    return label != nullptr ? label : "";
+}
+
+inline bool backend_is_cpu(ggml_backend_t b) {
+    // True only when the backend resolves to a device whose type is CPU;
+    // a null backend or a backend without a device counts as "not CPU".
+    ggml_backend_dev_t device = b ? ggml_backend_get_device(b) : nullptr;
+    return device != nullptr && ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; }
+
+inline bool backend_is_metal(ggml_backend_t b) {  // decided by registry name; backend_reg_name() never returns nullptr, so strcmp is safe
+    return std::strcmp(backend_reg_name(b), "Metal") == 0;  // NOTE(review): confirm the vendored ggml's Metal registry reports exactly "Metal" (not e.g. "MTL")
+}
+
+inline void backend_set_n_threads(ggml_backend_t b, int n_threads) {
+    if (b == nullptr || n_threads <= 0) return;
+    // Resolve the setter through the owning registry instead of linking it directly.
+    ggml_backend_dev_t device = ggml_backend_get_device(b);
+    ggml_backend_reg_t owner  = device ? ggml_backend_dev_backend_reg(device) : nullptr;
+    if (owner == nullptr) return;
+    void * sym = ggml_backend_reg_get_proc_address(owner, "ggml_backend_set_n_threads");
+    // A null lookup result (symbol not exported by this registry) makes the call a no-op.
+    if (sym != nullptr) reinterpret_cast<ggml_backend_set_n_threads_t>(sym)(b, n_threads);
+}
+
+} // namespace tts_cpp::detail
diff --git a/tts-cpp/src/campplus.cpp b/tts-cpp/src/campplus.cpp
index d52ca6d709d..8d051047812 100644
--- a/tts-cpp/src/campplus.cpp
+++ b/tts-cpp/src/campplus.cpp
@@ -1,4 +1,5 @@
 #include "campplus.h"
+#include "backend_selection.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
diff --git a/tts-cpp/src/campplus_forward.inc b/tts-cpp/src/campplus_forward.inc
index 09d247fdb86..fafacecd324 100644
--- a/tts-cpp/src/campplus_forward.inc
+++ b/tts-cpp/src/campplus_forward.inc
@@ -703,7 +703,9 @@ static bool campplus_embed_ggml(const std::vector<float> & fbank_t_by_c, int T,
     campplus_gctx g;
     g.backend = backend;
     if (!g.backend) {
-        g.backend      = ggml_backend_cpu_init();
+        // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF).
+        // See voice_encoder.cpp for the longer rationale.
+        g.backend      = ::tts_cpp::detail::init_cpu_backend();
         g.owns_backend = true;
         if (!g.backend) { fprintf(stderr, "campplus_ggml: cpu init failed\n"); return false; }
     }
diff --git a/tts-cpp/src/chatterbox_cli.cpp b/tts-cpp/src/chatterbox_cli.cpp
index 78f3ec8e6b8..d112adcc8a4 100644
--- a/tts-cpp/src/chatterbox_cli.cpp
+++ b/tts-cpp/src/chatterbox_cli.cpp
@@ -21,17 +21,12 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+// Per-backend `#include "ggml-{cuda,metal,vulkan}.h"` blocks used
+// to sit here gated on `GGML_USE_<X>` so callers could reach the
+// static `ggml_backend_<x>_init` entry points directly. Removed
+// alongside the cascade in `main.cpp::init_backend`: every backend
+// decision now routes through the ggml-backend registry
+// (`backend_selection.{h,cpp}`).
 
 #include <algorithm>
 #include <atomic>
@@ -907,9 +902,12 @@ int tts_cpp_cli_main(int argc, char ** argv) {
                 }
             }
 
-            // Voice-cloning preprocessing shares a backend: on Mac we pick
-            // Metal, on Linux + NVIDIA we pick CUDA / Vulkan.  Falls back to
-            // the ggml-cpu NEON/AVX kernels when n_gpu_layers == 0.
+            // Voice-cloning preprocessing shares a backend with T3: the
+            // backend_selection registry walk reaches Metal on Apple,
+            // CUDA/Vulkan on Linux/Windows desktop, OpenCL on Adreno 700+
+            // and Vulkan on every other Android GPU. Falls back to the
+            // ggml-cpu NEON/AVX kernels when n_gpu_layers == 0 or no GPU
+            // device is registered.
             ggml_backend_t vc_backend = init_backend(params.n_gpu_layers);
 
             // (1) speaker_emb via VoiceEncoder (3-layer LSTM + proj + L2-norm
diff --git a/tts-cpp/src/chatterbox_engine.cpp b/tts-cpp/src/chatterbox_engine.cpp
index 21f47832b6f..e34cf244db8 100644
--- a/tts-cpp/src/chatterbox_engine.cpp
+++ b/tts-cpp/src/chatterbox_engine.cpp
@@ -9,6 +9,7 @@
 #include <thread>
 #include <vector>
 
+#include "backend_selection.h"
 #include "chatterbox_t3_internal.h"
 #include "gpt2_bpe.h"
 #include "mtl_tokenizer.h"
@@ -89,6 +90,21 @@ struct Engine::Impl {
 
         ggml_time_init();
         g_log_verbose = opts.verbose ? 1 : 0;
+
+        // Wire backends_dir + opencl_cache_dir BEFORE any backend init.
+        // The ggml-backend registry is a process-singleton: only the
+        // first Engine construction's `set_backends_directory` /
+        // `set_opencl_cache_dir` actually take effect (second + later
+        // Engines log a one-shot warn and reuse the already-loaded
+        // registry; see backend_selection.cpp for the contract). Mirrors
+        // parakeet-cpp's Engine ctor.
+        if (!opts.backends_dir.empty()) {
+            ::tts_cpp::detail::set_backends_directory(opts.backends_dir);
+        }
+        if (!opts.opencl_cache_dir.empty()) {
+            ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir);
+        }
+
         // Note: we deliberately do NOT call ggml_log_set here.  The
         // process-global sink is owned by the host application via
         // tts_cpp_log_set (see <tts-cpp/log.h>); installing one
diff --git a/tts-cpp/src/chatterbox_tts.cpp b/tts-cpp/src/chatterbox_tts.cpp
index 603cbc74f3f..24c43b5ecf9 100644
--- a/tts-cpp/src/chatterbox_tts.cpp
+++ b/tts-cpp/src/chatterbox_tts.cpp
@@ -21,6 +21,8 @@
 //   chatterbox_tts --s3gen-gguf MODEL.gguf --ref-dir DIR \
 //                  --tokens-file TOKENS.txt --out OUT.wav
 
+#include "backend_selection.h"
+#include "backend_util.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -29,18 +31,12 @@
 #include "npy.h"
 #include "chatterbox_tts_test_hooks.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-#ifdef GGML_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
+// The per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"`
+// blocks gated on `GGML_USE_<X>` that used to live here are gone:
+// `s3gen_init_backend` below forwards to `backend_selection`'s
+// registry walk, which reaches every backend through
+// `ggml_backend_dev_*` without linking the per-backend static init
+// symbols. Same shape as parakeet-cpp.
 
 #include <algorithm>
 #include <atomic>
@@ -71,7 +67,10 @@ static double now_ms() {
 }
 
 static void compute(ggml_backend_t backend, ggml_cgraph * gf) {
-    if (ggml_backend_is_cpu(backend)) ggml_backend_cpu_set_n_threads(backend, g_n_threads);
+    // Registry-routed n_threads (no-op when the backend's registry does
+    // not export `ggml_backend_set_n_threads`); see src/t3_mtl.cpp for
+    // the GGML_BACKEND_DL=ON unresolvable-symbol rationale.
+    ::tts_cpp::detail::backend_set_n_threads(backend, g_n_threads);
     ggml_backend_graph_compute(backend, gf);
 }
 struct scoped_timer {
@@ -103,55 +102,17 @@ struct model_ctx {
 };
 
 static ggml_backend_t s3gen_init_backend(int n_gpu_layers, bool verbose) {
-#ifdef GGML_USE_CUDA
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_cuda_init(0);
-        if (b) { if (verbose) fprintf(stderr, "s3gen: using CUDA backend\n"); return b; }
-    }
-#endif
-#ifdef GGML_USE_METAL
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_metal_init();
-        if (b) { if (verbose) fprintf(stderr, "s3gen: using Metal backend\n"); return b; }
-    }
-#endif
-#ifdef GGML_USE_VULKAN
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_vk_init(0);
-        if (b) {
-            if (verbose) {
-                char desc[256] = {0};
-                ggml_backend_vk_get_device_description(0, desc, sizeof(desc));
-                fprintf(stderr, "s3gen: using Vulkan backend (device 0: %s)\n", desc);
-            }
-            return b;
-        }
+    // GPU cascade is centralised in backend_selection.cpp's `init_gpu_backend`:
+    // Adreno 700+ OpenCL first, then non-OpenCL GPU backends (Vulkan/Metal/CUDA),
+    // then any remaining OpenCL device; Adreno 6xx OpenCL is force-skipped.
+    if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, verbose, "s3gen")) {
+        return b;
     }
-#endif
-#if defined(GGML_USE_OPENCL)
-    if (n_gpu_layers > 0) {
-        ggml_backend_reg_t ocl_reg = ggml_backend_opencl_reg();
-        if (ocl_reg && ggml_backend_reg_dev_count(ocl_reg) > 0) {
-            auto * b = ggml_backend_opencl_init();
-            if (b) {
-                if (verbose) {
-                    fprintf(stderr, "s3gen: using OpenCL backend\n");
-                }
-                return b;
-            }
-        } else if (verbose && ocl_reg) {
-            if (ggml_backend_reg_dev_count(ocl_reg) == 0) {
-                fprintf(stderr, "s3gen: no OpenCL device; using CPU\n");
-            } else {
-                fprintf(stderr, "s3gen: OpenCL init failed; using CPU\n");
-            }
-        }
+    if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) {
+        if (verbose) fprintf(stderr, "s3gen: using CPU backend\n");
+        return b;
     }
-#endif
-    auto * b = ggml_backend_cpu_init();
-    if (!b) throw std::runtime_error("ggml_backend_cpu_init() failed");
-    if (verbose) fprintf(stderr, "s3gen: using CPU backend\n");
-    return b;
+    throw std::runtime_error("s3gen_init_backend: no CPU device registered");
 }
 
 // Process-wide cache of the loaded S3Gen GGUF so repeated calls (streaming
@@ -2661,7 +2622,7 @@ int s3gen_synthesize_to_wav(
     // throughput (measured +11% S3Gen wall time on M4 CPU), so we keep the
     // two-call path there.  Meanflow has no CFG to begin with.
     const bool use_b2 = (!meanflow) && (cfg_rate != 0.0f) &&
-                        !ggml_backend_is_cpu(m.backend);
+                        !::tts_cpp::detail::backend_is_cpu(m.backend);
 
     // Persistent CFM estimator graph cache.  Re-used across synth
     // calls when T matches — multi-synth chunks 2..N skip the graph
diff --git a/tts-cpp/src/main.cpp b/tts-cpp/src/main.cpp
index aa87cc73fa3..38b1e722111 100644
--- a/tts-cpp/src/main.cpp
+++ b/tts-cpp/src/main.cpp
@@ -1,3 +1,5 @@
+#include "backend_selection.h"
+#include "backend_util.h"
 #include "gpt2_bpe.h"
 #include "mtl_tokenizer.h"
 #include "ggml.h"
@@ -6,21 +8,15 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
+// Per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"` blocks
+// used to sit here gated on `GGML_USE_<X>` so the legacy
+// `init_backend` cascade below could call the static
+// `ggml_backend_<x>_init` entry points directly. Removed alongside
+// the cascade: tts-cpp routes every backend decision through the
+// ggml-backend registry (`backend_selection.{h,cpp}`), which reaches
+// the same backends in both `GGML_BACKEND_DL=ON` (Android prebuild)
+// and `=OFF` (desktop dev) modes without linking those static
+// symbols. Mirrors parakeet-cpp's design.
 
 #include <algorithm>
 #include <atomic>
@@ -291,55 +287,20 @@ int g_log_verbose = 0;
 
 ggml_backend_t init_backend(int n_gpu_layers) {
     const bool v = g_log_verbose != 0;
-#ifdef GGML_USE_CUDA
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_cuda_init(0);
-        if (b) { if (v) fprintf(stderr, "%s: using CUDA backend\n", __func__); return b; }
-    }
-#endif
-#ifdef GGML_USE_METAL
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_metal_init();
-        if (b) { if (v) fprintf(stderr, "%s: using Metal backend\n", __func__); return b; }
-    }
-#endif
-#ifdef GGML_USE_VULKAN
-    if (n_gpu_layers > 0) {
-        auto * b = ggml_backend_vk_init(0);
-        if (b) {
-            if (v) {
-                char desc[256] = {0};
-                ggml_backend_vk_get_device_description(0, desc, sizeof(desc));
-                fprintf(stderr, "%s: using Vulkan backend (device 0: %s)\n", __func__, desc);
-            }
-            return b;
-        }
+    // GPU cascade is centralised in backend_selection.cpp's `init_gpu_backend`:
+    // Adreno 700+ OpenCL first, then non-OpenCL GPU backends (Vulkan/Metal/CUDA),
+    // then any remaining OpenCL device; Adreno 6xx OpenCL is force-skipped.
+    // The registry walk it does reaches the same set of backends in
+    // both `GGML_BACKEND_DL=ON` and `=OFF` modes without linking the
+    // per-backend static `ggml_backend_<x>_init` entry points.
+    if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, v, "chatterbox")) {
+        return b;
     }
-#endif
-#ifdef GGML_USE_OPENCL
-    if (n_gpu_layers > 0) {
-        ggml_backend_reg_t ocl_reg = ggml_backend_opencl_reg();
-        if (ocl_reg && ggml_backend_reg_dev_count(ocl_reg) > 0) {
-            auto * b = ggml_backend_opencl_init();
-            if (b) {
-                if (v) {
-                    fprintf(stderr, "%s: using OpenCL backend\n", __func__);
-                }
-                return b;
-            }
-        } else if (v && ocl_reg) {
-            if (ggml_backend_reg_dev_count(ocl_reg) == 0) {
-                fprintf(stderr, "%s: no OpenCL device; using CPU\n", __func__);
-            } else {
-                fprintf(stderr, "%s: OpenCL init failed; using CPU\n", __func__);
-            }
-        }
+    if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) {
+        if (v) fprintf(stderr, "chatterbox: using CPU backend\n");
+        return b;
     }
-#endif
-    auto * b = ggml_backend_cpu_init();
-    if (!b) throw std::runtime_error("ggml_backend_cpu_init() failed");
-    if (v) fprintf(stderr, "%s: using CPU backend\n", __func__);
-    return b;
+    throw std::runtime_error("init_backend: no CPU device registered");
 }
 
 // --------------------------------------------------------------------------
@@ -700,7 +661,10 @@ bool eval_prompt(
         }
     }
 
-    if (ggml_backend_is_cpu(model.backend)) ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    // Registry-routed n_threads (no-op when the backend's registry does
+    // not export `ggml_backend_set_n_threads`); see src/t3_mtl.cpp for
+    // the GGML_BACKEND_DL=ON unresolvable-symbol rationale.
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
@@ -723,7 +687,8 @@ bool eval_step(
     int32_t position = n_past;
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "position"), &position, 0, sizeof(position));
 
-    if (ggml_backend_is_cpu(model.backend)) ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    // Registry-routed n_threads; see src/t3_mtl.cpp for rationale.
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
diff --git a/tts-cpp/src/mel_extract_stft.cpp b/tts-cpp/src/mel_extract_stft.cpp
index 3453d814d68..949298d8969 100644
--- a/tts-cpp/src/mel_extract_stft.cpp
+++ b/tts-cpp/src/mel_extract_stft.cpp
@@ -18,6 +18,8 @@
 // ggml-cpu's mul_mat uses NEON on ARM and AVX on x86, so this path is both
 // faster and more portable than the scalar loops in voice_features.cpp.
 
+#include "backend_selection.h"
+
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -148,10 +150,12 @@ static std::vector<float> mel_graph_run(
     make_dft_basis(n_fft, F, cos_basis, neg_sin_basis);
 
     ggml_ctx gc;
-    gc.backend      = ggml_backend_cpu_init();
+    // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF).
+    // See voice_encoder.cpp for the longer rationale.
+    gc.backend      = ::tts_cpp::detail::init_cpu_backend();
     gc.owns_backend = true;
     if (!gc.backend) {
-        fprintf(stderr, "mel_graph_run: ggml_backend_cpu_init failed\n");
+        fprintf(stderr, "mel_graph_run: init_cpu_backend failed\n");
         return {};
     }
 
diff --git a/tts-cpp/src/s3tokenizer.cpp b/tts-cpp/src/s3tokenizer.cpp
index a6d1a12ef48..5af41a3e34a 100644
--- a/tts-cpp/src/s3tokenizer.cpp
+++ b/tts-cpp/src/s3tokenizer.cpp
@@ -1,4 +1,6 @@
 #include "s3tokenizer.h"
+#include "backend_selection.h"
+#include "backend_util.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -333,9 +335,11 @@ static bool build_encoder_ctx(encoder_ctx & ec, const s3tokv2_weights & w,
         ec.backend      = backend;
         ec.owns_backend = false;
     } else {
-        ec.backend      = ggml_backend_cpu_init();
+        // Registry-routed CPU init (works under GGML_BACKEND_DL=ON and OFF).
+        // See voice_encoder.cpp for the longer rationale.
+        ec.backend      = ::tts_cpp::detail::init_cpu_backend();
         ec.owns_backend = true;
-        if (!ec.backend) { fprintf(stderr, "s3tokv2: ggml_backend_cpu_init failed\n"); return false; }
+        if (!ec.backend) { fprintf(stderr, "s3tokv2: init_cpu_backend failed\n"); return false; }
     }
 
     // Enough tensors: stem (4) + 16*6 blocks = 100.  Bump a bit for safety.
@@ -643,9 +647,8 @@ bool s3tokv2_tokenize(const std::vector<float> & wav,
     }
 
     if (n_threads <= 0) n_threads = (int)std::thread::hardware_concurrency();
-    if (ggml_backend_is_cpu(ec.backend)) {
-        ggml_backend_cpu_set_n_threads(ec.backend, n_threads);
-    }
+    // Registry-routed n_threads; see t3_mtl.cpp for rationale.
+    ::tts_cpp::detail::backend_set_n_threads(ec.backend, n_threads);
 
     if (ggml_backend_graph_compute(ec.backend, gf) != GGML_STATUS_SUCCESS) {
         fprintf(stderr, "s3tokv2: graph_compute failed\n");
diff --git a/tts-cpp/src/supertonic_engine.cpp b/tts-cpp/src/supertonic_engine.cpp
index 46c195a2c7c..cc87c09e084 100644
--- a/tts-cpp/src/supertonic_engine.cpp
+++ b/tts-cpp/src/supertonic_engine.cpp
@@ -1,6 +1,7 @@
 #define TTS_CPP_BUILD
 #include "tts-cpp/supertonic/engine.h"
 
+#include "backend_selection.h"
 #include "supertonic_internal.h"
 #include "npy.h"
 
@@ -122,6 +123,18 @@ struct Engine::Impl {
         if (!std::filesystem::exists(opts.model_gguf_path)) {
             throw std::runtime_error(supertonic_setup_hint(opts.model_gguf_path));
         }
+
+        // Wire backends_dir + opencl_cache_dir BEFORE any backend
+        // init. First-Engine-wins across the whole process; second
+        // and later Engines reuse the already-loaded registry. See
+        // backend_selection.cpp.
+        if (!opts.backends_dir.empty()) {
+            ::tts_cpp::detail::set_backends_directory(opts.backends_dir);
+        }
+        if (!opts.opencl_cache_dir.empty()) {
+            ::tts_cpp::detail::set_opencl_cache_dir(opts.opencl_cache_dir);
+        }
+
         if (!load_supertonic_gguf(opts.model_gguf_path, model, opts.n_gpu_layers, false)) {
             throw std::runtime_error("Supertonic Engine: failed to load GGUF: " +
                                      opts.model_gguf_path);
diff --git a/tts-cpp/src/supertonic_gguf.cpp b/tts-cpp/src/supertonic_gguf.cpp
index 477d9ff6fda..1c33ebe41e7 100644
--- a/tts-cpp/src/supertonic_gguf.cpp
+++ b/tts-cpp/src/supertonic_gguf.cpp
@@ -1,20 +1,16 @@
 #include "supertonic_internal.h"
 
+#include "backend_selection.h"
+#include "backend_util.h"
 #include "ggml-cpu.h"
 #include "gguf.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-#ifdef GGML_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
+// The per-backend `#include "ggml-{cuda,metal,vulkan,opencl}.h"`
+// blocks gated on `GGML_USE_<X>` that used to live here are gone:
+// `init_supertonic_backend` below forwards to `backend_selection`'s
+// registry walk, which reaches every backend through
+// `ggml_backend_dev_*` without linking the per-backend static init
+// symbols. Same shape as parakeet-cpp.
 
 #include <algorithm>
 #include <atomic>
@@ -94,40 +90,17 @@ std::vector<float> expand_supertonic_tensor_to_f32(const ggml_tensor * src) {
 }
 
 ggml_backend_t init_supertonic_backend(int n_gpu_layers, bool verbose) {
-#ifdef GGML_USE_CUDA
-    if (n_gpu_layers > 0) {
-        ggml_backend_t b = ggml_backend_cuda_init(0);
-        if (b) { if (verbose) fprintf(stderr, "supertonic: using CUDA backend\n"); return b; }
-    }
-#endif
-#ifdef GGML_USE_METAL
-    if (n_gpu_layers > 0) {
-        ggml_backend_t b = ggml_backend_metal_init();
-        if (b) { if (verbose) fprintf(stderr, "supertonic: using Metal backend\n"); return b; }
+    // GPU cascade is centralised in backend_selection.cpp's `init_gpu_backend`:
+    // Adreno 700+ OpenCL first, then non-OpenCL GPU backends (Vulkan/Metal/CUDA),
+    // then any remaining OpenCL device; Adreno 6xx OpenCL is force-skipped.
+    if (ggml_backend_t b = ::tts_cpp::detail::init_gpu_backend(n_gpu_layers, verbose, "supertonic")) {
+        return b;
     }
-#endif
-#ifdef GGML_USE_VULKAN
-    if (n_gpu_layers > 0) {
-        ggml_backend_t b = ggml_backend_vk_init(0);
-        if (b) {
-            if (verbose) fprintf(stderr, "supertonic: using Vulkan backend\n");
-            return b;
-        }
-    }
-#endif
-#ifdef GGML_USE_OPENCL
-    if (n_gpu_layers > 0) {
-        ggml_backend_reg_t reg = ggml_backend_opencl_reg();
-        if (reg && ggml_backend_reg_dev_count(reg) > 0) {
-            ggml_backend_t b = ggml_backend_opencl_init();
-            if (b) { if (verbose) fprintf(stderr, "supertonic: using OpenCL backend\n"); return b; }
-        }
+    if (ggml_backend_t b = ::tts_cpp::detail::init_cpu_backend()) {
+        if (verbose) fprintf(stderr, "supertonic: using CPU backend\n");
+        return b;
     }
-#endif
-    ggml_backend_t b = ggml_backend_cpu_init();
-    if (!b) throw std::runtime_error("ggml_backend_cpu_init failed");
-    if (verbose) fprintf(stderr, "supertonic: using CPU backend\n");
-    return b;
+    throw std::runtime_error("init_supertonic_backend: no CPU device registered");
 }
 
 void set_env_if_unset(const char * name, const char * value) {
@@ -230,8 +203,11 @@ void supertonic_set_n_threads(supertonic_model & model, int n_threads) {
 }
 
 void supertonic_graph_compute(const supertonic_model & model, ggml_cgraph * graph) {
-    if (ggml_backend_is_cpu(model.backend) && model.n_threads > 0) {
-        ggml_backend_cpu_set_n_threads(model.backend, model.n_threads);
+    // Registry-routed n_threads (no-op when the backend's registry does
+    // not export `ggml_backend_set_n_threads`); see src/t3_mtl.cpp for
+    // the GGML_BACKEND_DL=ON unresolvable-symbol rationale.
+    if (model.n_threads > 0) {
+        ::tts_cpp::detail::backend_set_n_threads(model.backend, model.n_threads);
     }
     ggml_backend_graph_compute(model.backend, graph);
 }
diff --git a/tts-cpp/src/t3_mtl.cpp b/tts-cpp/src/t3_mtl.cpp
index 316f1747dea..bdae73bef47 100644
--- a/tts-cpp/src/t3_mtl.cpp
+++ b/tts-cpp/src/t3_mtl.cpp
@@ -24,6 +24,7 @@
 #include "chatterbox_t3_internal.h"
 #include "t3_mtl.h"
 
+#include "backend_util.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -1135,9 +1136,12 @@ bool run_prompt_pass(const chatterbox_model & model,
     fill_causal_mask_f16(mask, N);
     set_in("kq_mask", mask.data(), mask.size() * sizeof(ggml_fp16_t));
 
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU
+    // backend lives in a dlopen'd per-arch .so, so the static
+    // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time).
+    // The helper is a no-op on any backend whose registry does not export
+    // `ggml_backend_set_n_threads` (and when n_threads <= 0).
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
@@ -1202,9 +1206,12 @@ bool run_prompt_pass_b2(const chatterbox_model & model,
     fill_causal_mask_f16(mask, N);
     set_in("kq_mask", mask.data(), mask.size() * sizeof(ggml_fp16_t));
 
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU
+    // backend lives in a dlopen'd per-arch .so, so the static
+    // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time).
+    // The helper is a no-op on any backend whose registry does not export
+    // `ggml_backend_set_n_threads` (and when n_threads <= 0).
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
@@ -1247,9 +1254,12 @@ bool run_step_pass_b2(const chatterbox_model & model,
     int32_t pos = n_past;
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "pos_ids"), &pos, 0, sizeof(pos));
 
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU
+    // backend lives in a dlopen'd per-arch .so, so the static
+    // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time).
+    // The helper is a no-op on any backend whose registry does not export
+    // `ggml_backend_set_n_threads` (and when n_threads <= 0).
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
@@ -1301,9 +1311,12 @@ bool run_step_pass(const chatterbox_model & model,
     int32_t pos = n_past;
     ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "pos_ids"), &pos, 0, sizeof(pos));
 
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // Registry-routed n_threads (works under GGML_BACKEND_DL=ON: the CPU
+    // backend lives in a dlopen'd per-arch .so, so the static
+    // `ggml_backend_cpu_set_n_threads` symbol is unresolvable at link time).
+    // The helper is a no-op on any backend whose registry does not export
+    // `ggml_backend_set_n_threads` (and when n_threads <= 0).
+    ::tts_cpp::detail::backend_set_n_threads(model.backend, n_threads);
     ggml_backend_graph_compute(model.backend, gf);
 
     ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
@@ -1673,7 +1686,7 @@ bool load_model_gguf_mtl(const std::string & path,
         // ggml-cpu's per-kernel overhead is already negligible and the
         // extra weight memory footprint (~75 MB for the multilingual
         // T3) trades unfavourably with thread-cache locality there.
-        if (!ggml_backend_is_cpu(model.backend)) {
+        if (!::tts_cpp::detail::backend_is_cpu(model.backend)) {
             const int n_embd = hp.n_embd;
             const int n_ff   = hp.intermediate_size;
 
@@ -1814,7 +1827,7 @@ bool eval_prompt_mtl(const chatterbox_model & model,
     // op processes B=2 in a tight loop, so batching just doubles the
     // per-op work without saving ops; mirrors §3.20's S3Gen B=2 finding
     // that on CPU the two-call path stayed the winner).
-    const bool use_b2 = !ggml_backend_is_cpu(model.backend);
+    const bool use_b2 = !::tts_cpp::detail::backend_is_cpu(model.backend);
     if (use_b2) {
         return run_prompt_pass_b2(model, allocr, n_threads, text_tokens,
                                   exaggeration, logits_cond_out,
@@ -1860,7 +1873,7 @@ bool eval_step_mtl(const chatterbox_model & model,
         return false;
     }
     // Metal: cond+uncond batched into a single forward.  See eval_prompt_mtl.
-    const bool use_b2 = !ggml_backend_is_cpu(model.backend);
+    const bool use_b2 = !::tts_cpp::detail::backend_is_cpu(model.backend);
     if (use_b2) {
         return run_step_pass_b2(model, allocr, n_threads, n_past, token,
                                 logits_cond_out, logits_uncond_out);
diff --git a/tts-cpp/src/voice_encoder.cpp b/tts-cpp/src/voice_encoder.cpp
index da3fe395012..f10a53bdc48 100644
--- a/tts-cpp/src/voice_encoder.cpp
+++ b/tts-cpp/src/voice_encoder.cpp
@@ -1,4 +1,5 @@
 #include "voice_encoder.h"
+#include "backend_selection.h"
 #include "voice_features.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -459,9 +460,15 @@ bool voice_encoder_embed(const std::vector<float> & wav_16k,
     ve_graph G;
     G.backend = backend;
     if (!G.backend) {
-        G.backend = ggml_backend_cpu_init();
+        // Route through the registry so this works under GGML_BACKEND_DL=ON
+        // (Android per-arch CPU dlopen variants) as well as the legacy
+        // statically-linked GGML_BACKEND_DL=OFF builds. Calling
+        // `ggml_backend_cpu_init()` directly fails to link in DL mode
+        // because the symbol lives in the per-arch dlopen'd .so rather
+        // than in any library on the link line.
+        G.backend = ::tts_cpp::detail::init_cpu_backend();
         if (!G.backend) {
-            fprintf(stderr, "voice_encoder_embed: ggml_backend_cpu_init failed\n");
+            fprintf(stderr, "voice_encoder_embed: init_cpu_backend failed\n");
             return false;
         }
         G.owns_backend = true;